From df0651ebabc1f28bf8df5d42e04c701e86917714 Mon Sep 17 00:00:00 2001 From: Joel Wurtz Date: Tue, 3 Sep 2024 10:32:05 +0200 Subject: [PATCH] feat(utf8): use a feature instead, add a specific error if utf8 is invalid --- Cargo.toml | 3 ++- benches/parse.rs | 2 +- src/lib.rs | 59 ++++++++++++++++++++++++++++++--------------- src/simd/avx2.rs | 18 ++++++-------- src/simd/runtime.rs | 8 +++--- src/simd/sse42.rs | 18 ++++++-------- src/simd/swar.rs | 6 ++--- 7 files changed, 66 insertions(+), 48 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8155dd2..56ec40e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,8 @@ edition = "2018" build = "build.rs" [features] -default = ["std"] +default = ["std", "utf8_in_path"] +utf8_in_path = [] std = [] [dev-dependencies] diff --git a/benches/parse.rs b/benches/parse.rs index d3d9c9d..97a6204 100644 --- a/benches/parse.rs +++ b/benches/parse.rs @@ -113,7 +113,7 @@ fn uri(c: &mut Criterion) { .throughput(Throughput::Bytes(input.len() as u64)) .bench_function(name, |b| b.iter(|| { let mut b = httparse::_benchable::Bytes::new(black_box(input)); - httparse::_benchable::parse_uri(&mut b, false).unwrap() + httparse::_benchable::parse_uri(&mut b).unwrap() })); } diff --git a/src/lib.rs b/src/lib.rs index bc2f32a..7d1b951 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -62,6 +62,7 @@ fn is_token(b: u8) -> bool { // ASCII codes to accept URI string. // i.e. A-Z a-z 0-9 !#$%&'*+-._();:@=,/?[]~^ // TODO: Make a stricter checking for URI string? +#[cfg(not(feature = "utf8_in_path"))] static URI_MAP: [bool; 256] = byte_map![ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // \0 \n @@ -90,7 +91,8 @@ static URI_MAP: [bool; 256] = byte_map![ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; -static URI_NON_COMPLIANT_MAP: [bool; 256] = byte_map![ +#[cfg(feature = "utf8_in_path")] +static URI_MAP: [bool; 256] = byte_map![ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -110,12 +112,8 @@ static URI_NON_COMPLIANT_MAP: [bool; 256] = byte_map![ ]; #[inline] -pub(crate) fn is_uri_token(b: u8, allow_non_compliant: bool) -> bool { - if allow_non_compliant { - URI_NON_COMPLIANT_MAP[b as usize] - } else { - URI_MAP[b as usize] - } +pub(crate) fn is_uri_token(b: u8) -> bool { + URI_MAP[b as usize] } static HEADER_NAME_MAP: [bool; 256] = byte_map![ @@ -184,6 +182,9 @@ pub enum Error { TooManyHeaders, /// Invalid byte in HTTP version. Version, + #[cfg(feature = "utf8_in_path")] + /// Invalid UTF-8 in path. + Utf8Error, } impl Error { @@ -197,6 +198,8 @@ impl Error { Error::Token => "invalid token", Error::TooManyHeaders => "too many headers", Error::Version => "invalid HTTP version", + #[cfg(feature = "utf8_in_path")] + Error::Utf8Error => "invalid UTF-8 in path", } } } @@ -283,7 +286,6 @@ pub struct ParserConfig { allow_multiple_spaces_in_request_line_delimiters: bool, allow_multiple_spaces_in_response_status_delimiters: bool, allow_space_before_first_header_name: bool, - allow_rfc3986_non_compliant_path: bool, ignore_invalid_headers_in_responses: bool, ignore_invalid_headers_in_requests: bool, } @@ -563,7 +565,7 @@ impl<'h, 'b> Request<'h, 'b> { if config.allow_multiple_spaces_in_request_line_delimiters { complete!(skip_spaces(&mut bytes)); } - self.path = Some(complete!(parse_uri(&mut bytes, config.allow_rfc3986_non_compliant_path))); + self.path = Some(complete!(parse_uri(&mut bytes))); if config.allow_multiple_spaces_in_request_line_delimiters { complete!(skip_spaces(&mut bytes)); } @@ -976,9 +978,9 @@ fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> { #[doc(hidden)] #[allow(missing_docs)] // WARNING: Exported for internal benchmarks, not fit for public consumption -pub fn parse_uri<'a>(bytes: &mut Bytes<'a>, allow_non_compliant: bool) -> Result<&'a str> { +pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> { let start = bytes.pos(); - simd::match_uri_vectored(bytes, allow_non_compliant); + simd::match_uri_vectored(bytes); let end = bytes.pos(); if next!(bytes) == b' ' { @@ -987,6 +989,14 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>, allow_non_compliant: bool) -> Result return Err(Error::Token); } + #[cfg(feature = "utf8_in_path")] + // SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8. + return match str::from_utf8(unsafe { bytes.slice_skip(1) }) { + Ok(uri) => Ok(Status::Complete(uri)), + Err(_) => Err(Error::Utf8Error), + }; + + #[cfg(not(feature = "utf8_in_path"))] return Ok(Status::Complete( // SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8. unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) }, @@ -2077,7 +2087,7 @@ mod tests { assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize)); assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize)); assert_eq!(parse_chunk_size(b"567xf8a\r\n"), Err(crate::InvalidChunkSize)); - assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, std::u64::MAX)))); + assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, u64::MAX)))); assert_eq!(parse_chunk_size(b"1ffffffffffffffff\r\n"), Err(crate::InvalidChunkSize)); assert_eq!(parse_chunk_size(b"Affffffffffffffff\r\n"), Err(crate::InvalidChunkSize)); assert_eq!(parse_chunk_size(b"fffffffffffffffff\r\n"), Err(crate::InvalidChunkSize)); @@ -2185,7 +2195,7 @@ mod tests { assert_eq!(result, Err(crate::Error::Token)); } - static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo>ohno HTTP/1.1\r\n\r\n"; + static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo ohno HTTP/1.1\r\n\r\n"; #[test] fn test_request_with_multiple_spaces_and_bad_path() { @@ -2194,7 +2204,7 @@ mod tests { let result = crate::ParserConfig::default() .allow_multiple_spaces_in_request_line_delimiters(true) .parse_request(&mut request, REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH); - assert_eq!(result, Err(crate::Error::Token)); + assert_eq!(result, Err(crate::Error::Version)); } static RESPONSE_WITH_SPACES_IN_CODE: &[u8] = b"HTTP/1.1 99 200 OK\r\n\r\n"; @@ -2702,7 +2712,8 @@ mod tests { } #[test] - fn test_rfc3986_non_compliant_path_ko() { + #[cfg(not(feature = "utf8_in_path"))] + fn test_utf8_in_path_ko() { let mut headers = [EMPTY_HEADER; 1]; let mut request = Request::new(&mut headers[..]); @@ -2712,13 +2723,12 @@ mod tests { } #[test] - fn test_rfc3986_non_compliant_path_ok() { + #[cfg(feature = "utf8_in_path")] + fn test_utf8_in_path_ok() { let mut headers = [EMPTY_HEADER; 1]; let mut request = Request::new(&mut headers[..]); - let mut config = crate::ParserConfig::default(); - config.allow_rfc3986_non_compliant_path = true; - let result = config.parse_request(&mut request, b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n"); + let result = crate::ParserConfig::default().parse_request(&mut request, b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n"); assert_eq!(result, Ok(Status::Complete(67))); assert_eq!(request.version.unwrap(), 1); @@ -2728,4 +2738,15 @@ mod tests { assert_eq!(request.headers[0].name, "Host"); assert_eq!(request.headers[0].value, &b"example.org"[..]); } + + #[test] + #[cfg(feature = "utf8_in_path")] + fn test_bad_utf8_in_path() { + let mut headers = [EMPTY_HEADER; 1]; + let mut request = Request::new(&mut headers[..]); + + let result = crate::ParserConfig::default().parse_request(&mut request, b"GET /test?post=I\xE2msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n"); + + assert_eq!(result, Err(crate::Error::Utf8Error)); + } } diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs index 57e584d..aa564c0 100644 --- a/src/simd/avx2.rs +++ b/src/simd/avx2.rs @@ -2,14 +2,10 @@ use crate::iter::Bytes; #[inline] #[target_feature(enable = "avx2")] -pub unsafe fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) { +pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { while bytes.as_ref().len() >= 32 { - let advance = if allow_non_compliant { - match_url_char_non_compliant_32_avx(bytes.as_ref()) - } else { - match_url_char_32_avx(bytes.as_ref()) - }; + let advance = match_url_char_32_avx(bytes.as_ref()); bytes.advance(advance); @@ -18,9 +14,10 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) { } } // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2 - super::swar::match_uri_vectored(bytes, allow_non_compliant) + super::swar::match_uri_vectored(bytes) } +#[cfg(not(feature = "utf8_in_path"))] #[inline(always)] #[allow(non_snake_case, overflowing_literals)] #[allow(unused)] @@ -62,10 +59,11 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize { r.trailing_zeros() as usize } +#[cfg(feature = "utf8_in_path")] #[inline(always)] #[allow(non_snake_case, overflowing_literals)] #[allow(unused)] -unsafe fn match_url_char_non_compliant_32_avx(buf: &[u8]) -> usize { +unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize { debug_assert!(buf.len() >= 32); #[cfg(target_arch = "x86")] @@ -140,11 +138,11 @@ fn avx2_code_matches_uri_chars_table() { #[allow(clippy::undocumented_unsafe_blocks)] unsafe { - assert!(byte_is_allowed(b'_', |b| match_uri_vectored(b, false))); + assert!(byte_is_allowed(b'_', match_uri_vectored)); for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() { assert_eq!( - byte_is_allowed(b as u8, |b| match_uri_vectored(b, false)), allowed, + byte_is_allowed(b as u8, match_uri_vectored), allowed, "byte_is_allowed({:?}) should be {:?}", b, allowed, ); } diff --git a/src/simd/runtime.rs b/src/simd/runtime.rs index 20f6017..c523a92 100644 --- a/src/simd/runtime.rs +++ b/src/simd/runtime.rs @@ -34,13 +34,13 @@ pub fn match_header_name_vectored(bytes: &mut Bytes) { super::swar::match_header_name_vectored(bytes); } -pub fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) { +pub fn match_uri_vectored(bytes: &mut Bytes) { // SAFETY: calls are guarded by a feature check unsafe { match get_runtime_feature() { - AVX2 => avx2::match_uri_vectored(bytes, allow_non_compliant), - SSE42 => sse42::match_uri_vectored(bytes, allow_non_compliant), - _ /* NOP */ => super::swar::match_uri_vectored(bytes, allow_non_compliant), + AVX2 => avx2::match_uri_vectored(bytes), + SSE42 => sse42::match_uri_vectored(bytes), + _ /* NOP */ => super::swar::match_uri_vectored(bytes), } } } diff --git a/src/simd/sse42.rs b/src/simd/sse42.rs index 4ab8e00..a5ff8df 100644 --- a/src/simd/sse42.rs +++ b/src/simd/sse42.rs @@ -1,13 +1,9 @@ use crate::iter::Bytes; #[target_feature(enable = "sse4.2")] -pub unsafe fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) { +pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { while bytes.as_ref().len() >= 16 { - let advance = if allow_non_compliant { - match_url_char_non_compliant_16_sse(bytes.as_ref()) - } else { - match_url_char_16_sse(bytes.as_ref()) - }; + let advance = match_url_char_16_sse(bytes.as_ref()); bytes.advance(advance); @@ -15,9 +11,10 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) { return; } } - super::swar::match_uri_vectored(bytes, allow_non_compliant); + super::swar::match_uri_vectored(bytes); } +#[cfg(not(feature = "utf8_in_path"))] #[inline(always)] #[allow(non_snake_case, overflowing_literals)] unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize { @@ -66,9 +63,10 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize { r.trailing_zeros() as usize } +#[cfg(feature = "utf8_in_path")] #[inline(always)] #[allow(non_snake_case)] -unsafe fn match_url_char_non_compliant_16_sse(buf: &[u8]) -> usize { +unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize { debug_assert!(buf.len() >= 16); #[cfg(target_arch = "x86")] @@ -143,11 +141,11 @@ fn sse_code_matches_uri_chars_table() { #[allow(clippy::undocumented_unsafe_blocks)] unsafe { - assert!(byte_is_allowed(b'_', |b| match_uri_vectored(b, false))); + assert!(byte_is_allowed(b'_', match_uri_vectored)); for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() { assert_eq!( - byte_is_allowed(b as u8, |b| match_uri_vectored(b, false)), allowed, + byte_is_allowed(b as u8, match_uri_vectored), allowed, "byte_is_allowed({:?}) should be {:?}", b, allowed, ); } diff --git a/src/simd/swar.rs b/src/simd/swar.rs index d745318..5925d62 100644 --- a/src/simd/swar.rs +++ b/src/simd/swar.rs @@ -7,7 +7,7 @@ const BLOCK_SIZE: usize = core::mem::size_of::(); type ByteBlock = [u8; BLOCK_SIZE]; #[inline] -pub fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) { +pub fn match_uri_vectored(bytes: &mut Bytes) { loop { if let Some(bytes8) = bytes.peek_n::(BLOCK_SIZE) { let n = match_uri_char_8_swar(bytes8); @@ -21,7 +21,7 @@ pub fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) { } } if let Some(b) = bytes.peek() { - if is_uri_token(b, allow_non_compliant) { + if is_uri_token(b) { // SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte // in bytes, so calling advance is safe. unsafe { @@ -106,7 +106,7 @@ fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize { // A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44) // creates a u64 whose bytes are each equal to b const fn uniform_block(b: u8) -> usize { - (b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize + (b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize } // A byte-wise range-check on an enire word/block,