Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(path): allow utf8 chars in path #178

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 160 additions & 26 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,35 +59,26 @@ fn is_token(b: u8) -> bool {
b > 0x1F && b < 0x7F
}

// ASCII codes to accept URI string.
// i.e. A-Z a-z 0-9 !#$%&'*+-._();:@=,/?[]~^
// Byte values accepted in a URI string.
// i.e. b'!' <= byte and byte != 127 (DEL)
// TODO: Make a stricter checking for URI string?
static URI_MAP: [bool; 256] = byte_map![
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// \0 \n
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// commands
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// \w ! " # $ % & ' ( ) * + , - . /
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// @ A B C D E F G H I J K L M N O
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// P Q R S T U V W X Y Z [ \ ] ^ _
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// ` a b c d e f g h i j k l m n o
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
// p q r s t u v w x y z { | } ~ del
// ====== Extended ASCII (aka. obs-text) ======
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
];

#[inline]
Expand Down Expand Up @@ -963,10 +954,11 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
return Err(Error::Token);
}

return Ok(Status::Complete(
// SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
));
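// SAFETY: TODO(review) state the invariant `slice_skip(1)` relies on here.
// Accepted URI bytes now include 0x80..=0xFF, so the slice is no longer guaranteed to be
// UTF-8; it is validated with `str::from_utf8` and rejected with `Error::Token` if invalid.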
match str::from_utf8(unsafe { bytes.slice_skip(1) }) {
Ok(uri) => Ok(Status::Complete(uri)),
Err(_) => Err(Error::Token),
}
} else {
Err(Error::Token)
}
Expand Down Expand Up @@ -2053,7 +2045,7 @@ mod tests {
assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize));
assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize));
assert_eq!(parse_chunk_size(b"567xf8a\r\n"), Err(crate::InvalidChunkSize));
assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, std::u64::MAX))));
assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, u64::MAX))));
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated to this PR, so I can remove it, but without this change the tests fail when run without default features (since we don't have `std`).

assert_eq!(parse_chunk_size(b"1ffffffffffffffff\r\n"), Err(crate::InvalidChunkSize));
assert_eq!(parse_chunk_size(b"Affffffffffffffff\r\n"), Err(crate::InvalidChunkSize));
assert_eq!(parse_chunk_size(b"fffffffffffffffff\r\n"), Err(crate::InvalidChunkSize));
Expand Down Expand Up @@ -2161,7 +2153,7 @@ mod tests {
assert_eq!(result, Err(crate::Error::Token));
}

static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo>ohno HTTP/1.1\r\n\r\n";
static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo ohno HTTP/1.1\r\n\r\n";
joelwurtz marked this conversation as resolved.
Show resolved Hide resolved

#[test]
fn test_request_with_multiple_spaces_and_bad_path() {
Expand All @@ -2170,9 +2162,125 @@ mod tests {
let result = crate::ParserConfig::default()
.allow_multiple_spaces_in_request_line_delimiters(true)
.parse_request(&mut request, REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH);
assert_eq!(result, Err(crate::Error::Version));
}

// Every byte from 0x21 upwards is accepted in the path except DEL (0x7F).
// This request embeds a DEL in its path so the test below can check that the
// parser still rejects it.
static REQUEST_WITH_DEL_IN_PATH: &[u8] = b"GET /foo\x7Fohno HTTP/1.1\r\n\r\n";

// A DEL byte inside the request path must fail parsing with `Error::Token`.
#[test]
fn test_request_with_del_in_path() {
    let mut header_buf = [EMPTY_HEADER; NUM_OF_HEADERS];
    let mut req = Request::new(&mut header_buf[..]);

    let outcome = crate::ParserConfig::default()
        .allow_multiple_spaces_in_request_line_delimiters(true)
        .parse_request(&mut req, crate::tests::REQUEST_WITH_DEL_IN_PATH);

    assert_eq!(outcome, Err(crate::Error::Token));
}

// Feeds every two-, three- and four-byte sequence made of bytes 0x80..=0xFF
// (subject to the lead-byte shortcuts below) into the request path and checks
// that the parser accepts the sequence exactly when `core::str::from_utf8` does.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow for this test
fn test_all_utf8_char_in_paths() {
    // Builds `GET /<bytes> HTTP/1.1\r\n\r\n`, parses it, and asserts that the
    // whole request is consumed when `bytes` is valid UTF-8 and that
    // `Error::Token` is returned otherwise.
    fn check_path_bytes(bytes: &[u8]) {
        let mut headers = [EMPTY_HEADER; NUM_OF_HEADERS];
        let mut request = Request::new(&mut headers[..]);

        let mut first_line = b"GET /".to_vec();
        first_line.extend_from_slice(bytes);
        first_line.extend_from_slice(b" HTTP/1.1\r\n\r\n");

        // A valid request has no headers and no body, so a successful parse
        // consumes every byte of `first_line`.
        let expected = if core::str::from_utf8(bytes).is_ok() {
            Ok(Status::Complete(first_line.len()))
        } else {
            Err(crate::Error::Token)
        };

        let result = crate::ParserConfig::default()
            .allow_multiple_spaces_in_request_line_delimiters(true)
            .parse_request(&mut request, first_line.as_slice());

        assert_eq!(result, expected, "failed for utf8 bytes: {:?}", bytes);
    }

    for i in 0x80..=0xFFu8 {
        for j in 0x80..=0xFFu8 {
            // two-byte sequences
            check_path_bytes(&[i, j]);

            // three-byte UTF-8 sequences need a lead byte of at least 0xE0;
            // smaller lead bytes are not extended further
            if i < 0xE0 {
                continue;
            }

            for k in 0x80..=0xFFu8 {
                check_path_bytes(&[i, j, k]);

                // four-byte UTF-8 sequences need a lead byte of at least 0xF0
                if i < 0xF0 {
                    continue;
                }

                for l in 0x80..=0xFFu8 {
                    check_path_bytes(&[i, j, k, l]);
                }
            }
        }
    }
}

static RESPONSE_WITH_SPACES_IN_CODE: &[u8] = b"HTTP/1.1 99 200 OK\r\n\r\n";

#[test]
Expand Down Expand Up @@ -2676,4 +2784,30 @@ mod tests {
assert_eq!(response.headers[0].name, "foo");
assert_eq!(response.headers[0].value, &b"bar"[..]);
}

// A request target containing a well-formed multi-byte UTF-8 sequence
// (`\xE2\x80\x99`, i.e. U+2019) parses completely, and the path is returned
// as a `str` that includes the decoded character.
#[test]
fn test_utf8_in_path_ok() {
    let raw: &[u8] = b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n";
    let mut header_buf = [EMPTY_HEADER; 1];
    let mut req = Request::new(&mut header_buf[..]);

    let outcome = crate::ParserConfig::default().parse_request(&mut req, raw);

    // Request line
    assert_eq!(outcome, Ok(Status::Complete(67)));
    assert_eq!(req.version.unwrap(), 1);
    assert_eq!(req.method.unwrap(), "GET");
    assert_eq!(req.path.unwrap(), "/test?post=I’msorryIforkedyou");

    // Single `Host` header
    assert_eq!(req.headers.len(), 1);
    assert_eq!(req.headers[0].name, "Host");
    assert_eq!(req.headers[0].value, &b"example.org"[..]);
}

// A lone `\xE2` lead byte followed by ASCII is not valid UTF-8, so the
// request target must be rejected with `Error::Token`.
#[test]
fn test_bad_utf8_in_path() {
    let raw: &[u8] = b"GET /test?post=I\xE2msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n";
    let mut header_buf = [EMPTY_HEADER; 1];
    let mut req = Request::new(&mut header_buf[..]);

    let outcome = crate::ParserConfig::default().parse_request(&mut req, raw);

    assert_eq!(outcome, Err(crate::Error::Token));
}
}
38 changes: 13 additions & 25 deletions src/simd/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ use crate::iter::Bytes;
#[target_feature(enable = "avx2")]
pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
while bytes.as_ref().len() >= 32 {

let advance = match_url_char_32_avx(bytes.as_ref());

bytes.advance(advance);

if advance != 32 {
Expand All @@ -28,32 +30,18 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {

let ptr = buf.as_ptr();

let LSH: __m256i = _mm256_set1_epi8(0x0f);

// See comment in sse42::match_url_char_16_sse.

let URI: __m256i = _mm256_setr_epi8(
0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c,
0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c,
);
let ARF: __m256i = _mm256_setr_epi8(
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
);

let data = _mm256_lddqu_si256(ptr as *const _);
let rbms = _mm256_shuffle_epi8(URI, data);
let cols = _mm256_and_si256(LSH, _mm256_srli_epi16(data, 4));
let bits = _mm256_and_si256(_mm256_shuffle_epi8(ARF, cols), rbms);

let v = _mm256_cmpeq_epi8(bits, _mm256_setzero_si256());
let r = _mm256_movemask_epi8(v) as u32;
// %x21-%x7e %x80-%xff
let DEL: __m256i = _mm256_set1_epi8(0x7f);
let LOW: __m256i = _mm256_set1_epi8(0x21);

r.trailing_zeros() as usize
let dat = _mm256_lddqu_si256(ptr as *const _);
// unsigned comparison dat >= LOW
let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat);
let del = _mm256_cmpeq_epi8(dat, DEL);
let bit = _mm256_andnot_si256(del, low);
let res = _mm256_movemask_epi8(bit) as u32;
// TODO: use .trailing_ones() once MSRV >= 1.46
(!res).trailing_zeros() as usize
}

#[target_feature(enable = "avx2")]
Expand Down
17 changes: 6 additions & 11 deletions src/simd/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,12 @@ unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize {
unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize {
let input = vld1q_u8(ptr);

// Check that b'!' <= input <= b'~'
let result = vandq_u8(
vcleq_u8(vdupq_n_u8(b'!'), input),
vcleq_u8(input, vdupq_n_u8(b'~')),
);
// Check that input != b'<' and input != b'>'
let lt = vceqq_u8(input, vdupq_n_u8(b'<'));
let gt = vceqq_u8(input, vdupq_n_u8(b'>'));
let ltgt = vorrq_u8(lt, gt);
// Nand with result
let result = vbicq_u8(result, ltgt);
// Check that b'!' <= input (DEL, 127, is excluded just below)
let result = vcleq_u8(vdupq_n_u8(b'!'), input);

// Disallow del
let del = vceqq_u8(input, vdupq_n_u8(0x7F));
let result = vbicq_u8(result, del);

offsetz(result) as usize
}
Expand Down
50 changes: 15 additions & 35 deletions src/simd/sse42.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use crate::iter::Bytes;
pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
while bytes.as_ref().len() >= 16 {
let advance = match_url_char_16_sse(bytes.as_ref());

bytes.advance(advance);

if advance != 16 {
Expand All @@ -14,7 +15,7 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
}

#[inline(always)]
#[allow(non_snake_case, overflowing_literals)]
#[allow(non_snake_case)]
unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
debug_assert!(buf.len() >= 16);

Expand All @@ -25,40 +26,19 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {

let ptr = buf.as_ptr();

let LSH: __m128i = _mm_set1_epi8(0x0f);

// The first 0xf8 corresponds to the 8 first rows of the first column
// of URI_MAP in the crate's root, with the first row corresponding to bit 0
// and the 8th row corresponding to bit 7.
// The 8 first rows give 0 0 0 1 1 1 1 1, which is 0xf8 (with least
// significant digit on the left).
//
// Another example just to drive the point home: in column 15, '>' is
// rejected, so the values are 0 0 1 0 1 1 1 1, which gives us 0xf4.
//
// Thanks to Vlad Krasnov for explaining this stuff to us mere mortals in
// a GitHub comment!
//
// https://github.com/seanmonstar/httparse/pull/89#issuecomment-807039219

let URI: __m128i = _mm_setr_epi8(
0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c,
);
let ARF: __m128i = _mm_setr_epi8(
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
);

let data = _mm_lddqu_si128(ptr as *const _);
let rbms = _mm_shuffle_epi8(URI, data);
let cols = _mm_and_si128(LSH, _mm_srli_epi16(data, 4));
let bits = _mm_and_si128(_mm_shuffle_epi8(ARF, cols), rbms);

let v = _mm_cmpeq_epi8(bits, _mm_setzero_si128());
let r = _mm_movemask_epi8(v) as u16;

r.trailing_zeros() as usize
// %x21-%x7e %x80-%xff
let DEL: __m128i = _mm_set1_epi8(0x7f);
let LOW: __m128i = _mm_set1_epi8(0x21);

let dat = _mm_lddqu_si128(ptr as *const _);
// unsigned comparison dat >= LOW
let low = _mm_cmpeq_epi8(_mm_max_epu8(dat, LOW), dat);
let del = _mm_cmpeq_epi8(dat, DEL);
let bit = _mm_andnot_si128(del, low);
let res = _mm_movemask_epi8(bit) as u16;

// TODO: use .trailing_ones() once MSRV >= 1.46
(!res).trailing_zeros() as usize
}

#[target_feature(enable = "sse4.2")]
Expand Down
2 changes: 1 addition & 1 deletion src/simd/swar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize {
// A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44)
// creates a u64 whose bytes are each equal to b
const fn uniform_block(b: u8) -> usize {
(b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize
(b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize
}

// A byte-wise range-check on an entire word/block,
Expand Down
Loading