From 1cdea623101ab0e996144b6b4b48cc91b1718370 Mon Sep 17 00:00:00 2001
From: Daniel Alley
Date: Sat, 2 Apr 2022 21:14:52 -0400
Subject: [PATCH] Properly normalize attribute values

closes #371
---
 src/events/attributes.rs | 123 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

diff --git a/src/events/attributes.rs b/src/events/attributes.rs
index d5aa7db2..165e3996 100644
--- a/src/events/attributes.rs
+++ b/src/events/attributes.rs
@@ -331,6 +331,80 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
     }
 }
 
+/// Normalizes an attribute value as described in the [XML specification, section 3.3.3].
+///
+/// The algorithm given by the specification is:
+///
+/// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line
+///    Handling, so the rest of this algorithm operates on text normalized in this way.
+/// 2) Begin with a normalized value consisting of the empty string.
+/// 3) For each character, entity reference, or character reference in the unnormalized attribute
+///    value, beginning with the first and continuing to the last, do the following:
+///    * For a character reference, append the referenced character to the normalized value.
+///    * For an entity reference, recursively apply step 3 of this algorithm to the replacement
+///      text of the entity.
+///    * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the
+///      normalized value.
+///    * For another character, append the character to the normalized value.
+///
+/// If the attribute type is not CDATA, then the XML processor MUST further process the normalized
+/// attribute value by discarding any leading and trailing space (#x20) characters, and by
+/// replacing sequences of space (#x20) characters by a single space (#x20) character.
+///
+/// Note that if the unnormalized attribute value contains a character reference to a white space
+/// character other than space (#x20), the normalized value contains the referenced character
+/// itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized value contains a
+/// white space character (not a reference), which is replaced with a space character (#x20) in
+/// the normalized value and also contrasts with the case where the unnormalized value contains an
+/// entity reference whose replacement text contains a white space character; being recursively
+/// processed, the white space character is replaced with a space character (#x20) in the
+/// normalized value.
+///
+/// This implementation currently applies the non-CDATA rules to every value and does not yet
+/// expand references: tab, carriage return, line feed and space are all treated as a space,
+/// leading and trailing spaces are dropped, and runs of spaces collapse to a single space.
+/// A value that is already in that form is returned as-is, without allocating.
+///
+/// [XML specification, section 3.3.3]: https://www.w3.org/TR/xml/#AVNormalize
+fn normalize_attribute_value(attr: Cow<[u8]>) -> Cow<[u8]> {
+    // TODO: character references, entity references, error handling associated with those
+
+    // Fast path: without tab/CR/LF bytes, a leading or trailing space, or two adjacent spaces
+    // there is nothing to rewrite, so hand the input back untouched.
+    let already_normalized = {
+        let bytes: &[u8] = &attr;
+        bytes.first() != Some(&b' ')
+            && bytes.last() != Some(&b' ')
+            && !bytes.iter().any(|&b| b == b'\t' || b == b'\n' || b == b'\r')
+            && !bytes.windows(2).any(|pair| pair[0] == b' ' && pair[1] == b' ')
+    };
+    if already_normalized {
+        return attr;
+    }
+
+    let mut normalized = Vec::with_capacity(attr.len());
+    // Set while whitespace has been seen after some content; the separating space is only
+    // written once more content follows, which drops trailing whitespace for free.
+    let mut pending_space = false;
+    for &byte in attr.iter() {
+        match byte {
+            b' ' | b'\t' | b'\n' | b'\r' => {
+                // Nothing has been written yet for leading whitespace, so no separator is owed.
+                pending_space = !normalized.is_empty();
+            }
+            _ => {
+                if pending_space {
+                    normalized.push(b' ');
+                    pending_space = false;
+                }
+                normalized.push(byte);
+            }
+        }
+    }
+
+    Cow::Owned(normalized)
+}
+
 impl<'a> Iterator for Attributes<'a> {
     type Item = Result<Attribute<'a>>;
     fn next(&mut self) -> Option<Self::Item> {
@@ -355,7 +429,7 @@ impl<'a> Iterator for Attributes<'a> {
             ($key:expr, $val:expr) => {
                 Some(Ok(Attribute {
                     key: &self.bytes[$key],
-                    value: Cow::Borrowed(&self.bytes[$val]),
+                    value: normalize_attribute_value(Cow::Borrowed(&self.bytes[$val])),
                 }))
             };
         }
@@ -513,4 +587,51 @@ mod tests {
         assert_eq!(&*a.value, b"ee");
         assert!(attributes.next().is_none());
     }
+
+    #[test]
+    fn attribute_value_normalization() {
+        // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
+        assert_eq!(
+            normalize_attribute_value(Cow::Borrowed(b"\rfoo\rbar\tbaz\ndelta\n")).as_ref(),
+            b"foo bar baz delta"
+        );
+        // leading and trailing spaces must be stripped
+        assert_eq!(
+            normalize_attribute_value(Cow::Borrowed(b" foo ")).as_ref(),
+            b"foo"
+        );
+        // leading space
+        assert_eq!(
+            normalize_attribute_value(Cow::Borrowed(b" bar")).as_ref(),
+            b"bar"
+        );
+        // trailing space
+        assert_eq!(
+            normalize_attribute_value(Cow::Borrowed(b"baz ")).as_ref(),
+            b"baz"
+        );
+        // sequences of spaces must be replaced with a single space
+        assert_eq!(
+            normalize_attribute_value(Cow::Borrowed(b"  foo   bar  baz  ")).as_ref(),
+            b"foo bar baz"
+        );
+        // sequence replacement mixed with characters treated as whitespace (\t \r \n)
+        assert_eq!(
+            normalize_attribute_value(Cow::Borrowed(
+                b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"
+            ))
+            .as_ref(),
+            b"foo bar baz delta echo foxtrot"
+        );
+        // a value made only of whitespace normalizes to the empty string
+        assert_eq!(
+            normalize_attribute_value(Cow::Borrowed(b" \t\r\n")).as_ref(),
+            b""
+        );
+        // a value that is already normalized is returned without copying
+        match normalize_attribute_value(Cow::Borrowed(b"foo bar")) {
+            Cow::Borrowed(value) => assert_eq!(value, b"foo bar"),
+            Cow::Owned(_) => panic!("already-normalized value must not be reallocated"),
+        }
+    }
 }