// Patch excerpt for `src/events/attributes.rs`: adds `normalize_attribute_value`, routes the
// attribute values produced by the `Attributes` iterator through it, and adds a unit test.

/// Normalizes the raw bytes of an attribute value, following the attribute-value normalization
/// rules of the XML 1.0 specification (section 3.3.3, <https://www.w3.org/TR/xml/#AVNormalize>).
///
/// The specification describes the algorithm as follows:
///
/// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line
///    Handling, so the rest of this algorithm operates on text normalized in this way.
/// 2) Begin with a normalized value consisting of the empty string.
/// 3) For each character, entity reference, or character reference in the unnormalized attribute
///    value, beginning with the first and continuing to the last, do the following:
///    * For a character reference, append the referenced character to the normalized value.
///    * For an entity reference, recursively apply step 3 of this algorithm to the replacement
///      text of the entity.
///    * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the
///      normalized value.
///    * For another character, append the character to the normalized value.
///
/// If the attribute type is not CDATA, then the XML processor MUST further process the normalized
/// attribute value by discarding any leading and trailing space (#x20) characters, and by
/// replacing sequences of space (#x20) characters by a single space (#x20) character.
///
/// Note that if the unnormalized attribute value contains a character reference to a white space
/// character other than space (#x20), the normalized value contains the referenced character
/// itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized value contains a
/// white space character (not a reference), which is replaced with a space character (#x20) in
/// the normalized value and also contrasts with the case where the unnormalized value contains an
/// entity reference whose replacement text contains a white space character; being recursively
/// processed, the white space character is replaced with a space character (#x20) in the
/// normalized value.
///
/// # What this implementation does
///
/// * Bytes `\n`, `\r`, `\t` and space are all treated as white space.
/// * Leading and trailing white space is removed.
/// * Every interior run of white space becomes exactly one space (`0x20`).
/// * Character references and entity references are **not** processed yet (see the TODO below);
///   their bytes are copied through unchanged.
///
/// NOTE(review): the trimming/collapsing step is applied unconditionally, whereas the quoted
/// specification text requires it only for attributes whose type is not CDATA — confirm that
/// this is the intended behavior for callers.
///
/// # Returns
///
/// `Cow::Borrowed` (a sub-slice of `attr`, or an empty slice when `attr` is empty or all white
/// space) whenever the normalized bytes already appear contiguously in the input, i.e. every
/// interior white-space run is a single literal space. `Cow::Owned` only when a byte actually
/// has to be replaced or dropped.
fn normalize_attribute_value(attr: &[u8]) -> Cow<'_, [u8]> {
    // TODO: character references, entity references, error handling associated with those

    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    // Locate the first byte that survives trimming; if there is none, the whole value (possibly
    // empty) consists of white space and normalizes to the empty string.
    let begin = match attr.iter().position(|&b| !is_whitespace_like(b)) {
        Some(index) => index,
        None => return Cow::Borrowed(b""),
    };
    // A first non-white-space byte exists, so a last one exists as well (at worst the same
    // index). Falling back to `begin` keeps the inclusive range below panic-free regardless.
    let end = attr
        .iter()
        .rposition(|&b| !is_whitespace_like(b))
        .unwrap_or(begin);
    let trimmed = &attr[begin..=end];

    // Lazily created output buffer. It stays `None` for as long as the output produced so far is
    // byte-for-byte identical to the prefix of `trimmed` consumed so far.
    let mut normalized: Option<Vec<u8>> = None;
    // True while we are inside a run of white space whose single replacement space has already
    // been accounted for. Starts false because `trimmed` begins with a non-white-space byte.
    let mut in_whitespace = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        if !is_whitespace_like(byte) {
            in_whitespace = false;
            if let Some(buf) = normalized.as_mut() {
                buf.push(byte);
            }
            continue;
        }

        if in_whitespace {
            // Second or later byte of a run: it is dropped, so the output now diverges from the
            // input. Everything before `idx` (including the run's first byte, which must have
            // been a literal space or the buffer would already exist) is still valid output.
            if normalized.is_none() {
                normalized = Some(trimmed[..idx].to_vec());
            }
            continue;
        }

        // First byte of a run: it contributes exactly one space to the output.
        in_whitespace = true;
        match normalized.as_mut() {
            Some(buf) => buf.push(b' '),
            // A literal space maps to itself, so no buffer is needed yet.
            None if byte == b' ' => {}
            // `\n`, `\r` or `\t` must be substituted: start the buffer here.
            None => {
                let mut buf = Vec::with_capacity(trimmed.len());
                buf.extend_from_slice(&trimmed[..idx]);
                buf.push(b' ');
                normalized = Some(buf);
            }
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}

// Call-site change carried by the same patch, inside the attribute-building macro arm
// `($key:expr, $val:expr)` of `impl<'a> Iterator for Attributes<'a>` (the surrounding code is
// not part of this excerpt):
//
//     -    value: Cow::Borrowed(&self.bytes[$val]),
//     +    value: normalize_attribute_value(&self.bytes[$val]),

#[test]
fn attribute_value_normalization() {
    // empty value
    assert_eq!(&*normalize_attribute_value(b""), b"");
    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    assert_eq!(
        &*normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n"),
        b"foo bar baz delta"
    );
    // leading and trailing spaces must be stripped
    assert_eq!(&*normalize_attribute_value(b"  foo  "), b"foo");
    // leading space
    assert_eq!(&*normalize_attribute_value(b" bar"), b"bar");
    // trailing space
    assert_eq!(&*normalize_attribute_value(b"baz "), b"baz");
    // sequences of spaces must be replaced with a single space
    assert_eq!(
        &*normalize_attribute_value(b"  foo   bar   baz  "),
        b"foo bar baz"
    );
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    assert_eq!(
        &*normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"),
        b"foo bar baz delta echo foxtrot"
    );
    // values that are already normalized (apart from trimming) are borrowed, not copied
    assert!(matches!(
        normalize_attribute_value(b" foo bar "),
        Cow::Borrowed(_)
    ));
}