Properly normalize attribute values

closes tafia#371
dralley · Oct 31, 2022 · 561c11b · 561c11b
1 parent a54f6c2
commit 561c11b
Show file tree

Hide file tree

Showing 7 changed files with 273 additions and 11 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -12,6 +12,8 @@
 
 ### New Features
 
+- [#379]: Support for the full XML attribute value normalization process
+
 ### Bug Fixes
 
 - [#490]: Ensure that serialization of map keys always produces valid XML names.
@@ -65,6 +67,11 @@
 - [#489]: Reduced the size of the package uploaded into the crates.io by excluding
   tests, examples, and benchmarks.
 
+### New Tests
+
+- [#379]: Added tests for attribute value normalization
+
+[#379]: https://github.com/tafia/quick-xml/pull/379
 [#481]: https://github.com/tafia/quick-xml/pull/481
 [#489]: https://github.com/tafia/quick-xml/pull/489
 
@@ -105,6 +112,9 @@
 - [#395]: Add support for XML Schema `xs:list`
 - [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore
   the XML declared encoding and always use UTF-8
+- [#379]: Add full support for XML attribute value normalization via
+  `Attribute::normalized_value()` and
+  `Attribute::normalized_value_with_custom_entities()`
 - [#416]: Add `borrow()` methods in all event structs which allows to get
   a borrowed version of any event
 - [#437]: Split out namespace reading functionality to a dedicated `NsReader`, namely:

diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
@@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[
     ("players.xml", PLAYERS),
 ];
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_str(doc: &str) -> XmlResult<()> {
     let mut r = Reader::from_str(doc);
     loop {
         match criterion::black_box(r.read_event()?) {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.decode_and_unescape_value(&r)?);
+                    criterion::black_box(attr?.normalized_value()?);
                 }
             }
             Event::Text(e) => {

diff --git a/benches/microbenches.rs b/benches/microbenches.rs
@@ -242,6 +242,130 @@ fn attributes(c: &mut Criterion) {
             assert_eq!(count, 150);
         })
     });
+
+    group.finish();
+}
+
+/// Benchmarks normalizing attribute values
+fn attribute_value_normalization(c: &mut Criterion) {
+    let mut group = c.benchmark_group("attribute_value_normalization");
+
+    group.bench_function("noop_short", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape(b"foobar")).unwrap();
+        })
+    });
+
+    group.bench_function("noop_long", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap();
+        })
+    });
+
+    group.bench_function("replacement_chars", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap();
+        })
+    });
+
+    group.bench_function("char_reference", |b| {
+        b.iter(|| {
+            let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
+            criterion::black_box(unescape(text)).unwrap();
+            let text = b"&#38;&#60;";
+            criterion::black_box(unescape(text)).unwrap();
+        })
+    });
+
+    group.bench_function("entity_reference", |b| {
+        b.iter(|| {
+            let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
+            criterion::black_box(unescape(text)).unwrap();
+            let text = b"&quot;what&apos;s that?&quot;";
+            criterion::black_box(unescape(text)).unwrap();
+        })
+    });
+
+    group.finish();
+}
+
+/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
+/// benchmark)
+fn bytes_text_unescaped(c: &mut Criterion) {
+    let mut group = c.benchmark_group("BytesText::unescaped");
+    group.bench_function("trim_text = false", |b| {
+        b.iter(|| {
+            let mut buf = Vec::new();
+            let mut r = Reader::from_reader(SAMPLE);
+            r.check_end_names(false).check_comments(false);
+            let mut count = criterion::black_box(0);
+            let mut nbtxt = criterion::black_box(0);
+            loop {
+                match r.read_event(&mut buf) {
+                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
+                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+                    Ok(Event::Eof) => break,
+                    _ => (),
+                }
+                buf.clear();
+            }
+            assert_eq!(
+                count, 1550,
+                "Overall tag count in ./tests/documents/sample_rss.xml"
+            );
+
+            // Windows has \r\n instead of \n
+            #[cfg(windows)]
+            assert_eq!(
+                nbtxt, 67661,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+
+            #[cfg(not(windows))]
+            assert_eq!(
+                nbtxt, 66277,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+        });
+    });
+
+    group.bench_function("trim_text = true", |b| {
+        b.iter(|| {
+            let mut buf = Vec::new();
+            let mut r = Reader::from_reader(SAMPLE);
+            r.check_end_names(false)
+                .check_comments(false)
+                .trim_text(true);
+            let mut count = criterion::black_box(0);
+            let mut nbtxt = criterion::black_box(0);
+            loop {
+                match r.read_event(&mut buf) {
+                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
+                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+                    Ok(Event::Eof) => break,
+                    _ => (),
+                }
+                buf.clear();
+            }
+            assert_eq!(
+                count, 1550,
+                "Overall tag count in ./tests/documents/sample_rss.xml"
+            );
+
+            // Windows has \r\n instead of \n
+            #[cfg(windows)]
+            assert_eq!(
+                nbtxt, 50334,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+
+            #[cfg(not(windows))]
+            assert_eq!(
+                nbtxt, 50261,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+        });
+    });
     group.finish();
 }
 
@@ -354,6 +478,7 @@ criterion_group!(
     read_resolved_event_into,
     one_event,
     attributes,
+    attribute_value_normalization,
     escaping,
     unescaping,
 );

diff --git a/src/errors.rs b/src/errors.rs
@@ -74,6 +74,7 @@ impl From<EscapeError> for Error {
 }
 
 impl From<AttrError> for Error {
+    /// Creates a new `Error::InvalidAttr` from the given error
     #[inline]
     fn from(error: AttrError) -> Self {
         Error::InvalidAttr(error)

diff --git a/src/escapei.rs b/src/escapei.rs
@@ -8,7 +8,7 @@ use std::ops::Range;
 use pretty_assertions::assert_eq;
 
 /// Error for XML escape / unescape.
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum EscapeError {
     /// Entity with Null character
     EntityWithNull(Range<usize>),
@@ -228,9 +228,8 @@ fn named_entity(name: &str) -> Option<&str> {
 #[cfg(feature = "escape-html")]
 fn named_entity(name: &str) -> Option<&str> {
     // imported from https://dev.w3.org/html5/html-author/charref
-    // match over strings are not allowed in const functions
     //TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
-    let s = match name.as_bytes() {
+    match name.as_bytes() {
         b"Tab" => "\u{09}",
         b"NewLine" => "\u{0A}",
         b"excl" => "\u{21}",
@@ -1690,7 +1689,7 @@ fn named_entity(name: &str) -> Option<&str> {
     Some(s)
 }
 
-fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
+pub(crate) fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
     let code = if bytes.starts_with('x') {
         parse_hexadecimal(&bytes[1..])
     } else {
@@ -1705,7 +1704,7 @@ fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
     }
 }
 
-fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
+pub(crate) fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
     // maximum code is 0x10FFFF => 6 characters
     if bytes.len() > 6 {
         return Err(EscapeError::TooLongHexadecimal);
@@ -1723,7 +1722,7 @@ fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
     Ok(code)
 }
 
-fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
+pub(crate) fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
     // maximum code is 0x10FFFF = 1114111 => 7 characters
     if bytes.len() > 7 {
         return Err(EscapeError::TooLongDecimal);

diff --git a/src/events/attributes.rs b/src/events/attributes.rs
@@ -4,6 +4,7 @@
 
 use crate::errors::Result as XmlResult;
 use crate::escape::{escape, unescape_with};
+use crate::escapei::{self, EscapeError};
 use crate::name::QName;
 use crate::reader::{is_whitespace, Reader};
 use crate::utils::{write_byte_string, write_cow_string, Bytes};
@@ -30,7 +31,84 @@ pub struct Attribute<'a> {
 }
 
 impl<'a> Attribute<'a> {
-    /// Decodes using UTF-8 then unescapes the value.
+    /// Returns the attribute value normalized as per the XML specification.
+    ///
+    /// https://www.w3.org/TR/xml/#AVNormalize
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters \t, \r, \n are replaced with whitespace characters.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`normalized_value_with_custom_entities()`](#method.normalized_value_with_custom_entities)
+    pub fn normalized_value(&'a self) -> Result<Cow<'a, str>, EscapeError> {
+        self.normalized_value_with(|_| None)
+    }
+
+    /// Returns the attribute value normalized as per the XML specification, using custom entities.
+    ///
+    /// https://www.w3.org/TR/xml/#AVNormalize
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters \t, \r, \n are replaced with whitespace characters.
+    /// Additional entities can be provided in `custom_entities`.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`normalized_value()`](#method.normalized_value)
+    ///
+    /// # Pre-condition
+    ///
+    /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
+    pub fn normalized_value_with<'entity>(
+        &'a self,
+        resolve_entity: impl Fn(&str) -> Option<&'entity str>,
+    ) -> Result<Cow<'a, str>, EscapeError> {
+        // TODO: avoid allocation when not needed
+        let mut normalized: Vec<u8> = Vec::with_capacity(self.value.len());
+
+        let attr = self.value.as_ref();
+        let mut attr_iter = attr.iter().enumerate();
+
+        while let Some((idx, ch)) = attr_iter.next() {
+            match ch {
+                b' ' | b'\n' | b'\r' | b'\t' => normalized.push(b' '),
+                b'&' => {
+                    let end = idx
+                        + 1
+                        + attr_iter
+                            .position(|(_, c)| *c == b';')
+                            .ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?;
+                    let entity = &attr[idx + 1..end]; // starts after the &
+
+                    if let Some(s) = escapei::named_entity(entity) {
+                        normalized.extend_from_slice(s.as_bytes());
+                    } else if entity.starts_with(b"#") {
+                        let entity = &entity[1..]; // starts after the #
+                        let codepoint = escapei::parse_number(entity, idx..end)?;
+                        escapei::push_utf8(&mut normalized, codepoint);
+                    } else if let Some(value) = custom_entities.and_then(|hm| hm.get(entity)) {
+                        // TODO: recursively apply entity substitution
+                        normalized.extend_from_slice(&value);
+                    } else {
+                        return Err(EscapeError::UnrecognizedSymbol(
+                            idx + 1..end,
+                            String::from_utf8(entity.to_vec()),
+                        ));
+                    }
+                }
+                _ => normalized.push(*ch),
+            }
+        }
+
+        Ok(Cow::Owned(normalized))
+    }
+
+    /// Returns the unescaped value.
     ///
     /// This is normally the value you are interested in. Escape sequences such as `&gt;` are
     /// replaced with their unescaped equivalents such as `>`.
@@ -791,6 +869,57 @@ mod xml {
     use super::*;
     use pretty_assertions::assert_eq;
 
+    #[test]
+    fn attribute_value_normalization() {
+        // empty value
+        let raw_value = "".as_bytes();
+        let output = "".as_bytes().to_vec();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
+
+        // return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with a space character
+        let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes();
+        let output = "  foo bar baz  delta ".as_bytes().to_vec();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
+
+        // entities must be terminated
+        let raw_value = "abc&quotdef".as_bytes();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(
+            attr.normalized_value(),
+            Err(EscapeError::UnterminatedEntity(3..11))
+        );
+
+        // unknown entities raise error
+        let raw_value = "abc&unkn;def".as_bytes();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(
+            attr.normalized_value(),
+            Err(EscapeError::UnrecognizedSymbol(4..8, Ok("unkn".to_owned()))) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedSymbol appropriate. shared with unescape code
+        );
+
+        // // custom entity replacement works, entity replacement text processed recursively
+        // let raw_value = "&d;&d;A&a;&#x20;&a;B&da;".as_bytes();
+        // let output = b"  A   B  ".to_vec();
+        // let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        // let mut custom_entities = HashMap::new();
+        // custom_entities.insert(b"d".to_vec(), b"&#xD;".to_vec());
+        // custom_entities.insert(b"a".to_vec(), b"&#xA;".to_vec());
+        // custom_entities.insert(b"da".to_vec(), b"&#xD;&#xA;".to_vec());
+        // dbg!(std::str::from_utf8(attr.normalized_value_with_custom_entities(&custom_entities).unwrap().as_ref()).unwrap());
+        // assert_eq!(
+        //     attr.normalized_value_with_custom_entities(&custom_entities),
+        //     Ok(Cow::Owned::<[u8]>(output))
+        // );
+
+        // character literal references are substituted without being replaced by spaces
+        let raw_value = "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;".as_bytes();
+        let output = "\r\rA\n\nB\r\n".as_bytes().to_vec();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
+    }
+
     /// Checked attribute is the single attribute
     mod single {
         use super::*;