Skip to content

Commit

Permalink
Properly normalize attribute values
Browse files Browse the repository at this point in the history
closes tafia#371
  • Loading branch information
dralley committed Oct 31, 2022
1 parent a54f6c2 commit 561c11b
Show file tree
Hide file tree
Showing 7 changed files with 273 additions and 11 deletions.
10 changes: 10 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

### New Features

- [#379]: Support for the full XML attribute value normalization process

### Bug Fixes

- [#490]: Ensure that serialization of map keys always produces valid XML names.
Expand Down Expand Up @@ -65,6 +67,11 @@
- [#489]: Reduced the size of the package uploaded into the crates.io by excluding
tests, examples, and benchmarks.

### New Tests

- [#379]: Added tests for attribute value normalization

[#379]: https://github.com/tafia/quick-xml/pull/379
[#481]: https://github.com/tafia/quick-xml/pull/481
[#489]: https://github.com/tafia/quick-xml/pull/489

Expand Down Expand Up @@ -105,6 +112,9 @@
- [#395]: Add support for XML Schema `xs:list`
- [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore
the XML declared encoding and always use UTF-8
- [#379]: Add full support for XML attribute value normalization via
`Attribute::normalized_value()` and
`Attribute::normalized_value_with_custom_entities()`
- [#416]: Add `borrow()` methods in all event structs which allows to get
a borrowed version of any event
- [#437]: Split out namespace reading functionality to a dedicated `NsReader`, namely:
Expand Down
3 changes: 1 addition & 2 deletions benches/macrobenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[
("players.xml", PLAYERS),
];

// TODO: use fully normalized attribute values
fn parse_document_from_str(doc: &str) -> XmlResult<()> {
let mut r = Reader::from_str(doc);
loop {
match criterion::black_box(r.read_event()?) {
Event::Start(e) | Event::Empty(e) => {
for attr in e.attributes() {
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
criterion::black_box(attr?.normalized_value()?);
}
}
Event::Text(e) => {
Expand Down
125 changes: 125 additions & 0 deletions benches/microbenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,130 @@ fn attributes(c: &mut Criterion) {
assert_eq!(count, 150);
})
});

group.finish();
}

/// Benchmarks normalizing attribute values
fn attribute_value_normalization(c: &mut Criterion) {
let mut group = c.benchmark_group("attribute_value_normalization");

group.bench_function("noop_short", |b| {
b.iter(|| {
criterion::black_box(unescape(b"foobar")).unwrap();
})
});

group.bench_function("noop_long", |b| {
b.iter(|| {
criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap();
})
});

group.bench_function("replacement_chars", |b| {
b.iter(|| {
criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap();
})
});

group.bench_function("char_reference", |b| {
b.iter(|| {
let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
criterion::black_box(unescape(text)).unwrap();
let text = b"&#38;&#60;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.bench_function("entity_reference", |b| {
b.iter(|| {
let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
criterion::black_box(unescape(text)).unwrap();
let text = b"&quot;what&apos;s that?&quot;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.finish();
}

/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
/// benchmark)
fn bytes_text_unescaped(c: &mut Criterion) {
let mut group = c.benchmark_group("BytesText::unescaped");
group.bench_function("trim_text = false", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 67661,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 66277,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});

group.bench_function("trim_text = true", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 50334,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 50261,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});
group.finish();
}

Expand Down Expand Up @@ -354,6 +478,7 @@ criterion_group!(
read_resolved_event_into,
one_event,
attributes,
attribute_value_normalization,
escaping,
unescaping,
);
Expand Down
1 change: 1 addition & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ impl From<EscapeError> for Error {
}

impl From<AttrError> for Error {
/// Creates a new `Error::InvalidAttr` from the given error
#[inline]
fn from(error: AttrError) -> Self {
Error::InvalidAttr(error)
Expand Down
11 changes: 5 additions & 6 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::ops::Range;
use pretty_assertions::assert_eq;

/// Error for XML escape / unescape.
#[derive(Debug)]
#[derive(Debug, PartialEq)]
pub enum EscapeError {
/// Entity with Null character
EntityWithNull(Range<usize>),
Expand Down Expand Up @@ -228,9 +228,8 @@ fn named_entity(name: &str) -> Option<&str> {
#[cfg(feature = "escape-html")]
fn named_entity(name: &str) -> Option<&str> {
// imported from https://dev.w3.org/html5/html-author/charref
// match over strings are not allowed in const functions
//TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
let s = match name.as_bytes() {
match name.as_bytes() {
b"Tab" => "\u{09}",
b"NewLine" => "\u{0A}",
b"excl" => "\u{21}",
Expand Down Expand Up @@ -1690,7 +1689,7 @@ fn named_entity(name: &str) -> Option<&str> {
Some(s)
}

fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
pub(crate) fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
let code = if bytes.starts_with('x') {
parse_hexadecimal(&bytes[1..])
} else {
Expand All @@ -1705,7 +1704,7 @@ fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
}
}

fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
pub(crate) fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF => 6 characters
if bytes.len() > 6 {
return Err(EscapeError::TooLongHexadecimal);
Expand All @@ -1723,7 +1722,7 @@ fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
Ok(code)
}

fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
pub(crate) fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF = 1114111 => 7 characters
if bytes.len() > 7 {
return Err(EscapeError::TooLongDecimal);
Expand Down
131 changes: 130 additions & 1 deletion src/events/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use crate::errors::Result as XmlResult;
use crate::escape::{escape, unescape_with};
use crate::escapei::{self, EscapeError};
use crate::name::QName;
use crate::reader::{is_whitespace, Reader};
use crate::utils::{write_byte_string, write_cow_string, Bytes};
Expand All @@ -30,7 +31,84 @@ pub struct Attribute<'a> {
}

impl<'a> Attribute<'a> {
/// Decodes using UTF-8 then unescapes the value.
/// Returns the attribute value normalized as per the XML specification.
///
/// https://www.w3.org/TR/xml/#AVNormalize
///
/// Do not use this method with HTML attributes.
///
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
/// and the characters \t, \r, \n are replaced with whitespace characters.
///
/// This will allocate unless the raw attribute value does not require normalization.
///
/// See also [`normalized_value_with_custom_entities()`](#method.normalized_value_with_custom_entities)
pub fn normalized_value(&'a self) -> Result<Cow<'a, str>, EscapeError> {
self.normalized_value_with(|_| None)
}

/// Returns the attribute value normalized as per the XML specification, using custom entities.
///
/// https://www.w3.org/TR/xml/#AVNormalize
///
/// Do not use this method with HTML attributes.
///
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
/// and the characters \t, \r, \n are replaced with whitespace characters.
/// Additional entities can be provided in `custom_entities`.
///
/// This will allocate unless the raw attribute value does not require normalization.
///
/// See also [`normalized_value()`](#method.normalized_value)
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn normalized_value_with<'entity>(
&'a self,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> Result<Cow<'a, str>, EscapeError> {
// TODO: avoid allocation when not needed
let mut normalized: Vec<u8> = Vec::with_capacity(self.value.len());

let attr = self.value.as_ref();
let mut attr_iter = attr.iter().enumerate();

while let Some((idx, ch)) = attr_iter.next() {
match ch {
b' ' | b'\n' | b'\r' | b'\t' => normalized.push(b' '),
b'&' => {
let end = idx
+ 1
+ attr_iter
.position(|(_, c)| *c == b';')
.ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?;
let entity = &attr[idx + 1..end]; // starts after the &

if let Some(s) = escapei::named_entity(entity) {
normalized.extend_from_slice(s.as_bytes());
} else if entity.starts_with(b"#") {
let entity = &entity[1..]; // starts after the #
let codepoint = escapei::parse_number(entity, idx..end)?;
escapei::push_utf8(&mut normalized, codepoint);
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(entity)) {
// TODO: recursively apply entity substitution
normalized.extend_from_slice(&value);
} else {
return Err(EscapeError::UnrecognizedSymbol(
idx + 1..end,
String::from_utf8(entity.to_vec()),
));
}
}
_ => normalized.push(*ch),
}
}

Ok(Cow::Owned(normalized))
}

/// Returns the unescaped value.
///
/// This is normally the value you are interested in. Escape sequences such as `&gt;` are
/// replaced with their unescaped equivalents such as `>`.
Expand Down Expand Up @@ -791,6 +869,57 @@ mod xml {
use super::*;
use pretty_assertions::assert_eq;

#[test]
fn attribute_value_normalization() {
// empty value
let raw_value = "".as_bytes();
let output = "".as_bytes().to_vec();
let attr = Attribute::from(("foo".as_bytes(), raw_value));
assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));

// return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with a space character
let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes();
let output = " foo bar baz delta ".as_bytes().to_vec();
let attr = Attribute::from(("foo".as_bytes(), raw_value));
assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));

// entities must be terminated
let raw_value = "abc&quotdef".as_bytes();
let attr = Attribute::from(("foo".as_bytes(), raw_value));
assert_eq!(
attr.normalized_value(),
Err(EscapeError::UnterminatedEntity(3..11))
);

// unknown entities raise error
let raw_value = "abc&unkn;def".as_bytes();
let attr = Attribute::from(("foo".as_bytes(), raw_value));
assert_eq!(
attr.normalized_value(),
Err(EscapeError::UnrecognizedSymbol(4..8, Ok("unkn".to_owned()))) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedSymbol appropriate. shared with unescape code
);

// // custom entity replacement works, entity replacement text processed recursively
// let raw_value = "&d;&d;A&a;&#x20;&a;B&da;".as_bytes();
// let output = b" A B ".to_vec();
// let attr = Attribute::from(("foo".as_bytes(), raw_value));
// let mut custom_entities = HashMap::new();
// custom_entities.insert(b"d".to_vec(), b"&#xD;".to_vec());
// custom_entities.insert(b"a".to_vec(), b"&#xA;".to_vec());
// custom_entities.insert(b"da".to_vec(), b"&#xD;&#xA;".to_vec());
// dbg!(std::str::from_utf8(attr.normalized_value_with_custom_entities(&custom_entities).unwrap().as_ref()).unwrap());
// assert_eq!(
// attr.normalized_value_with_custom_entities(&custom_entities),
// Ok(Cow::Owned::<[u8]>(output))
// );

// character literal references are substituted without being replaced by spaces
let raw_value = "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;".as_bytes();
let output = "\r\rA\n\nB\r\n".as_bytes().to_vec();
let attr = Attribute::from(("foo".as_bytes(), raw_value));
assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
}

/// Checked attribute is the single attribute
mod single {
use super::*;
Expand Down
Loading

0 comments on commit 561c11b

Please sign in to comment.