Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proper normalize attribute value normalization #379

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,25 @@

### New Features

- [#379]: Improved compliance with the XML attribute value normalization process by
adding `Attribute::normalized_value()` and `Attribute::normalized_value_with()`,
which ought to be used in place of `Attribute::unescape_value()` and
`Attribute::unescape_value_with()`

### Bug Fixes

### Misc Changes

- [#379]: Fixed error messages for some variants of `EscapeError`
which were incorrectly described as occurring during escaping (rather than unescaping).
- [#650]: Change the type of `Event::PI` to a new dedicated `BytesPI` type.

### New Tests

- [#379]: Added tests for attribute value normalization


[#379]: https://github.com/tafia/quick-xml/pull/379
[#650]: https://github.com/tafia/quick-xml/issues/650


Expand Down Expand Up @@ -57,9 +70,6 @@ resolve predefined entities.
- `quick_xml::escape::resolve_html5_entity`
- [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`.
- [#754]: Added parser for elements: `quick_xml::reader::ElementParser`.

### Bug Fixes

- [#622]: Fix wrong disregarding of not closed markup, such as lone `<`.
- [#684]: Fix incorrect position reported for `Error::IllFormed(DoubleHyphenInComment)`.
- [#684]: Fix incorrect position reported for `Error::IllFormed(MissingDoctypeName)`.
Expand Down Expand Up @@ -120,7 +130,6 @@ resolve predefined entities.
[`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html
[`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html


## 0.31.0 -- 2023-10-22

MSRV bumped to 1.56! Crate now uses Rust 2021 edition.
Expand Down Expand Up @@ -290,7 +299,6 @@ serde >= 1.0.181
- [#565]: Correctly set minimum required version of tokio dependency to 1.10
- [#565]: Fix compilation error when build with serde <1.0.139


[externally tagged]: https://serde.rs/enum-representations.html#externally-tagged
[#490]: https://github.com/tafia/quick-xml/pull/490
[#510]: https://github.com/tafia/quick-xml/issues/510
Expand Down
12 changes: 4 additions & 8 deletions benches/macrobenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[
("players.xml", PLAYERS),
];

// TODO: use fully normalized attribute values
fn parse_document_from_str(doc: &str) -> XmlResult<()> {
let mut r = Reader::from_str(doc);
loop {
match criterion::black_box(r.read_event()?) {
Event::Start(e) | Event::Empty(e) => {
for attr in e.attributes() {
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
criterion::black_box(attr?.decode_and_normalize_value(&r)?);
}
}
Event::Text(e) => {
Expand All @@ -67,15 +66,14 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> {
Ok(())
}

// TODO: use fully normalized attribute values
fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> {
let mut r = Reader::from_reader(doc);
let mut buf = Vec::new();
loop {
match criterion::black_box(r.read_event_into(&mut buf)?) {
Event::Start(e) | Event::Empty(e) => {
for attr in e.attributes() {
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
criterion::black_box(attr?.decode_and_normalize_value(&r)?);
}
}
Event::Text(e) => {
Expand All @@ -93,15 +91,14 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> {
Ok(())
}

// TODO: use fully normalized attribute values
fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> {
let mut r = NsReader::from_str(doc);
loop {
match criterion::black_box(r.read_resolved_event()?) {
(resolved_ns, Event::Start(e) | Event::Empty(e)) => {
criterion::black_box(resolved_ns);
for attr in e.attributes() {
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
criterion::black_box(attr?.decode_and_normalize_value(&r)?);
}
}
(resolved_ns, Event::Text(e)) => {
Expand All @@ -120,7 +117,6 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> {
Ok(())
}

// TODO: use fully normalized attribute values
fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> {
let mut r = NsReader::from_reader(doc);
let mut buf = Vec::new();
Expand All @@ -129,7 +125,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> {
(resolved_ns, Event::Start(e) | Event::Empty(e)) => {
criterion::black_box(resolved_ns);
for attr in e.attributes() {
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
criterion::black_box(attr?.decode_and_normalize_value(&r)?);
}
}
(resolved_ns, Event::Text(e)) => {
Expand Down
72 changes: 72 additions & 0 deletions benches/microbenches.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use std::borrow::Cow;

use criterion::{self, criterion_group, criterion_main, Criterion};
use pretty_assertions::assert_eq;
use quick_xml::escape::{escape, unescape};
use quick_xml::events::attributes::Attribute;
use quick_xml::events::Event;
use quick_xml::name::QName;
use quick_xml::reader::{NsReader, Reader};
Expand Down Expand Up @@ -242,6 +245,74 @@ fn attributes(c: &mut Criterion) {
assert_eq!(count, 150);
})
});

group.finish();
}

/// Benchmarks normalizing attribute values
fn attribute_value_normalization(c: &mut Criterion) {
let mut group = c.benchmark_group("attribute_value_normalization");

group.bench_function("noop_short", |b| {
let attr = Attribute {
key: QName(b"foo"),
value: Cow::Borrowed(b"foobar"),
};
b.iter(|| {
criterion::black_box(attr.normalized_value()).unwrap();
})
});

group.bench_function("noop_long", |b| {
let attr = Attribute {
key: QName(b"foo"),
value: Cow::Borrowed(LOREM_IPSUM_TEXT.as_bytes()),
};
b.iter(|| {
criterion::black_box(attr.normalized_value()).unwrap();
})
});

group.bench_function("replacement_chars", |b| {
let attr = Attribute {
key: QName(b"foo"),
value: Cow::Borrowed(b"just a bit\n of text without\tany entities"),
};
b.iter(|| {
criterion::black_box(attr.normalized_value()).unwrap();
})
});

group.bench_function("char_reference", |b| {
let attr1 = Attribute {
key: QName(b"foo"),
value: Cow::Borrowed(b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;"),
};
let attr2 = Attribute {
key: QName(b"foo"),
value: Cow::Borrowed(b"&#38;&#60;"),
};
b.iter(|| {
criterion::black_box(attr1.normalized_value()).unwrap();
criterion::black_box(attr2.normalized_value()).unwrap();
})
});

group.bench_function("entity_reference", |b| {
let attr1 = Attribute {
key: QName(b"foo"),
value: Cow::Borrowed(b"age &gt; 72 &amp;&amp; age &lt; 21"),
};
let attr2 = Attribute {
key: QName(b"foo"),
value: Cow::Borrowed(b"&quot;what&apos;s that?&quot;"),
};
b.iter(|| {
criterion::black_box(attr1.normalized_value()).unwrap();
criterion::black_box(attr2.normalized_value()).unwrap();
})
});

group.finish();
}

Expand Down Expand Up @@ -354,6 +425,7 @@ criterion_group!(
read_resolved_event_into,
one_event,
attributes,
attribute_value_normalization,
escaping,
unescaping,
);
Expand Down
1 change: 1 addition & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ impl From<EscapeError> for Error {
}

impl From<AttrError> for Error {
/// Creates a new `Error::InvalidAttr` from the given error
#[inline]
fn from(error: AttrError) -> Self {
Error::InvalidAttr(error)
Expand Down
Loading
Loading