diff --git a/Changelog.md b/Changelog.md index dfed863b..08be8fdd 100644 --- a/Changelog.md +++ b/Changelog.md @@ -37,11 +37,12 @@ resolve predefined entities. it can handle every attribute that does not match existing cases within an enum variant. - [#722]: Allow to pass owned strings to `Writer::create_element`. This is breaking change! - [#275]: Added `ElementWriter::new_line()` which enables pretty printing elements with multiple attributes. -- [#743]: Add `Deserializer::get_ref()` to get XML Reader from serde Deserializer -- [#734]: Add helper functions to resolve predefined XML and HTML5 entities: +- [#743]: Added `Deserializer::get_ref()` to get XML Reader from serde Deserializer +- [#734]: Added helper functions to resolve predefined XML and HTML5 entities: - `quick_xml::escape::resolve_predefined_entity` - `quick_xml::escape::resolve_xml_entity` - `quick_xml::escape::resolve_html5_entity` +- [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`. ### Bug Fixes @@ -50,6 +51,7 @@ resolve predefined entities. - [#684]: Fix incorrect position reported for `Error::IllFormed(MissingDoctypeName)`. - [#704]: Fix empty tags with attributes not being expanded when `expand_empty_elements` is set to true. - [#683]: Use local tag name when check tag name against possible names for field. +- [#753]: Correctly determine end of processing instructions and XML declaration. ### Misc Changes @@ -98,6 +100,7 @@ resolve predefined entities. 
[#738]: https://github.com/tafia/quick-xml/pull/738 [#743]: https://github.com/tafia/quick-xml/pull/743 [#748]: https://github.com/tafia/quick-xml/pull/748 +[#753]: https://github.com/tafia/quick-xml/pull/753 [`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html [`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html [`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 174f4ecb..6436de3a 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -91,6 +91,49 @@ macro_rules! impl_buffered_source { Ok((&buf[start..], done)) } + $($async)? fn read_pi $(<$lf>)? ( + &mut self, + buf: &'b mut Vec<u8>, + position: &mut usize, + ) -> Result<(&'b [u8], bool)> { + let mut parser = super::PiParser::default(); + + let mut read = 0; + let mut done = false; + let start = buf.len(); + while !done { + let used = { + let available = match self $(.$reader)? .fill_buf() $(.$await)? { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e.into())); + } + }; + + match parser.feed(available) { + Some(i) => { + // We do not include `>` in data + buf.extend_from_slice(&available[..i - 1]); + done = true; + i + } + None => { + buf.extend_from_slice(available); + available.len() + } + } + }; + self $(.$reader)? .consume(used); + read += used; + } + *position += read; + + Ok((&buf[start..], done)) + } + $($async)? fn read_bang_element $(<$lf>)? ( &mut self, buf: &'b mut Vec<u8>, diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 5f59446e..6de103af 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -361,7 +361,7 @@ macro_rules! read_until_close { }, // ` match $reader - .read_bytes_until(b'>', $buf, &mut $self.state.offset) + .read_pi($buf, &mut $self.state.offset) $(.$await)? 
{ Ok((bytes, true)) => $self.state.emit_question_mark(bytes), @@ -428,10 +428,12 @@ macro_rules! read_to_end { mod async_tokio; mod buffered_reader; mod ns_reader; +mod pi; mod slice_reader; mod state; pub use ns_reader::NsReader; +pub use pi::PiParser; /// Range of input in bytes, that corresponds to some piece of XML pub type Span = Range<usize>; @@ -816,12 +818,29 @@ trait XmlSource<'r, B> { position: &mut usize, ) -> Result<(&'r [u8], bool)>; - /// Read input until comment, CDATA or processing instruction is finished. + /// Read input until processing instruction is finished. + /// + /// This method expects that `<?` already was read. + /// + /// Returns a slice of data read up to end of processing instruction (`>`), + /// which is not included in the result (`?` at the end is included). + /// + /// If input (`Self`) is exhausted before `?>` is found, the returned flag is `false`. + /// + /// # Parameters + /// - `buf`: Buffer that could be filled from an input (`Self`) and + /// from which [events] could borrow their data + /// - `position`: Will be increased by amount of bytes consumed + /// + /// [events]: crate::events::Event + fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<(&'r [u8], bool)>; + + /// Read input until comment or CDATA is finished. /// /// This method expect that `<` already was read. /// - /// Returns a slice of data read up to end of comment, CDATA or processing - /// instruction (`>`), which does not include into result. + /// Returns a slice of data read up to end of comment or CDATA (`>`), + /// which is not included in the result. /// /// If input (`Self`) is exhausted and nothing was read, returns `None`. /// @@ -1764,11 +1783,11 @@ mod test { #[$test] $($async)? fn processing_instruction() { - let mut reader = Reader::from_str("<?xml-stylesheet?>"); + let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>"); assert_eq!( reader.$read_event($buf) $(.$await)? .unwrap(), - Event::PI(BytesText::from_escaped("xml-stylesheet")) + Event::PI(BytesText::from_escaped("xml-stylesheet '? 
>\" ")) ); } diff --git a/src/reader/pi.rs b/src/reader/pi.rs new file mode 100644 index 00000000..7729b3ed --- /dev/null +++ b/src/reader/pi.rs @@ -0,0 +1,105 @@ +//! Contains a parser for an XML processing instruction. + +/// A parser that searches for a `?>` sequence in the slice. +/// +/// To use a parser create an instance of parser and [`feed`] data into it. +/// After successful search the parser will return [`Some`] with position where +/// processing instruction is ended (the position after `?>`). If search was +/// unsuccessful, a [`None`] will be returned. You typically would expect positive +/// result of search, so you should feed new data until you get it. +/// +/// NOTE: after successful match the parser is not returned to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// new search. +/// +/// # Example +/// +/// ``` +/// # use quick_xml::reader::PiParser; +/// # use pretty_assertions::assert_eq; +/// let mut parser = PiParser::default(); +/// +/// // Parse `<?instruction with = 'some > and ?' inside?>and the text follow...` +/// // split into three chunks +/// assert_eq!(parser.feed(b"<?instruction"), None); +/// // ...get new chunk of data +/// assert_eq!(parser.feed(b" with = 'some > and ?"), None); +/// // ...get another chunk of data +/// assert_eq!(parser.feed(b"' inside?>and the text follow..."), Some(10)); +/// // ^ ^ +/// // 0 10 +/// ``` +/// +/// [`feed`]: Self::feed() +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct PiParser( + /// A flag that indicates whether the `bytes` passed to the previous attempt to find the + /// end ended with `?`. + pub bool, +); + +impl PiParser { + /// Determines the end position of a processing instruction in the provided slice. + /// Processing instruction ends on the first occurrence of `?>` which cannot be + /// escaped. + /// + /// Returns position after the `?>` or `None` if such sequence was not found. + /// + /// [Section 2.6]: Parameter entity references MUST NOT be recognized within + /// processing instructions, so the parser does not search for them. 
+ /// + /// # Parameters + /// - `bytes`: a slice to find the end of a processing instruction. + /// Should contain text in ASCII-compatible encoding + /// + /// [Section 2.6]: https://www.w3.org/TR/xml11/#sec-pi + pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> { + for i in memchr::memchr_iter(b'>', bytes) { + match i { + // +1 so that the returned position is just past `>` + 0 if self.0 => return Some(1), + // If the previous byte is `?`, then we found `?>` + // +1 so that the returned position is just past `>` + i if i > 0 && bytes[i - 1] == b'?' => return Some(i + 1), + _ => {} + } + } + self.0 = bytes.last().copied() == Some(b'?'); + None + } +} + +#[test] +fn pi() { + use pretty_assertions::assert_eq; + + /// Returns `Ok(pos)` with the position in the buffer where processing + /// instruction is ended. + /// + /// Returns `Err(internal_state)` if parsing is not done yet. + fn parse_pi(bytes: &[u8], had_question_mark: bool) -> Result<usize, bool> { + let mut parser = PiParser(had_question_mark); + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser.0), + } + } + + // Comments show which character was seen last before calling `feed`. + // `x` means any character, pipe denotes start of the buffer that is passed to `feed` + + assert_eq!(parse_pi(b"", false), Err(false)); // x| + assert_eq!(parse_pi(b"", true), Err(false)); // ?| + + assert_eq!(parse_pi(b"?", false), Err(true)); // x|? + assert_eq!(parse_pi(b"?", true), Err(true)); // ?|? 
+ + assert_eq!(parse_pi(b">", false), Err(false)); // x|> + assert_eq!(parse_pi(b">", true), Ok(1)); // ?|> + + assert_eq!(parse_pi(b"?>", false), Ok(2)); // x|?> + assert_eq!(parse_pi(b"?>", true), Ok(2)); // ?|?> + + assert_eq!(parse_pi(b">?>", false), Ok(3)); // x|>?> + assert_eq!(parse_pi(b">?>", true), Ok(1)); // ?|>?> +} diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 5e686da2..b618ae65 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -12,7 +12,7 @@ use encoding_rs::{Encoding, UTF_8}; use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; +use crate::reader::{is_whitespace, BangType, PiParser, ReadElementState, Reader, Span, XmlSource}; /// This is an implementation for reading from a `&[u8]` as underlying byte stream. /// This implementation supports not using an intermediate buffer as the byte slice @@ -275,6 +275,23 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } } + fn read_pi(&mut self, _buf: (), position: &mut usize) -> Result<(&'a [u8], bool)> { + let mut parser = PiParser::default(); + + if let Some(i) = parser.feed(self) { + *position += i; + // We do not include `>` in data + let bytes = &self[..i - 1]; + *self = &self[i..]; + Ok((bytes, true)) + } else { + *position += self.len(); + let bytes = &self[..]; + *self = &[]; + Ok((bytes, false)) + } + } + fn read_bang_element( &mut self, _buf: (),