Skip to content

Commit

Permalink
Merge pull request #753 from Mingun/fix-pi-parsing
Browse files Browse the repository at this point in the history
Fix processing instruction parsing
  • Loading branch information
Mingun committed Jun 5, 2024
2 parents 3edb78b + 6231ac0 commit 385a1f8
Show file tree
Hide file tree
Showing 5 changed files with 196 additions and 9 deletions.
7 changes: 5 additions & 2 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@ resolve predefined entities.
it can handle every attribute that does not match existing cases within an enum variant.
- [#722]: Allow to pass owned strings to `Writer::create_element`. This is breaking change!
- [#275]: Added `ElementWriter::new_line()` which enables pretty printing elements with multiple attributes.
- [#743]: Add `Deserializer::get_ref()` to get XML Reader from serde Deserializer
- [#734]: Add helper functions to resolve predefined XML and HTML5 entities:
- [#743]: Added `Deserializer::get_ref()` to get XML Reader from serde Deserializer
- [#734]: Added helper functions to resolve predefined XML and HTML5 entities:
- `quick_xml::escape::resolve_predefined_entity`
- `quick_xml::escape::resolve_xml_entity`
- `quick_xml::escape::resolve_html5_entity`
- [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`.

### Bug Fixes

Expand All @@ -50,6 +51,7 @@ resolve predefined entities.
- [#684]: Fix incorrect position reported for `Error::IllFormed(MissingDoctypeName)`.
- [#704]: Fix empty tags with attributes not being expanded when `expand_empty_elements` is set to true.
- [#683]: Use local tag name when check tag name against possible names for field.
- [#753]: Correctly determine end of processing instructions and XML declaration.

### Misc Changes

Expand Down Expand Up @@ -98,6 +100,7 @@ resolve predefined entities.
[#738]: https://github.com/tafia/quick-xml/pull/738
[#743]: https://github.com/tafia/quick-xml/pull/743
[#748]: https://github.com/tafia/quick-xml/pull/748
[#753]: https://github.com/tafia/quick-xml/pull/753
[`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html
[`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html
[`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html
Expand Down
43 changes: 43 additions & 0 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,49 @@ macro_rules! impl_buffered_source {
Ok((&buf[start..], done))
}

/// Reads a processing instruction from the underlying buffered reader into `buf`.
///
/// Data is pulled chunk by chunk via `fill_buf()` and fed to a [`PiParser`](super::PiParser)
/// until the parser reports the end of the processing instruction or the
/// reader returns an empty chunk.
///
/// Returns the part of `buf` appended by this call together with a flag that
/// is `true` if the end of the processing instruction was found and `false`
/// if the input ended first. The closing `>` is consumed from the reader but
/// is not stored in `buf`.
///
/// # Parameters
/// - `buf`: buffer to which the read data is appended
/// - `position`: increased by the number of bytes consumed from the reader
$($async)? fn read_pi $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> Result<(&'b [u8], bool)> {
// The parser keeps state between chunks, so a `?>` split across two chunks is still found
let mut parser = super::PiParser::default();

// Total number of bytes consumed from the reader by this call
let mut read = 0;
let mut done = false;
// Data already in `buf` is not part of the result; remember where ours begins
let start = buf.len();
while !done {
let used = {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
// An empty chunk means the input is exhausted: stop with `done == false`
Ok(n) if n.is_empty() => break,
Ok(n) => n,
// An interrupted read is retried
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
// Account for the bytes consumed so far before reporting the error
*position += read;
return Err(Error::Io(e.into()));
}
};

match parser.feed(available) {
Some(i) => {
// We do not include `>` in data
buf.extend_from_slice(&available[..i - 1]);
done = true;
// ...but `>` is still consumed from the reader
i
}
None => {
// No end in this chunk: store it whole and ask for more data
buf.extend_from_slice(available);
available.len()
}
}
};
self $(.$reader)? .consume(used);
read += used;
}
*position += read;

Ok((&buf[start..], done))
}

$($async)? fn read_bang_element $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
Expand Down
31 changes: 25 additions & 6 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ macro_rules! read_until_close {
},
// `<?` - processing instruction
Ok(Some(b'?')) => match $reader
.read_bytes_until(b'>', $buf, &mut $self.state.offset)
.read_pi($buf, &mut $self.state.offset)
$(.$await)?
{
Ok((bytes, true)) => $self.state.emit_question_mark(bytes),
Expand Down Expand Up @@ -428,10 +428,12 @@ macro_rules! read_to_end {
mod async_tokio;
mod buffered_reader;
mod ns_reader;
mod pi;
mod slice_reader;
mod state;

pub use ns_reader::NsReader;
pub use pi::PiParser;

/// Range of input in bytes, that corresponds to some piece of XML
pub type Span = Range<usize>;
Expand Down Expand Up @@ -816,12 +818,29 @@ trait XmlSource<'r, B> {
position: &mut usize,
) -> Result<(&'r [u8], bool)>;

/// Read input until comment, CDATA or processing instruction is finished.
/// Reads input until the processing instruction is finished.
///
/// This method expects that `<?` was already read.
///
/// Returns a slice of data read up to the end of the processing instruction.
/// The closing `>` is not included in the result, but the `?` before it is.
///
/// The boolean in the returned tuple is `true` if the end of the processing
/// instruction was found and `false` if the input (`Self`) was exhausted
/// first; in the latter case the slice contains all the data that was read.
///
/// # Parameters
/// - `buf`: Buffer that could be filled from an input (`Self`) and
/// from which [events] could borrow their data
/// - `position`: Will be increased by the number of bytes consumed
///
/// [events]: crate::events::Event
fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<(&'r [u8], bool)>;

/// Read input until comment or CDATA is finished.
///
/// This method expect that `<` already was read.
///
/// Returns a slice of data read up to end of comment, CDATA or processing
/// instruction (`>`), which does not include into result.
/// Returns a slice of data read up to end of comment or CDATA (`>`),
/// which does not include into result.
///
/// If input (`Self`) is exhausted and nothing was read, returns `None`.
///
Expand Down Expand Up @@ -1764,11 +1783,11 @@ mod test {

#[$test]
$($async)? fn processing_instruction() {
let mut reader = Reader::from_str("<?xml-stylesheet?>");
let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");

assert_eq!(
reader.$read_event($buf) $(.$await)? .unwrap(),
Event::PI(BytesText::from_escaped("xml-stylesheet"))
Event::PI(BytesText::from_escaped("xml-stylesheet '? >\" "))
);
}

Expand Down
105 changes: 105 additions & 0 deletions src/reader/pi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
//! Contains a parser for an XML processing instruction.

/// A parser that searches for a `?>` sequence in a slice.
///
/// To use the parser, create an instance of it and [`feed`] data into it.
/// After a successful search the parser will return [`Some`] with the position
/// where the processing instruction ends (the position after `?>`). If the search
/// was unsuccessful, [`None`] will be returned. You would typically expect a
/// positive result, so you should feed new data until you get one.
///
/// NOTE: after a successful match the parser does not return to the initial
/// state and should not be used anymore. Create a new parser if you want to
/// perform a new search.
///
/// # Example
///
/// ```
/// # use quick_xml::reader::PiParser;
/// # use pretty_assertions::assert_eq;
/// let mut parser = PiParser::default();
///
/// // Parse `<?instruction with = 'some > and ?' inside?>and the text follow...`
/// // split into three chunks
/// assert_eq!(parser.feed(b"<?instruction"), None);
/// // ...get new chunk of data
/// assert_eq!(parser.feed(b" with = 'some > and ?"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b"' inside?>and the text follow..."), Some(10));
/// // ^ ^
/// // 0 10
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PiParser(
/// A flag that indicates whether the `bytes` passed to the previous attempt
/// to find the end of the processing instruction ended with `?`.
pub bool,
);

impl PiParser {
    /// Looks for the end of a processing instruction in the provided slice.
    ///
    /// A processing instruction ends at the first occurrence of `?>`; this
    /// sequence cannot be escaped.
    ///
    /// Returns the position just after the `?>`, or `None` if no such sequence
    /// was found. In the `None` case the parser remembers whether `bytes` ended
    /// with `?`, so that a `>` at the very start of the next chunk is recognized
    /// as the end of the sequence.
    ///
    /// Per [Section 2.6], parameter entity references MUST NOT be recognized
    /// within processing instructions, so the parser does not search for them.
    ///
    /// # Parameters
    /// - `bytes`: a slice in which to find the end of a processing instruction.
    ///   Should contain text in an ASCII-compatible encoding
    ///
    /// [Section 2.6]: https://www.w3.org/TR/xml11/#sec-pi
    pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
        // Whether the chunk fed before this one ended with `?`
        let pending_question_mark = self.0;

        // Position of the first `>` that is directly preceded by `?`
        let closing = memchr::memchr_iter(b'>', bytes).find(|&gt| match gt.checked_sub(1) {
            // The preceding byte is inside this chunk
            Some(prev) => bytes[prev] == b'?',
            // `>` is the very first byte, the preceding byte was in the previous chunk
            None => pending_question_mark,
        });

        // The state is updated only when nothing was found
        if closing.is_none() {
            self.0 = bytes.last() == Some(&b'?');
        }

        // +1 for `>` which should be included in event
        closing.map(|gt| gt + 1)
    }
}

#[test]
fn pi() {
    use pretty_assertions::assert_eq;

    /// Feeds `bytes` to a parser whose "previous chunk ended with `?`" flag
    /// is preset to `had_question_mark`.
    ///
    /// Returns `Ok(pos)` with the position in the buffer where the processing
    /// instruction ends, or `Err(internal_state)` with the flag stored in the
    /// parser if parsing is not done yet.
    fn parse_pi(bytes: &[u8], had_question_mark: bool) -> Result<usize, bool> {
        let mut parser = PiParser(had_question_mark);
        let end = parser.feed(bytes);
        end.ok_or(parser.0)
    }

    // Each row is (chunk passed to `feed`, was `?` seen right before it, expected result).
    // In the trailing comments `x` means any character and the pipe denotes
    // the start of the buffer that is passed to `feed`.
    let cases: [(&[u8], bool, Result<usize, bool>); 10] = [
        (b"", false, Err(false)),  // x|
        (b"", true, Err(false)),   // ?|
        (b"?", false, Err(true)),  // x|?
        (b"?", true, Err(true)),   // ?|?
        (b">", false, Err(false)), // x|>
        (b">", true, Ok(1)),       // ?|>
        (b"?>", false, Ok(2)),     // x|?>
        (b"?>", true, Ok(2)),      // ?|?>
        (b">?>", false, Ok(3)),    // x|>?>
        (b">?>", true, Ok(1)),     // ?|>?>
    ];

    for &(bytes, had_question_mark, expected) in &cases {
        assert_eq!(
            parse_pi(bytes, had_question_mark),
            expected,
            "input: {:?}, `?` seen before: {}",
            bytes,
            had_question_mark
        );
    }
}
19 changes: 18 additions & 1 deletion src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use encoding_rs::{Encoding, UTF_8};
use crate::errors::{Error, Result, SyntaxError};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
use crate::reader::{is_whitespace, BangType, PiParser, ReadElementState, Reader, Span, XmlSource};

/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
/// This implementation supports not using an intermediate buffer as the byte slice
Expand Down Expand Up @@ -275,6 +275,23 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
}
}

/// Reads a processing instruction directly from the underlying slice.
///
/// On success returns the data up to (but not including) the closing `>`
/// together with `true` and advances `self` past the `>`. If the end of the
/// processing instruction is not found, returns all remaining data together
/// with `false` and leaves `self` empty. In both cases `position` is
/// increased by the number of bytes consumed.
fn read_pi(&mut self, _buf: (), position: &mut usize) -> Result<(&'a [u8], bool)> {
    // Copy the inner reference out so the returned slices live for `'a`
    let input: &'a [u8] = *self;

    match PiParser::default().feed(input) {
        Some(end) => {
            *position += end;
            *self = &input[end..];
            // `end` points just after `>`, which we do not include in data
            Ok((&input[..end - 1], true))
        }
        None => {
            *position += input.len();
            *self = &[];
            Ok((input, false))
        }
    }
}

fn read_bang_element(
&mut self,
_buf: (),
Expand Down

0 comments on commit 385a1f8

Please sign in to comment.