Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix incorrect missing of trimming all-space text events when trim_text_start = false and trim_text_end = true #755

Merged
merged 13 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,15 @@

### Bug Fixes

- [#755]: Fix incorrect missing of trimming all-space text events when
`trim_text_start = false` and `trim_text_end = true`.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Re: " events should not be trimmed at boundary of text / CDATA, or text / PI, or text / comment in some cases", is there an issue filed for that with some examples of such boundary cases?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, such issue does not exist. I made investigation in #520 for serde deserializer and implement reading text properly. I cross-checked that using Java XmlBeans library.

### Misc Changes

- [#650]: Change the type of `Event::PI` to a new dedicated `BytesPI` type.

[#650]: https://github.com/tafia/quick-xml/issues/650
[#755]: https://github.com/tafia/quick-xml/pull/755


## 0.32.0 -- 2024-06-10
Expand Down
15 changes: 2 additions & 13 deletions src/reader/async_tokio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ use crate::events::Event;
use crate::name::{QName, ResolveResult};
use crate::reader::buffered_reader::impl_buffered_source;
use crate::reader::{
is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, Reader, Span,
is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, ReadTextResult,
Reader, Span,
};

/// A struct for read XML asynchronously from an [`AsyncBufRead`].
Expand Down Expand Up @@ -77,7 +78,6 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
read_event_impl!(
self, buf,
TokioAdapter(&mut self.reader),
read_until_open_async,
read_until_close_async,
await
)
Expand Down Expand Up @@ -141,17 +141,6 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
}

/// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
///
/// Returns inner `Ok` if the loop should be broken and an event returned.
/// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular.
async fn read_until_open_async<'b>(
&mut self,
buf: &'b mut Vec<u8>,
) -> Result<std::result::Result<Event<'b>, &'b mut Vec<u8>>> {
read_until_open!(self, buf, TokioAdapter(&mut self.reader), read_event_into_async, await)
}

/// Private function to read until `>` is found. This function expects that
/// it was called just after encounter a `<` symbol.
async fn read_until_close_async<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
Expand Down
140 changes: 89 additions & 51 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ use std::path::Path;
use crate::errors::{Error, Result};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};
use crate::reader::{is_whitespace, BangType, Parser, ReadTextResult, Reader, Span, XmlSource};

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
#[cfg(not(feature = "encoding"))]
#[inline]
$($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
$($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
use crate::encoding::UTF8_BOM;

loop {
Expand All @@ -26,14 +26,14 @@ macro_rules! impl_buffered_source {
Ok(())
},
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
Err(e) => Err(e),
};
}
}

#[cfg(feature = "encoding")]
#[inline]
$($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
$($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
Expand All @@ -43,54 +43,106 @@ macro_rules! impl_buffered_source {
Ok(None)
},
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
Err(e) => Err(e),
};
}
}

#[inline]
$($async)? fn read_text $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> ReadTextResult<'b, &'b mut Vec<u8>> {
let mut read = 0;
let start = buf.len();
loop {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return ReadTextResult::Err(e);
}
};

match memchr::memchr(b'<', available) {
Some(0) => {
self $(.$reader)? .consume(1);
*position += 1;
return ReadTextResult::Markup(buf);
}
Some(i) => {
buf.extend_from_slice(&available[..i]);

let used = i + 1;
self $(.$reader)? .consume(used);
read += used;

*position += read;
return ReadTextResult::UpToMarkup(&buf[start..]);
}
None => {
buf.extend_from_slice(available);

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
}
}
}

*position += read;
ReadTextResult::UpToEof(&buf[start..])
}

#[inline]
$($async)? fn read_bytes_until $(<$lf>)? (
&mut self,
byte: u8,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> Result<(&'b [u8], bool)> {
) -> io::Result<(&'b [u8], bool)> {
// search byte must be within the ascii range
debug_assert!(byte.is_ascii());

let mut read = 0;
let mut done = false;
let start = buf.len();
while !done {
let used = {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(Error::Io(e.into()));
}
};

match memchr::memchr(byte, available) {
Some(i) => {
buf.extend_from_slice(&available[..i]);
done = true;
i + 1
}
None => {
buf.extend_from_slice(available);
available.len()
}
loop {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(e);
}
};
self $(.$reader)? .consume(used);
read += used;

match memchr::memchr(byte, available) {
Some(i) => {
buf.extend_from_slice(&available[..i]);

let used = i + 1;
self $(.$reader)? .consume(used);
read += used;

*position += read;
return Ok((&buf[start..], true));
}
None => {
buf.extend_from_slice(available);

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
}
}
}
*position += read;

Ok((&buf[start..], done))
*position += read;
Ok((&buf[start..], false))
}

#[inline]
Expand Down Expand Up @@ -188,7 +240,7 @@ macro_rules! impl_buffered_source {
}

#[inline]
$($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
$($async)? fn skip_whitespace(&mut self, position: &mut usize) -> io::Result<()> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => {
Expand All @@ -202,32 +254,18 @@ macro_rules! impl_buffered_source {
}
}
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
Err(e) => Err(e),
};
}
}

#[inline]
$($async)? fn skip_one(&mut self, byte: u8) -> Result<bool> {
// search byte must be within the ascii range
debug_assert!(byte.is_ascii());

match self.peek_one() $(.$await)? ? {
Some(b) if b == byte => {
self $(.$reader)? .consume(1);
Ok(true)
}
_ => Ok(false),
}
}

#[inline]
$($async)? fn peek_one(&mut self) -> Result<Option<u8>> {
$($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => Ok(n.first().cloned()),
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
Err(e) => Err(e),
};
}
}
Expand Down
Loading