From ba66c8e56acf50c7c5e43ef4bd5f55e80d87ae3d Mon Sep 17 00:00:00 2001 From: Juniper Tyree <50025784+juntyr@users.noreply.github.com> Date: Fri, 1 Sep 2023 22:13:39 +0300 Subject: [PATCH] Rusty byte strings in RON, deprecate base64 (byte) strings (#438) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Switch from base64 to rusty byte strings, deprecate base64 support * Add the Value::Bytes variant * Extend Value tests for Value::String and Value::Bytes * Include byte strings in the RON grammar * Fix ASCII escape decoding for strings and byte strings * Fix byte string error display for #462 test * Fix byte string error test * Add a CHANGELOG entry * Added a deprecation error test for v0.10 * Add tests for v0.9 optional base64 byte string support Co-authored-by: Sebastian Dröge * Add an example for using base64-encoded bytes with ron * Fix formatting in README * Remove outdated extension docs * Add tests for unescaped and raw byte strings * Fix fuzzer-found issue with serialising invalid UTF-8 byte strings * Fix fuzzer found issue with `br#` being parsed as the identifier `br` * Fix parsing of byte escapes in UTF-8 strings to produce proper Unicode characters * Fix fuzzer-found interaction with unwrap_variant_newtypes * Add support for strongly typed byte literals * Add missing Value serialising tests * Add test to show that #436 is solved with strongly typed base64 user-side types * Add more coverage tests --------- Co-authored-by: Sebastian Dröge --- CHANGELOG.md | 2 + Cargo.toml | 2 + README.md | 1 + docs/grammar.md | 30 +- examples/base64.rs | 146 ++++++++++ src/de/mod.rs | 24 +- src/de/tests.rs | 2 +- src/de/value.rs | 2 +- src/error.rs | 34 ++- src/parse.rs | 364 ++++++++++++++++++++---- src/ser/mod.rs | 46 ++- src/ser/tests.rs | 18 +- src/ser/value.rs | 1 + src/value/mod.rs | 33 +++ tests/407_raw_value.rs | 2 +- tests/436_untagged_bytes.rs | 102 +++++++ tests/438_rusty_byte_strings.rs | 450 
++++++++++++++++++++++++++++++ tests/462_bytes.rs | 2 +- tests/465_ser_backslash_string.rs | 49 ++-- tests/comments.rs | 11 + tests/escape.rs | 8 + tests/value.rs | 56 +++- 22 files changed, 1271 insertions(+), 114 deletions(-) create mode 100644 examples/base64.rs create mode 100644 tests/436_untagged_bytes.rs create mode 100644 tests/438_rusty_byte_strings.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 36fd5fad..a11ba87d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Allow `ron::value::RawValue` to capture any whitespace to the left and right of a ron value ([#487](https://github.com/ron-rs/ron/pull/487)) - Fix serialising reserved identifiers `true`, `false`, `Some`, `None`, `inf`[`f32`|`f64`], and `Nan`[`f32`|`f64`] ([#487](https://github.com/ron-rs/ron/pull/487)) - Disallow unclosed line comments at the end of `ron::value::RawValue` ([#489](https://github.com/ron-rs/ron/pull/489)) +- **Format-Breaking:** Switch from base64-encoded to Rusty byte strings, still allow base64 deserialising for now ([#438](https://github.com/ron-rs/ron/pull/438)) +- Add support for byte literals as strongly typed unsigned 8-bit integers ([#438](https://github.com/ron-rs/ron/pull/438)) ## [0.8.1] - 2023-08-17 diff --git a/Cargo.toml b/Cargo.toml index 1a338a8f..ad9625cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ default = [] integer128 = [] [dependencies] +# FIXME @juntyr remove base64 once old byte strings are fully deprecated base64 = "0.21" bitflags = { version = "2.0", features = ["serde"] } indexmap = { version = "2.0", features = ["serde"], optional = true } @@ -37,3 +38,4 @@ serde_bytes = "0.11" serde_json = "1.0" option_set = "0.2" typetag = "0.2" +bytes = { version = "1.3", features = ["serde"] } diff --git a/README.md b/README.md index 29b11366..a9222492 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,7 @@ While data structures with any of these 
attributes should roundtrip through RON, * Numbers: `42`, `3.14`, `0xFF`, `0b0110` * Strings: `"Hello"`, `"with\\escapes\n"`, `r#"raw string, great for regex\."#` +* Byte Strings: `b"Hello"`, `b"with \x65\x73\x63\x61\x70\x65\x73\n"`, `br#"raw, too"#` * Booleans: `true`, `false` * Chars: `'e'`, `'\n'` * Optionals: `Some("string")`, `Some(Some(1.34))`, `None` diff --git a/docs/grammar.md b/docs/grammar.md index 0e4741ef..6065edb4 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -40,7 +40,7 @@ For the extension names see the [`extensions.md`][exts] document. ## Value ```ebnf -value = integer | float | string | char | bool | option | list | map | tuple | struct | enum_variant; +value = integer | byte | float | string | byte_string | char | bool | option | list | map | tuple | struct | enum_variant; ``` ## Numbers @@ -60,6 +60,8 @@ unsigned_octal = "0o", digit_octal, { digit_octal | "_" }; unsigned_hexadecimal = "0x", digit_hexadecimal, { digit_hexadecimal | "_" }; unsigned_decimal = digit, { digit | "_" }; +byte = "b'", (ascii | ("\\", (escape_ascii | escape_byte))), "'"; + float = ["+" | "-"], ("inf" | "NaN" | float_num), [float_suffix]; float_num = (float_int | float_std | float_frac), [float_exp]; float_int = digit, { digit | "_" }; @@ -74,9 +76,13 @@ float_suffix = "f", ("32", "64"); ```ebnf string = string_std | string_raw; string_std = "\"", { no_double_quotation_marks | string_escape }, "\""; -string_escape = "\\", ("\"" | "\\" | "b" | "f" | "n" | "r" | "t" | ("u", unicode_hex)); -string_raw = "r" string_raw_content; +string_escape = "\\", (escape_ascii | escape_byte | escape_unicode); +string_raw = "r", string_raw_content; string_raw_content = ("#", string_raw_content, "#") | "\"", { unicode_non_greedy }, "\""; + +escape_ascii = "'" | "\"" | "\\" | "n" | "r" | "t" | "0"; +escape_byte = "x", digit_hexadecimal, digit_hexadecimal; +escape_unicode = "u", digit_hexadecimal, [digit_hexadecimal, [digit_hexadecimal, [digit_hexadecimal, [digit_hexadecimal,
[digit_hexadecimal]]]]]; ``` > Note: Raw strings start with an `r`, followed by n `#`s and a quotation mark @@ -93,6 +99,24 @@ Also see [the Rust document] about context-sensitivity of raw strings. [the Rust document]: https://github.com/rust-lang/rust/blob/d046ffddc4bd50e04ffc3ff9f766e2ac71f74d50/src/grammar/raw-string-literal-ambiguity.md +## Byte String + +```ebnf +byte_string = byte_string_std | byte_string_raw; +byte_string_std = "b\"", { no_double_quotation_marks | string_escape }, "\""; +byte_string_raw = "br", string_raw_content; +``` + +> Note: Byte strings are similar to normal strings but are not required to + contain only valid UTF-8 text. RON's byte strings follow the updated Rust + byte string literal rules as proposed in [RFC #3349], i.e. byte strings + allow the exact same characters and escape codes as normal strings. + +[RFC #3349]: https://github.com/rust-lang/rfcs/pull/3349 + +> Note: Raw byte strings start with a `br` prefix and follow the same rules + as raw strings, which are outlined above. + ## Char ```ebnf diff --git a/examples/base64.rs b/examples/base64.rs new file mode 100644 index 00000000..9349d17f --- /dev/null +++ b/examples/base64.rs @@ -0,0 +1,146 @@ +//! ron initially encoded byte-slices and byte-bufs as base64-encoded strings. +//! However, since v0.9, ron now uses Rusty byte string literals instead. +//! +//! This example shows how the previous behaviour can be restored by serialising +//! bytes with strongly-typed base64-encoded strings, or accepting both Rusty +//! byte strings and the legacy base64-encoded string syntax.
+ +use base64::engine::{general_purpose::STANDARD as BASE64, Engine}; +use serde::{de::Visitor, Deserialize, Deserializer, Serialize, Serializer}; + +#[derive(Debug, PartialEq, Serialize, Deserialize)] +struct Config { + #[serde(with = "ByteStr")] + bytes: Vec, + #[serde(with = "Base64")] + base64: Vec, + #[serde(with = "ByteStrOrBase64")] + bytes_or_base64: Vec, +} + +enum ByteStr {} + +impl ByteStr { + fn serialize(data: &[u8], serializer: S) -> Result { + serializer.serialize_bytes(data) + } + + fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result, D::Error> { + struct ByteStrVisitor; + + impl<'de> Visitor<'de> for ByteStrVisitor { + type Value = Vec; + + fn expecting(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.write_str("a Rusty byte string") + } + + fn visit_bytes(self, bytes: &[u8]) -> Result { + Ok(bytes.to_vec()) + } + + fn visit_byte_buf(self, bytes: Vec) -> Result { + Ok(bytes) + } + } + + deserializer.deserialize_byte_buf(ByteStrVisitor) + } +} + +enum Base64 {} + +impl Base64 { + fn serialize(data: &[u8], serializer: S) -> Result { + serializer.serialize_str(&BASE64.encode(data)) + } + + fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result, D::Error> { + let base64_str = <&str>::deserialize(deserializer)?; + BASE64.decode(base64_str).map_err(serde::de::Error::custom) + } +} + +enum ByteStrOrBase64 {} + +impl ByteStrOrBase64 { + fn serialize(data: &[u8], serializer: S) -> Result { + if cfg!(all()) { + // either of these would work + serializer.serialize_str(&BASE64.encode(data)) + } else { + serializer.serialize_bytes(data) + } + } + + fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result, D::Error> { + struct ByteStrOrBase64Visitor; + + impl<'de> Visitor<'de> for ByteStrOrBase64Visitor { + type Value = Vec; + + fn expecting(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.write_str("a Rusty byte string or a base64-encoded string") + } + + fn visit_str(self, base64_str: 
&str) -> Result { + BASE64.decode(base64_str).map_err(serde::de::Error::custom) + } + + fn visit_bytes(self, bytes: &[u8]) -> Result { + Ok(bytes.to_vec()) + } + + fn visit_byte_buf(self, bytes: Vec) -> Result { + Ok(bytes) + } + } + + deserializer.deserialize_any(ByteStrOrBase64Visitor) + } +} + +fn main() { + let ron = r#"Config( + bytes: b"only byte strings are allowed", + base64: "b25seSBiYXNlNjQtZW5jb2RlZCBzdHJpbmdzIGFyZSBhbGxvd2Vk", + bytes_or_base64: b"both byte strings and base64-encoded strings work", + )"#; + + assert_eq!( + ron::from_str::(ron).unwrap(), + Config { + bytes: b"only byte strings are allowed".to_vec(), + base64: b"only base64-encoded strings are allowed".to_vec(), + bytes_or_base64: b"both byte strings and base64-encoded strings work".to_vec() + } + ); + + let ron = r#"Config( + bytes: b"only byte strings are allowed", + base64: "b25seSBiYXNlNjQtZW5jb2RlZCBzdHJpbmdzIGFyZSBhbGxvd2Vk", + bytes_or_base64: "Ym90aCBieXRlIHN0cmluZ3MgYW5kIGJhc2U2NC1lbmNvZGVkIHN0cmluZ3Mgd29yaw==", + )"#; + + assert_eq!( + ron::from_str::(ron).unwrap(), + Config { + bytes: b"only byte strings are allowed".to_vec(), + base64: b"only base64-encoded strings are allowed".to_vec(), + bytes_or_base64: b"both byte strings and base64-encoded strings work".to_vec() + } + ); + + println!( + "{}", + ron::ser::to_string_pretty( + &Config { + bytes: b"only byte strings are allowed".to_vec(), + base64: b"only base64-encoded strings are allowed".to_vec(), + bytes_or_base64: b"both byte strings and base64-encoded strings work".to_vec() + }, + ron::ser::PrettyConfig::default().struct_names(true) + ) + .unwrap() + ); +} diff --git a/src/de/mod.rs b/src/de/mod.rs index 756ab4e9..8ac0e83c 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -5,7 +5,6 @@ use std::{ str, }; -use base64::Engine; use serde::{ de::{self, DeserializeSeed, Deserializer as _, Visitor}, Deserialize, @@ -17,7 +16,7 @@ use crate::{ error::{Result, SpannedResult}, extensions::Extensions, options::Options, - 
parse::{Bytes, NewtypeMode, ParsedStr, StructType, TupleMode, BASE64_ENGINE}, + parse::{Bytes, NewtypeMode, ParsedByteStr, ParsedStr, StructType, TupleMode}, }; mod id; @@ -322,8 +321,12 @@ impl<'de, 'a> de::Deserializer<'de> for &'a mut Deserializer<'de> { b'{' => self.deserialize_map(visitor), b'0'..=b'9' | b'+' | b'-' | b'.' => self.bytes.any_number()?.visit(visitor), b'"' | b'r' => self.deserialize_string(visitor), + b'b' if matches!(self.bytes.bytes().get(1), Some(b'\'')) => { + self.bytes.any_number()?.visit(visitor) + } + b'b' => self.deserialize_byte_buf(visitor), b'\'' => self.deserialize_char(visitor), - other => Err(Error::UnexpectedByte(other as char)), + other => Err(Error::UnexpectedByte(other)), } } @@ -460,18 +463,9 @@ impl<'de, 'a> de::Deserializer<'de> for &'a mut Deserializer<'de> { return visitor.visit_byte_buf(bytes); } - let res = { - let string = self.bytes.string()?; - let base64_str = match string { - ParsedStr::Allocated(ref s) => s.as_str(), - ParsedStr::Slice(s) => s, - }; - BASE64_ENGINE.decode(base64_str) - }; - - match res { - Ok(byte_buf) => visitor.visit_byte_buf(byte_buf), - Err(err) => Err(Error::Base64Error(err)), + match self.bytes.byte_string()? 
{ + ParsedByteStr::Allocated(byte_buf) => visitor.visit_byte_buf(byte_buf), + ParsedByteStr::Slice(bytes) => visitor.visit_borrowed_bytes(bytes), } } diff --git a/src/de/tests.rs b/src/de/tests.rs index d61405a1..719402e7 100644 --- a/src/de/tests.rs +++ b/src/de/tests.rs @@ -334,7 +334,7 @@ fn test_byte_stream() { small: vec![1, 2], large: vec![1, 2, 3, 4] }), - from_str("BytesStruct( small:[1, 2], large:\"AQIDBA==\" )"), + from_str("BytesStruct( small:[1, 2], large:b\"\\x01\\x02\\x03\\x04\" )"), ); } diff --git a/src/de/value.rs b/src/de/value.rs index 8be45b69..6e467970 100644 --- a/src/de/value.rs +++ b/src/de/value.rs @@ -167,7 +167,7 @@ impl<'de> Visitor<'de> for ValueVisitor { where E: Error, { - self.visit_string(String::from_utf8(v).map_err(|e| Error::custom(format!("{}", e)))?) + Ok(Value::Bytes(v)) } fn visit_none(self) -> Result diff --git a/src/error.rs b/src/error.rs index 9555dce5..6c7cd688 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,8 +1,8 @@ -use std::{error::Error as StdError, fmt, io, str::Utf8Error, string::FromUtf8Error}; +use std::{error::Error as StdError, fmt, io, str::Utf8Error}; use serde::{de, ser}; -use crate::parse::{is_ident_first_char, is_ident_other_char, is_ident_raw_char, BASE64_ENGINE}; +use crate::parse::{is_ident_first_char, is_ident_other_char, is_ident_raw_char}; /// This type represents all possible errors that can occur when /// serializing or deserializing RON data. 
@@ -20,6 +20,10 @@ pub type SpannedResult = std::result::Result; pub enum Error { Io(String), Message(String), + #[deprecated( + since = "0.9.0", + note = "ambiguous base64 byte strings are replaced by strongly typed Rusty b\"byte strings\"" + )] Base64Error(base64::DecodeError), Eof, ExpectedArray, @@ -29,6 +33,7 @@ pub enum Error { ExpectedBoolean, ExpectedComma, ExpectedChar, + ExpectedByteLiteral, ExpectedFloat, FloatUnderscore, ExpectedInteger, @@ -46,6 +51,7 @@ pub enum Error { ExpectedStructLikeEnd, ExpectedUnit, ExpectedString, + ExpectedByteString, ExpectedStringEnd, ExpectedIdentifier, @@ -62,7 +68,7 @@ pub enum Error { UnclosedBlockComment, UnclosedLineComment, UnderscoreAtBeginning, - UnexpectedByte(char), + UnexpectedByte(u8), Utf8Error(Utf8Error), TrailingCharacters, @@ -114,6 +120,7 @@ impl fmt::Display for Error { match *self { Error::Io(ref s) => f.write_str(s), Error::Message(ref s) => f.write_str(s), + #[allow(deprecated)] Error::Base64Error(ref e) => fmt::Display::fmt(e, f), Error::Eof => f.write_str("Unexpected end of RON"), Error::ExpectedArray => f.write_str("Expected opening `[`"), @@ -125,6 +132,7 @@ impl fmt::Display for Error { Error::ExpectedBoolean => f.write_str("Expected boolean"), Error::ExpectedComma => f.write_str("Expected comma"), Error::ExpectedChar => f.write_str("Expected char"), + Error::ExpectedByteLiteral => f.write_str("Expected byte literal"), Error::ExpectedFloat => f.write_str("Expected float"), Error::FloatUnderscore => f.write_str("Unexpected underscore in float"), Error::ExpectedInteger => f.write_str("Expected integer"), @@ -153,6 +161,7 @@ impl fmt::Display for Error { Error::ExpectedStructLikeEnd => f.write_str("Expected closing `)`"), Error::ExpectedUnit => f.write_str("Expected unit"), Error::ExpectedString => f.write_str("Expected string"), + Error::ExpectedByteString => f.write_str("Expected byte string"), Error::ExpectedStringEnd => f.write_str("Expected end of string"), Error::ExpectedIdentifier => 
f.write_str("Expected identifier"), Error::InvalidEscape(s) => f.write_str(s), @@ -172,7 +181,11 @@ impl fmt::Display for Error { Error::UnderscoreAtBeginning => { f.write_str("Unexpected leading underscore in a number") } - Error::UnexpectedByte(ref byte) => write!(f, "Unexpected byte {:?}", byte), + Error::UnexpectedByte(byte) => { + let escaped_byte = std::ascii::escape_default(byte) + .map(char::from).collect::(); + write!(f, "Unexpected byte '{}'", escaped_byte) + }, Error::TrailingCharacters => f.write_str("Non-whitespace trailing characters"), Error::InvalidValueForType { ref expected, @@ -314,8 +327,11 @@ impl de::Error for Error { Float(n) => write!(f, "the floating point number `{}`", n), Char(c) => write!(f, "the UTF-8 character `{}`", c), Str(s) => write!(f, "the string {:?}", s), - Bytes(b) => write!(f, "the bytes \"{}\"", { - base64::display::Base64Display::new(b, &BASE64_ENGINE) + Bytes(b) => write!(f, "the byte string b\"{}\"", { + b.iter() + .flat_map(|c| std::ascii::escape_default(*c)) + .map(char::from) + .collect::() }), Unit => write!(f, "a unit value"), Option => write!(f, "an optional value"), @@ -384,12 +400,6 @@ impl From for Error { } } -impl From for Error { - fn from(e: FromUtf8Error) -> Self { - Error::Utf8Error(e.utf8_error()) - } -} - impl From for Error { fn from(e: io::Error) -> Self { Error::Io(e.to_string()) diff --git a/src/parse.rs b/src/parse.rs index 4cb606a5..e5fd6b33 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -2,19 +2,15 @@ use std::{ char::from_u32 as char_from_u32, - str::{from_utf8, from_utf8_unchecked, FromStr}, + str::{from_utf8, from_utf8_unchecked, FromStr, Utf8Error}, }; -use base64::engine::general_purpose::{GeneralPurpose, STANDARD}; - use crate::{ error::{Error, Position, Result, SpannedError, SpannedResult}, extensions::Extensions, value::Number, }; -pub const BASE64_ENGINE: GeneralPurpose = STANDARD; - // We have the following char categories. 
const INT_CHAR: u8 = 1 << 0; // [0-9A-Fa-f_] const FLOAT_CHAR: u8 = 1 << 1; // [0-9\.Ee+-_] @@ -214,6 +210,7 @@ impl<'a> Bytes<'a> { if digit >= base { let _ = bytes.advance(i); + // we know that the byte is an ASCII character here return Err(Error::InvalidIntegerDigit { digit: char::from(byte), base, @@ -250,6 +247,32 @@ impl<'a> Bytes<'a> { let _ = self.advance_single(); true } + b'b' if self.consume("b'") => { + // Parse a byte literal + let byte = match self.eat_byte()? { + b'\\' => match self.parse_escape(EscapeEncoding::Binary, true)? { + // we know that this byte is an ASCII character + EscapeCharacter::Ascii(b) => b, + EscapeCharacter::Utf8(_) => { + return Err(Error::InvalidEscape( + "Unexpected Unicode escape in byte literal", + )) + } + }, + b => b, + }; + + if !self.consume("'") { + return Err(Error::ExpectedByteLiteral); + } + + // Safety: The byte contains the ASCII-only byte literal + let bytes_ron = unsafe { + from_utf8_unchecked(&bytes_backup[..bytes_backup.len() - self.bytes.len()]) + }; + + return T::try_from_parsed_integer(ParsedInteger::U8(byte), bytes_ron); + } _ => false, }; let sign = if is_negative { -1 } else { 1 }; @@ -437,7 +460,11 @@ impl<'a> Bytes<'a> { let c = if c == b'\\' { let _ = self.advance(1); - self.parse_escape()? + match self.parse_escape(EscapeEncoding::Utf8, true)? 
{ + // we know that this byte is an ASCII character + EscapeCharacter::Ascii(b) => char::from(b), + EscapeCharacter::Utf8(c) => c, + } } else { // Check where the end of the char (') is and try to // interpret the rest as UTF-8 @@ -556,6 +583,10 @@ impl<'a> Bytes<'a> { if bytes_copy.string().is_ok() { *bytes = bytes_copy; } + let mut bytes_copy = *bytes; + if bytes_copy.byte_string().is_ok() { + *bytes = bytes_copy; + } let c = bytes.eat_byte()?; if c == b'(' || c == b'[' || c == b'{' { @@ -768,6 +799,7 @@ impl<'a> Bytes<'a> { _ => (), } + // we know that the byte is an ASCII character here f.push(char::from(*b)); } @@ -827,8 +859,19 @@ impl<'a> Bytes<'a> { return Err(Error::ExpectedIdentifier); } - // If the next two bytes signify the start of a raw string literal, - // return an error. + // If the next two bytes signify the start of a (raw) byte string + // literal, return an error. + if next == b'b' { + match self.bytes.get(1) { + Some(b'"' | b'\'') => return Err(Error::ExpectedIdentifier), + Some(b'r') => match self.bytes.get(2) { + Some(b'#' | b'"') => return Err(Error::ExpectedIdentifier), + Some(_) | None => (), + }, + Some(_) | None => (), + } + }; + let length = if next == b'r' { match self.bytes.get(1) { Some(b'"') => return Err(Error::ExpectedIdentifier), @@ -950,6 +993,90 @@ impl<'a> Bytes<'a> { self.bytes.first().copied().ok_or(Error::Eof) } + pub fn byte_string(&mut self) -> Result> { + fn expected_byte_string_found_base64( + base64_str: &ParsedStr, + byte_str: &ParsedByteStr, + ) -> Error { + let byte_str = match &byte_str { + ParsedByteStr::Allocated(b) => b.as_slice(), + ParsedByteStr::Slice(b) => b, + } + .iter() + .flat_map(|c| std::ascii::escape_default(*c)) + .map(char::from) + .collect::(); + let base64_str = match &base64_str { + ParsedStr::Allocated(s) => s.as_str(), + ParsedStr::Slice(s) => s, + }; + + Error::InvalidValueForType { + expected: format!("the Rusty byte string b\"{}\"", byte_str), + found: format!("the ambiguous base64 string 
{:?}", base64_str), + } + } + + if self.consume("\"") { + let base64_str = self.escaped_string()?; + let base64_result = ParsedByteStr::try_from_base64(base64_str.clone()); + + if cfg!(not(test)) { + // FIXME @juntyr: remove in v0.10 + #[allow(deprecated)] + base64_result.map_err(Error::Base64Error) + } else { + match base64_result { + // FIXME @juntyr: enable in v0.10 + Ok(byte_str) => Err(expected_byte_string_found_base64(&base64_str, &byte_str)), + Err(_) => Err(Error::ExpectedByteString), + } + } + } else if self.consume("r") { + let base64_str = self.raw_string()?; + let base64_result = ParsedByteStr::try_from_base64(base64_str.clone()); + + if cfg!(not(test)) { + // FIXME @juntyr: remove in v0.10 + #[allow(deprecated)] + base64_result.map_err(Error::Base64Error) + } else { + match base64_result { + // FIXME @juntyr: enable in v0.10 + Ok(byte_str) => Err(expected_byte_string_found_base64(&base64_str, &byte_str)), + Err(_) => Err(Error::ExpectedByteString), + } + } + } else if self.consume("b\"") { + self.escaped_byte_string() + } else if self.consume("br") { + self.raw_byte_string() + } else { + Err(Error::ExpectedByteString) + } + } + + fn escaped_byte_string(&mut self) -> Result> { + match self.escaped_byte_buf(EscapeEncoding::Binary) { + Ok((bytes, advance)) => { + let _ = self.advance(advance); + Ok(bytes) + } + Err(err) => Err(err), + } + } + + fn raw_byte_string(&mut self) -> Result> { + match self.raw_byte_buf() { + Ok((bytes, advance)) => { + let _ = self.advance(advance); + Ok(bytes) + } + Err(Error::ExpectedString) => Err(Error::ExpectedByteString), + Err(err) => Err(err), + } + } + pub fn string(&mut self) -> Result> { if self.consume("\"") { self.escaped_string() @@ -961,6 +1088,28 @@ impl<'a> Bytes<'a> { } fn escaped_string(&mut self) -> Result> { + match self.escaped_byte_buf(EscapeEncoding::Utf8) { + Ok((bytes, advance)) => { + let string = ParsedStr::try_from_bytes(bytes).map_err(Error::from)?; + let _ = self.advance(advance); + Ok(string) + } 
+ Err(err) => Err(err), + } + } + + fn raw_string(&mut self) -> Result> { + match self.raw_byte_buf() { + Ok((bytes, advance)) => { + let string = ParsedStr::try_from_bytes(bytes).map_err(Error::from)?; + let _ = self.advance(advance); + Ok(string) + } + Err(err) => Err(err), + } + } + + fn escaped_byte_buf(&mut self, encoding: EscapeEncoding) -> Result<(ParsedByteStr<'a>, usize)> { use std::iter::repeat; let (i, end_or_escape) = self @@ -971,27 +1120,28 @@ impl<'a> Bytes<'a> { .ok_or(Error::ExpectedStringEnd)?; if *end_or_escape == b'"' { - let s = from_utf8(&self.bytes[..i]).map_err(Error::from)?; + let s = &self.bytes[..i]; // Advance by the number of bytes of the string // + 1 for the `"`. - let _ = self.advance(i + 1); - - Ok(ParsedStr::Slice(s)) + Ok((ParsedByteStr::Slice(s), i + 1)) } else { let mut i = i; let mut s: Vec<_> = self.bytes[..i].to_vec(); loop { let _ = self.advance(i + 1); - let character = self.parse_escape()?; - match character.len_utf8() { - 1 => s.push(character as u8), - len => { - let start = s.len(); - s.extend(repeat(0).take(len)); - character.encode_utf8(&mut s[start..]); - } + + match self.parse_escape(encoding, false)? { + EscapeCharacter::Ascii(c) => s.push(c), + EscapeCharacter::Utf8(c) => match c.len_utf8() { + 1 => s.push(c as u8), + len => { + let start = s.len(); + s.extend(repeat(0).take(len)); + c.encode_utf8(&mut s[start..]); + } + }, } let (new_i, end_or_escape) = self @@ -1005,16 +1155,14 @@ impl<'a> Bytes<'a> { s.extend_from_slice(&self.bytes[..i]); if *end_or_escape == b'"' { - let _ = self.advance(i + 1); - - let s = String::from_utf8(s).map_err(Error::from)?; - break Ok(ParsedStr::Allocated(s)); + // Advance to the end of the string + 1 for the `"`. 
+ break Ok((ParsedByteStr::Allocated(s), i + 1)); } } } } - fn raw_string(&mut self) -> Result> { + fn raw_byte_buf(&mut self) -> Result<(ParsedByteStr<'a>, usize)> { let num_hashes = self.bytes.iter().take_while(|&&b| b == b'#').count(); let hashes = &self.bytes[..num_hashes]; let _ = self.advance(num_hashes); @@ -1030,13 +1178,11 @@ impl<'a> Bytes<'a> { .position(|window| window == ending.as_slice()) .ok_or(Error::ExpectedStringEnd)?; - let s = from_utf8(&self.bytes[..i]).map_err(Error::from)?; + let s = &self.bytes[..i]; - // Advance by the number of bytes of the string + // Advance by the number of bytes of the byte string // + `num_hashes` + 1 for the `"`. - let _ = self.advance(i + num_hashes + 1); - - Ok(ParsedStr::Slice(s)) + Ok((ParsedByteStr::Slice(s), i + num_hashes + 1)) } fn test_for(&self, s: &str) -> bool { @@ -1065,16 +1211,57 @@ impl<'a> Bytes<'a> { } } - fn parse_escape(&mut self) -> Result { + fn parse_escape(&mut self, encoding: EscapeEncoding, is_char: bool) -> Result { let c = match self.eat_byte()? { - b'\'' => '\'', - b'"' => '"', - b'\\' => '\\', - b'n' => '\n', - b'r' => '\r', - b't' => '\t', - b'0' => '\0', - b'x' => self.decode_ascii_escape()? 
as char, + b'\'' => EscapeCharacter::Ascii(b'\''), + b'"' => EscapeCharacter::Ascii(b'"'), + b'\\' => EscapeCharacter::Ascii(b'\\'), + b'n' => EscapeCharacter::Ascii(b'\n'), + b'r' => EscapeCharacter::Ascii(b'\r'), + b't' => EscapeCharacter::Ascii(b'\t'), + b'0' => EscapeCharacter::Ascii(b'\0'), + b'x' => { + // Fast exit for ascii escape in byte string + let b: u8 = self.decode_ascii_escape()?; + if let EscapeEncoding::Binary = encoding { + return Ok(EscapeCharacter::Ascii(b)); + } + + // Fast exit for ascii character in UTF-8 string + let mut bytes = [b, 0, 0, 0]; + if let Ok(Some(c)) = from_utf8(&bytes[..=0]).map(|s| s.chars().next()) { + return Ok(EscapeCharacter::Utf8(c)); + } + + if is_char { + // Character literals are not allowed to use multiple byte + // escapes to build a unicode character + return Err(Error::InvalidEscape( + "Not a valid byte-escaped Unicode character", + )); + } + + // UTF-8 character needs up to four bytes and we have already + // consumed one, so at most three to go + for i in 1..4 { + if !self.consume(r"\x") { + return Err(Error::InvalidEscape( + "Not a valid byte-escaped Unicode character", + )); + } + + bytes[i] = self.decode_ascii_escape()?; + + // Check if we now have a valid UTF-8 character + if let Ok(Some(c)) = from_utf8(&bytes[..=i]).map(|s| s.chars().next()) { + return Ok(EscapeCharacter::Utf8(c)); + } + } + + return Err(Error::InvalidEscape( + "Not a valid byte-escaped Unicode character", + )); + } b'u' => { self.expect_byte(b'{', Error::InvalidEscape("Missing { in Unicode escape"))?; @@ -1107,11 +1294,13 @@ impl<'a> Bytes<'a> { b'}', Error::InvalidEscape("No } at the end of Unicode escape"), )?; - char_from_u32(bytes).ok_or(Error::InvalidEscape("Not a valid char"))? 
- } - _ => { - return Err(Error::InvalidEscape("Unknown escape character")); + let c = char_from_u32(bytes).ok_or(Error::InvalidEscape( + "Not a valid Unicode-escaped character", + ))?; + + EscapeCharacter::Utf8(c) } + _ => return Err(Error::InvalidEscape("Unknown escape character")), }; Ok(c) @@ -1159,7 +1348,7 @@ impl<'a> Bytes<'a> { Ok(Some(Comment::Block)) } - b => Err(Error::UnexpectedByte(b as char)), + b => Err(Error::UnexpectedByte(b)), } } else { Ok(None) @@ -1397,12 +1586,6 @@ impl Float for ParsedFloat { } } -#[derive(Clone, Debug)] -pub enum ParsedStr<'a> { - Allocated(String), - Slice(&'a str), -} - pub enum StructType { NewtypeOrTuple, Tuple, @@ -1420,6 +1603,51 @@ pub enum TupleMode { DifferentiateNewtype, } +#[derive(Clone)] +pub enum ParsedStr<'a> { + Allocated(String), + Slice(&'a str), +} + +pub enum ParsedByteStr<'a> { + Allocated(Vec), + Slice(&'a [u8]), +} + +impl<'a> ParsedStr<'a> { + pub fn try_from_bytes(bytes: ParsedByteStr<'a>) -> Result { + match bytes { + ParsedByteStr::Allocated(byte_buf) => Ok(ParsedStr::Allocated( + String::from_utf8(byte_buf).map_err(|e| e.utf8_error())?, + )), + ParsedByteStr::Slice(bytes) => Ok(ParsedStr::Slice(from_utf8(bytes)?)), + } + } +} + +impl<'a> ParsedByteStr<'a> { + pub fn try_from_base64(str: ParsedStr<'a>) -> Result { + let base64_str = match &str { + ParsedStr::Allocated(string) => string.as_str(), + ParsedStr::Slice(str) => str, + }; + + base64::engine::Engine::decode(&base64::engine::general_purpose::STANDARD, base64_str) + .map(ParsedByteStr::Allocated) + } +} + +#[derive(Copy, Clone)] // GRCOV_EXCL_LINE +enum EscapeEncoding { + Binary, + Utf8, +} + +enum EscapeCharacter { + Ascii(u8), + Utf8(char), +} + #[cfg(test)] mod tests { use super::*; @@ -1454,4 +1682,40 @@ mod tests { assert_eq!(bytes.bytes(), b"24 "); assert_eq!(bytes.pre_ws_bytes(), b" /*bye*/ 24 "); } + + #[test] + fn v0_10_base64_deprecation_error() { + let err = crate::from_str::("\"SGVsbG8gcm9uIQ==\"").unwrap_err(); + + assert_eq!( + 
err, + SpannedError { + code: Error::InvalidValueForType { + expected: String::from("the Rusty byte string b\"Hello ron!\""), + found: String::from("the ambiguous base64 string \"SGVsbG8gcm9uIQ==\"") + }, + position: Position { line: 1, col: 19 }, + } + ); + + let err = crate::from_str::("r\"SGVsbG8gcm9uIQ==\"").unwrap_err(); + + assert_eq!(format!("{}", err.code), "Expected the Rusty byte string b\"Hello ron!\" but found the ambiguous base64 string \"SGVsbG8gcm9uIQ==\" instead"); + + assert_eq!( + crate::from_str::("\"invalid=\"").unwrap_err(), + SpannedError { + code: Error::ExpectedByteString, + position: Position { line: 1, col: 11 }, + } + ); + + assert_eq!( + crate::from_str::("r\"invalid=\"").unwrap_err(), + SpannedError { + code: Error::ExpectedByteString, + position: Position { line: 1, col: 12 }, + } + ); + } } diff --git a/src/ser/mod.rs b/src/ser/mod.rs index babb2d1e..4fbe60db 100644 --- a/src/ser/mod.rs +++ b/src/ser/mod.rs @@ -1,6 +1,5 @@ use std::io; -use base64::Engine; use serde::{ser, ser::Serialize}; use serde_derive::{Deserialize, Serialize}; @@ -8,10 +7,7 @@ use crate::{ error::{Error, Result}, extensions::Extensions, options::Options, - parse::{ - is_ident_first_char, is_ident_other_char, is_ident_raw_char, LargeSInt, LargeUInt, - BASE64_ENGINE, - }, + parse::{is_ident_first_char, is_ident_other_char, is_ident_raw_char, LargeSInt, LargeUInt}, }; mod raw; @@ -540,6 +536,37 @@ impl Serializer { Ok(()) } + fn serialize_escaped_byte_str(&mut self, value: &[u8]) -> io::Result<()> { + self.output.write_all(b"b\"")?; + for c in value.iter().flat_map(|c| std::ascii::escape_default(*c)) { + self.output.write_all(&[c])?; + } + self.output.write_all(b"\"")?; + Ok(()) + } + + fn serialize_unescaped_or_raw_byte_str(&mut self, value: &[u8]) -> io::Result<()> { + if value.contains(&b'"') || value.contains(&b'\\') { + let (_, num_consecutive_hashes) = + value.iter().fold((0, 0), |(count, max), c| match c { + b'#' => (count + 1, max.max(count + 1)), + _ => 
(0_usize, max), + }); + let hashes = vec![b'#'; num_consecutive_hashes + 1]; + self.output.write_all(b"br")?; + self.output.write_all(&hashes)?; + self.output.write_all(b"\"")?; + self.output.write_all(value)?; + self.output.write_all(b"\"")?; + self.output.write_all(&hashes)?; + } else { + self.output.write_all(b"b\"")?; + self.output.write_all(value)?; + self.output.write_all(b"\"")?; + } + Ok(()) + } + fn serialize_sint(&mut self, value: impl Into, suffix: &str) -> Result<()> { // TODO optimize write!(self.output, "{}", value.into())?; @@ -729,7 +756,14 @@ impl<'a, W: io::Write> ser::Serializer for &'a mut Serializer { } fn serialize_bytes(self, v: &[u8]) -> Result<()> { - self.serialize_str(BASE64_ENGINE.encode(v).as_str()) + // We need to fall back to escaping if the byte string would be invalid UTF-8 + if self.escape_strings() || std::str::from_utf8(v).is_err() { + self.serialize_escaped_byte_str(v)?; + } else { + self.serialize_unescaped_or_raw_byte_str(v)?; + } + + Ok(()) } fn serialize_none(self) -> Result<()> { diff --git a/src/ser/tests.rs b/src/ser/tests.rs index 37b53ebf..e9acaec1 100644 --- a/src/ser/tests.rs +++ b/src/ser/tests.rs @@ -123,13 +123,27 @@ fn test_byte_stream() { "(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)" ); + let large = vec![0x01, 0x02, 0x03, 0x04]; + let large = serde_bytes::Bytes::new(&large); + assert_eq!(to_string(&large).unwrap(), "b\"\\x01\\x02\\x03\\x04\""); + + let large = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06]; + let large = serde_bytes::Bytes::new(&large); + assert_eq!( + to_string(&large).unwrap(), + "b\"\\x01\\x02\\x03\\x04\\x05\\x06\"" + ); + let large = vec![255u8; 64]; let large = serde_bytes::Bytes::new(&large); assert_eq!( to_string(&large).unwrap(), concat!( - "\"/////////////////////////////////////////", - "////////////////////////////////////////////w==\"" + "b\"\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff", + "\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff", + 
"\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff", + "\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff", + "\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\"" ) ); } diff --git a/src/ser/value.rs b/src/ser/value.rs index e39ae941..87fdbce8 100644 --- a/src/ser/value.rs +++ b/src/ser/value.rs @@ -15,6 +15,7 @@ impl Serialize for Value { Value::Option(Some(ref o)) => serializer.serialize_some(o.as_ref()), Value::Option(None) => serializer.serialize_none(), Value::String(ref s) => serializer.serialize_str(s), + Value::Bytes(ref b) => serializer.serialize_bytes(b), Value::Seq(ref s) => Serialize::serialize(s, serializer), Value::Unit => serializer.serialize_unit(), } diff --git a/src/value/mod.rs b/src/value/mod.rs index c0b5d2e3..85188c40 100644 --- a/src/value/mod.rs +++ b/src/value/mod.rs @@ -25,6 +25,7 @@ pub enum Value { Number(Number), Option(Option>), String(String), + Bytes(Vec), Seq(Vec), Unit, } @@ -86,6 +87,7 @@ impl<'de> Deserializer<'de> for Value { Value::Option(Some(o)) => visitor.visit_some(*o), Value::Option(None) => visitor.visit_none(), Value::String(s) => visitor.visit_string(s), + Value::Bytes(b) => visitor.visit_byte_buf(b), Value::Seq(mut seq) => { let old_len = seq.len(); @@ -186,6 +188,19 @@ mod tests { assert_eq!(direct, value, "Deserialization for {:?} is not the same", s); } + fn assert_same_bytes<'de, T>(s: &'de [u8]) + where + T: Debug + Deserialize<'de> + PartialEq, + { + use crate::de::from_bytes; + + let direct: T = from_bytes(s).unwrap(); + let value: Value = from_bytes(s).unwrap(); + let value = T::deserialize(value).unwrap(); + + assert_eq!(direct, value, "Deserialization for {:?} is not the same", s); + } + #[test] fn boolean() { assert_same::("true"); @@ -210,6 +225,24 @@ mod tests { assert_same::("'c'"); } + #[test] + fn string() { + assert_same::(r#""hello world""#); + assert_same::(r#""this is a Rusty 🦀 string""#); + assert_same::(r#""this is now valid UTF-8 
\xf0\x9f\xa6\x80""#); + } + + #[test] + fn bytes() { + assert_same_bytes::(br#"b"hello world""#); + assert_same_bytes::( + b"b\"this is not valid UTF-8 \xf8\xa1\xa1\xa1\xa1\"", + ); + assert_same_bytes::( + br#"b"this is not valid UTF-8 \xf8\xa1\xa1\xa1\xa1""#, + ); + } + #[test] fn map() { assert_same::>( diff --git a/tests/407_raw_value.rs b/tests/407_raw_value.rs index efa50d8e..6e91c621 100644 --- a/tests/407_raw_value.rs +++ b/tests/407_raw_value.rs @@ -55,7 +55,7 @@ fn test_raw_value_invalid() { assert_eq!( err, SpannedError { - code: Error::UnexpectedByte('\0'), + code: Error::UnexpectedByte(b'\0'), position: Position { line: 1, col: 1 } } ) diff --git a/tests/436_untagged_bytes.rs b/tests/436_untagged_bytes.rs new file mode 100644 index 00000000..c4d6bdda --- /dev/null +++ b/tests/436_untagged_bytes.rs @@ -0,0 +1,102 @@ +#[test] +fn test_serde_bytes() { + #[derive(Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + #[serde(rename = "b")] + struct BytesVal { + pub b: serde_bytes::ByteBuf, + } + + #[derive(Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + #[serde(untagged)] + enum Bad { + Bytes(BytesVal), + } + + let s = ron::to_string(&serde_bytes::Bytes::new(b"test")).unwrap(); + + assert_eq!(s, r#"b"test""#); + + let v: Bad = ron::from_str(r#"(b: b"test")"#).unwrap(); + + assert_eq!( + format!("{:?}", v), + "Bytes(BytesVal { b: [116, 101, 115, 116] })" + ); + + let s = ron::to_string(&v).unwrap(); + + assert_eq!(s, r#"(b:b"test")"#); +} + +#[test] +fn test_bytes() { + #[derive(Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + #[serde(rename = "b")] + struct BytesVal { + pub b: bytes::Bytes, + } + + #[derive(Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + #[serde(untagged)] + enum Bad { + Bytes(BytesVal), + } + + let s = ron::to_string(&bytes::Bytes::from("test")).unwrap(); + + assert_eq!(s, r#"b"test""#); + + let v: Bad = ron::from_str(r#"(b: b"test")"#).unwrap(); + + assert_eq!(format!("{:?}", v), 
r#"Bytes(BytesVal { b: b"test" })"#); + + let s = ron::to_string(&v).unwrap(); + + assert_eq!(s, r#"(b:b"test")"#); +} + +#[test] +fn test_strongly_typed_base64() { + use base64::engine::{general_purpose::STANDARD as BASE64, Engine}; + + enum Base64 {} + + impl Base64 { + fn serialize(data: &[u8], serializer: S) -> Result { + serializer.serialize_str(&BASE64.encode(data)) + } + + fn deserialize<'de, D: serde::Deserializer<'de>>( + deserializer: D, + ) -> Result, D::Error> { + let base64_str: &str = serde::Deserialize::deserialize(deserializer)?; + BASE64.decode(base64_str).map_err(serde::de::Error::custom) + } + } + + #[derive(Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + #[serde(rename = "b")] + struct BytesVal { + #[serde(with = "Base64")] + pub b: Vec, + } + + #[derive(Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + #[serde(untagged)] + enum Bad { + Bytes(BytesVal), + } + + let v: Bad = ron::from_str(r#"(b: "dGVzdA==")"#).unwrap(); + + assert_eq!( + v, + Bad::Bytes(BytesVal { + b: b"test".to_vec() + }) + ); + + let s = ron::to_string(&v).unwrap(); + + assert_eq!(s, r#"(b:"dGVzdA==")"#); +} diff --git a/tests/438_rusty_byte_strings.rs b/tests/438_rusty_byte_strings.rs new file mode 100644 index 00000000..5dd26dae --- /dev/null +++ b/tests/438_rusty_byte_strings.rs @@ -0,0 +1,450 @@ +use ron::{ + error::{Position, SpannedError}, + Error, +}; +use serde::Deserialize; + +#[derive(Debug, Deserialize, PartialEq)] +struct BytesStruct { + small: Vec, + #[serde(with = "serde_bytes")] + large: Vec, +} + +#[test] +fn v_9_deprecated_base64_bytes_support() { + #![allow(deprecated)] + + // Requires padding of the base64 data + assert_eq!( + Ok(BytesStruct { + small: vec![1, 2], + large: vec![1, 2, 3, 4] + }), + ron::from_str("BytesStruct( small:[1, 2], large:\"AQIDBA==\" )"), + ); + + // Requires no padding of the base64 data + assert_eq!( + Ok(BytesStruct { + small: vec![1, 2], + large: vec![1, 2, 3, 4, 5, 6] + }), + 
ron::from_str("BytesStruct( small:[1, 2], large:r\"AQIDBAUG\" )"), + ); + + // Parsing an escaped byte string is also possible + assert_eq!( + Ok(BytesStruct { + small: vec![1, 2], + large: vec![1, 2, 3, 4, 5, 6] + }), + ron::from_str("BytesStruct( small:[1, 2], large:\"\\x41Q\\x49DBA\\x55G\" )"), + ); + + // Invalid base64 + assert_eq!( + Err(SpannedError { + code: Error::Base64Error(base64::DecodeError::InvalidByte(0, b'_')), + position: Position { line: 1, col: 40 } + }), + ron::from_str::("BytesStruct( small:[1, 2], large:\"_+!!\" )"), + ); + + // Invalid last base64 symbol + assert_eq!( + Err(SpannedError { + code: Error::Base64Error(base64::DecodeError::InvalidLastSymbol(1, b'x')), + position: Position { line: 1, col: 40 } + }), + ron::from_str::("BytesStruct( small:[1, 2], large:\"/x==\" )"), + ); + + // Missing padding + assert_eq!( + Err(SpannedError { + code: Error::Base64Error(base64::DecodeError::InvalidPadding), + position: Position { line: 1, col: 42 } + }), + ron::from_str::("BytesStruct( small:[1, 2], large:\"AQIDBA\" )"), + ); + + // Too much padding + assert_eq!( + Err(SpannedError { + code: Error::Base64Error(base64::DecodeError::InvalidLength), + position: Position { line: 1, col: 45 } + }), + ron::from_str::("BytesStruct( small:[1, 2], large:\"AQIDBA===\" )"), + ); +} + +#[test] +fn rusty_byte_string() { + assert_eq!( + Ok(BytesStruct { + small: vec![1, 2], + large: vec![1, 2, 0, 4] + }), + ron::from_str("BytesStruct( small:[1, 2], large: b\"\\x01\\u{2}\\0\\x04\" )"), + ); + + assert_eq!( + ron::from_str::("\"Hello \\x01 \\u{2}!\"").unwrap(), + "Hello \x01 \u{2}!", + ); + assert_eq!( + &*ron::from_str::("b\"Hello \\x01 \\u{2}!\"").unwrap(), + b"Hello \x01 \x02!", + ); + + rusty_byte_string_roundtrip(b"hello", "b\"hello\"", "b\"hello\""); + rusty_byte_string_roundtrip(b"\"", "b\"\\\"\"", "br#\"\"\"#"); + rusty_byte_string_roundtrip(b"\"#", "b\"\\\"#\"", "br##\"\"#\"##"); + rusty_byte_string_roundtrip(b"\n", "b\"\\n\"", "b\"\n\""); + + 
assert_eq!( + ron::from_str::("b\"\\xf\"").unwrap_err(), + SpannedError { + code: Error::InvalidEscape("Non-hex digit found"), + position: Position { line: 1, col: 7 }, + }, + ); + let err = ron::from_str::("br#q\"").unwrap_err(); + assert_eq!( + err, + SpannedError { + code: Error::ExpectedByteString, + position: Position { line: 1, col: 4 }, + }, + ); + assert_eq!(format!("{}", err.code), "Expected byte string",); + assert_eq!( + ron::from_str::("br#\"q").unwrap_err(), + SpannedError { + code: Error::ExpectedStringEnd, + position: Position { line: 1, col: 5 }, + }, + ); + assert_eq!( + ron::from_str::("r#q\"").unwrap_err(), + SpannedError { + code: Error::ExpectedString, + position: Position { line: 1, col: 3 }, + }, + ); + assert_eq!( + ron::from_str::("r#\"q").unwrap_err(), + SpannedError { + code: Error::ExpectedStringEnd, + position: Position { line: 1, col: 4 }, + }, + ); +} + +fn rusty_byte_string_roundtrip(bytes: &[u8], ron: &str, ron_raw: &str) { + let ser_list = ron::to_string(bytes).unwrap(); + let de_list: Vec = ron::from_str(&ser_list).unwrap(); + assert_eq!(de_list, bytes); + + let ser = ron::to_string(&bytes::Bytes::copy_from_slice(bytes)).unwrap(); + assert_eq!(ser, ron); + + let ser_non_raw = ron::ser::to_string_pretty( + &bytes::Bytes::copy_from_slice(bytes), + ron::ser::PrettyConfig::default(), + ) + .unwrap(); + assert_eq!(ser_non_raw, ron); + + let ser_raw = ron::ser::to_string_pretty( + &bytes::Bytes::copy_from_slice(bytes), + ron::ser::PrettyConfig::default().escape_strings(false), + ) + .unwrap(); + assert_eq!(ser_raw, ron_raw); + + let de: bytes::Bytes = ron::from_str(&ser).unwrap(); + assert_eq!(de, bytes); + + let de_raw: bytes::Bytes = ron::from_str(&ser_raw).unwrap(); + assert_eq!(de_raw, bytes); +} + +#[test] +fn fuzzer_failures() { + assert_eq!( + ron::to_string(&bytes::Bytes::copy_from_slice(&[ + 123, 0, 0, 0, 0, 214, 214, 214, 214, 214 + ])) + .unwrap(), + r#"b"{\x00\x00\x00\x00\xd6\xd6\xd6\xd6\xd6""# + ); + // Need to fall back to 
escaping so no invalid UTF-8 is produced + assert_eq!( + ron::ser::to_string_pretty( + &bytes::Bytes::copy_from_slice(&[123, 0, 0, 0, 0, 214, 214, 214, 214, 214]), + ron::ser::PrettyConfig::default().escape_strings(false) + ) + .unwrap(), + r#"b"{\x00\x00\x00\x00\xd6\xd6\xd6\xd6\xd6""# + ); + + assert_eq!( + ron::to_string(&bytes::Bytes::copy_from_slice(&[123, 0, 0, 0, 0])).unwrap(), + r#"b"{\x00\x00\x00\x00""# + ); + assert_eq!( + ron::ser::to_string_pretty( + &bytes::Bytes::copy_from_slice(&[123, 0, 0, 0, 0]), + ron::ser::PrettyConfig::default().escape_strings(false) + ) + .unwrap(), + "b\"{\x00\x00\x00\x00\"" + ); + + // `br#` should be parsed as the start of a byte string, not the identifier `br` and a `#` + assert_eq!( + ron::from_str(r##"br#"""#"##), + Ok(ron::Value::Bytes(vec![34])) + ); + assert_eq!( + ron::from_str(r##"{"error": br#"""#}"##), + Ok(ron::Value::Map( + [( + ron::Value::String(String::from("error")), + ron::Value::Bytes(vec![34]) + )] + .into_iter() + .collect() + )) + ); + assert_eq!( + ron::from_str( + r##"#![enable(unwrap_newtypes)] + #![enable(unwrap_variant_newtypes)] + Some({"error": br#"""#}) + "## + ), + Ok(ron::Value::Option(Some(Box::new(ron::Value::Map( + [( + ron::Value::String(String::from("error")), + ron::Value::Bytes(vec![34]) + )] + .into_iter() + .collect() + ))))) + ); + + // `br"` should be parsed as the start of a byte string, not the identifier `br` and a `"` + assert_eq!(ron::from_str(r#"br"""#), Ok(ron::Value::Bytes(vec![]))); + assert_eq!( + ron::from_str(r#"{"error": br""}"#), + Ok(ron::Value::Map( + [( + ron::Value::String(String::from("error")), + ron::Value::Bytes(vec![]) + )] + .into_iter() + .collect() + )) + ); + assert_eq!( + ron::from_str( + r#"#![enable(unwrap_newtypes)] + #![enable(unwrap_variant_newtypes)] + Some({"error": br""}) + "# + ), + Ok(ron::Value::Option(Some(Box::new(ron::Value::Map( + [( + ron::Value::String(String::from("error")), + ron::Value::Bytes(vec![]) + )] + .into_iter() + .collect() + 
))))) + ); + + // Test that the struct type check for newtype variant unwrapping does + // not enter inside a byte string to find a bad comment start + assert_eq!( + ron::from_str::>( + r#"#![enable(unwrap_variant_newtypes)] Some(b"\xff/not a comment")"# + ) + .unwrap(), + Some(ron::Value::Bytes(b"\xff/not a comment".to_vec())) + ); + + // `b'` should be parsed as the start of a byte literal, not the identifier `b` and a `'` + assert_eq!( + ron::from_str(r"b'\xff'"), + Ok(ron::Value::Number(ron::value::Number::U8(b'\xff'))) + ); + + // `b`, `br`, `bq`, and `brq` should all be parsed as identifiers + for id in ["b", "br", "bq", "brq"] { + assert_eq!(ron::from_str(id), Ok(ron::Value::Unit)); + } +} + +#[test] +fn serialize_backslash_byte_string() { + check_roundtrip('\\', r"'\\'", r"'\\'"); + check_roundtrip( + bytes::Bytes::copy_from_slice(b"\\"), + r#"b"\\""#, + "br#\"\\\"#", + ); +} + +fn check_roundtrip< + T: PartialEq + std::fmt::Debug + serde::Serialize + serde::de::DeserializeOwned, +>( + val: T, + cmp: &str, + cmp_raw: &str, +) { + let ron = ron::to_string(&val).unwrap(); + assert_eq!(ron, cmp); + + let ron_escaped = + ron::ser::to_string_pretty(&val, ron::ser::PrettyConfig::default().escape_strings(true)) + .unwrap(); + assert_eq!(ron_escaped, cmp); + + let ron_raw = ron::ser::to_string_pretty( + &val, + ron::ser::PrettyConfig::default().escape_strings(false), + ) + .unwrap(); + assert_eq!(ron_raw, cmp_raw); + + let de = ron::from_str::(&ron).unwrap(); + assert_eq!(de, val); + + let de_raw = ron::from_str::(&ron_raw).unwrap(); + assert_eq!(de_raw, val); +} + +#[test] +fn test_weird_escapes() { + assert_eq!( + ron::from_str::(r#""\u{1F980}""#), + Ok(String::from("\u{1F980}")) + ); + assert_eq!( + ron::from_str::(r#"b"\xf0\x9f\xa6\x80""#), + Ok(bytes::Bytes::copy_from_slice("\u{1F980}".as_bytes())) + ); + assert_eq!( + ron::from_str::(r#""\xf0\x9f\xa6\x80""#), + Ok(String::from("\u{1F980}")) + ); + assert_eq!( + ron::from_str::(r#""\xf0""#), + Err(SpannedError 
{ + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 6 } + }) + ); + assert_eq!( + ron::from_str::(r#""\xf0\x9f""#), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 10 } + }) + ); + assert_eq!( + ron::from_str::(r#""\xf0\x9f\x40""#), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 14 } + }) + ); + assert_eq!( + ron::from_str::(r#""\xf0\x9f\xa6""#), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 14 } + }) + ); + assert_eq!( + ron::from_str::(r#""\xff\xff\xff\xff""#), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 18 } + }) + ); + + assert_eq!(ron::from_str::(r"'\u{1F980}'"), Ok('\u{1F980}')); + assert_eq!( + ron::from_str::(r"'\xf0\x9f\xa6\x80'"), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 6 } + }) + ); +} + +#[test] +fn byte_literal() { + assert_eq!( + ron::from_str("b'\0'"), + Ok(ron::Value::Number(ron::value::Number::U8(0))) + ); + assert_eq!( + ron::from_str("b'\\0'"), + Ok(ron::Value::Number(ron::value::Number::U8(0))) + ); + + for b in 0..=255_u8 { + let default = std::ascii::escape_default(b) + .map(char::from) + .collect::(); + let lower = format!(r"\x{:02x}", b); + let upper = format!(r"\x{:02X}", b); + + assert_eq!( + ron::from_str(&format!("b'{}'", default)), + Ok(ron::Value::Number(ron::value::Number::U8(b))) + ); + assert_eq!( + ron::from_str(&format!("b'{}'", lower)), + Ok(ron::Value::Number(ron::value::Number::U8(b))) + ); + assert_eq!( + ron::from_str(&format!("b'{}'", upper)), + Ok(ron::Value::Number(ron::value::Number::U8(b))) + ); + } + + assert_eq!( + 
ron::from_str::(r#"b'\u{0}'"#), + Err(SpannedError { + code: Error::InvalidEscape("Unexpected Unicode escape in byte literal"), + position: Position { line: 1, col: 8 }, + }) + ); + + let err = ron::from_str::(r#"b'🦀'"#).unwrap_err(); + assert_eq!( + err, + SpannedError { + code: Error::ExpectedByteLiteral, + position: Position { line: 1, col: 4 }, + } + ); + assert_eq!(format!("{}", err.code), "Expected byte literal"); + + assert_eq!( + ron::from_str::(r#"b'9'"#), + Err(SpannedError { + code: Error::InvalidValueForType { + expected: String::from("an 8-bit signed integer"), + found: String::from(r#"b'9'"#) + }, + position: Position { line: 1, col: 5 }, + }) + ); +} diff --git a/tests/462_bytes.rs b/tests/462_bytes.rs index ac918903..832dc6a1 100644 --- a/tests/462_bytes.rs +++ b/tests/462_bytes.rs @@ -14,7 +14,7 @@ fn test_deserialise_byte_slice() { Err(ron::error::SpannedError { code: ron::error::Error::InvalidValueForType { expected: String::from("a borrowed byte array"), - found: String::from("the bytes \"AAECAw==\""), + found: String::from("the byte string b\"\\x00\\x01\\x02\\x03\""), }, position: ron::error::Position { line: 1, col: 10 }, }) diff --git a/tests/465_ser_backslash_string.rs b/tests/465_ser_backslash_string.rs index 7b19fd4a..5ca35450 100644 --- a/tests/465_ser_backslash_string.rs +++ b/tests/465_ser_backslash_string.rs @@ -1,21 +1,34 @@ #[test] fn serialize_backslash_string() { - assert_eq!(ron::to_string(&'\\').unwrap(), r"'\\'"); - assert_eq!(ron::to_string(&"\\").unwrap(), r#""\\""#); - assert_eq!( - ron::ser::to_string_pretty( - &"\\", - ron::ser::PrettyConfig::default().escape_strings(true) - ) - .unwrap(), - r#""\\""# - ); - assert_eq!( - ron::ser::to_string_pretty( - &"\\", - ron::ser::PrettyConfig::default().escape_strings(false) - ) - .unwrap(), - "r#\"\\\"#" - ); + check_roundtrip('\\', r"'\\'", r"'\\'"); + check_roundtrip(String::from("\\"), r#""\\""#, "r#\"\\\"#"); +} + +fn check_roundtrip< + T: PartialEq + std::fmt::Debug + 
serde::Serialize + serde::de::DeserializeOwned, +>( + val: T, + cmp: &str, + cmp_raw: &str, +) { + let ron = ron::to_string(&val).unwrap(); + assert_eq!(ron, cmp); + + let ron_escaped = + ron::ser::to_string_pretty(&val, ron::ser::PrettyConfig::default().escape_strings(true)) + .unwrap(); + assert_eq!(ron_escaped, cmp); + + let ron_raw = ron::ser::to_string_pretty( + &val, + ron::ser::PrettyConfig::default().escape_strings(false), + ) + .unwrap(); + assert_eq!(ron_raw, cmp_raw); + + let de = ron::from_str::(&ron).unwrap(); + assert_eq!(de, val); + + let de_raw = ron::from_str::(&ron_raw).unwrap(); + assert_eq!(de_raw, val); } diff --git a/tests/comments.rs b/tests/comments.rs index 2b77de61..fcbc41b7 100644 --- a/tests/comments.rs +++ b/tests/comments.rs @@ -50,3 +50,14 @@ fn test_unclosed() { }) ); } + +#[test] +fn test_unexpected_byte() { + assert_eq!( + from_str::("42 /q"), + Err(RonErr { + code: Error::UnexpectedByte(b'q'), + position: Position { line: 1, col: 6 }, + }) + ); +} diff --git a/tests/escape.rs b/tests/escape.rs index 3c3c82da..62dd26ff 100644 --- a/tests/escape.rs +++ b/tests/escape.rs @@ -12,6 +12,14 @@ fn test_escape_basic() { assert_eq!(from_str::("\'\\x07\'").unwrap(), '\x07'); assert_eq!(from_str::("\'\\u{7}\'").unwrap(), '\x07'); + + assert_eq!( + from_str::("\'\\q\'").unwrap_err(), + ron::error::SpannedError { + code: ron::Error::InvalidEscape("Unknown escape character"), + position: ron::error::Position { line: 1, col: 4 }, + } + ) } fn check_same(t: T) diff --git a/tests/value.rs b/tests/value.rs index fdc3b22a..9b57a4d9 100644 --- a/tests/value.rs +++ b/tests/value.rs @@ -10,11 +10,16 @@ use serde_derive::{Deserialize, Serialize}; fn bool() { assert_eq!("true".parse(), Ok(Value::Bool(true))); assert_eq!("false".parse(), Ok(Value::Bool(false))); + + assert_eq!(ron::to_string(&Value::Bool(true)).unwrap(), "true"); + assert_eq!(ron::to_string(&Value::Bool(false)).unwrap(), "false"); } #[test] fn char() { assert_eq!("'a'".parse(), 
Ok(Value::Char('a'))); + + assert_eq!(ron::to_string(&Value::Char('a')).unwrap(), "'a'"); } #[test] @@ -22,7 +27,11 @@ fn map() { let mut map = Map::new(); map.insert(Value::Char('a'), Value::Number(Number::U8(1))); map.insert(Value::Char('b'), Value::Number(Number::new(2f32))); - assert_eq!("{ 'a': 1, 'b': 2.0 }".parse(), Ok(Value::Map(map))); + let map = Value::Map(map); + + assert_eq!(ron::to_string(&map).unwrap(), "{'a':1,'b':2.0}"); + + assert_eq!("{ 'a': 1, 'b': 2.0 }".parse(), Ok(map)); } #[test] @@ -32,18 +41,38 @@ fn number() { "3.141592653589793".parse(), Ok(Value::Number(Number::new(f64::consts::PI))) ); + + assert_eq!( + ron::to_string(&Value::Number(Number::U8(42))).unwrap(), + "42" + ); + assert_eq!( + ron::to_string(&Value::Number(Number::F64(f64::consts::PI.into()))).unwrap(), + "3.141592653589793" + ); } #[test] fn option() { let opt = Some(Box::new(Value::Char('c'))); assert_eq!("Some('c')".parse(), Ok(Value::Option(opt))); + assert_eq!("None".parse(), Ok(Value::Option(None))); + + assert_eq!( + ron::to_string(&Value::Option(Some(Box::new(Value::Char('c'))))).unwrap(), + "Some('c')" + ); + assert_eq!(ron::to_string(&Value::Option(None)).unwrap(), "None"); } #[test] fn string() { let normal = "\"String\""; assert_eq!(normal.parse(), Ok(Value::String("String".into()))); + assert_eq!( + ron::to_string(&Value::String("String".into())).unwrap(), + "\"String\"" + ); let raw = "r\"Raw String\""; assert_eq!(raw.parse(), Ok(Value::String("Raw String".into()))); @@ -62,15 +91,32 @@ fn string() { raw_multi_line.parse(), Ok(Value::String("Multi\nLine".into())) ); + assert_eq!( + ron::to_string(&Value::String("Multi\nLine".into())).unwrap(), + "\"Multi\\nLine\"" + ); +} + +#[test] +fn byte_string() { + assert_eq!( + "b\"\\x01\\u{2}\\0\\x04\"".parse(), + Ok(Value::Bytes(vec![1, 2, 0, 4])) + ); + assert_eq!( + ron::to_string(&Value::Bytes(vec![1, 2, 0, 4])).unwrap(), + "b\"\\x01\\x02\\x00\\x04\"" + ); } #[test] fn seq() { - let seq = vec![ + let seq = 
Value::Seq(vec![ Value::Number(Number::U8(1)), Value::Number(Number::new(2f32)), - ]; - assert_eq!("[1, 2.0]".parse(), Ok(Value::Seq(seq))); + ]); + assert_eq!(ron::to_string(&seq).unwrap(), "[1,2.0]"); + assert_eq!("[1, 2.0]".parse(), Ok(seq)); let err = Value::Seq(vec![Value::Number(Number::new(1))]) .into_rust::<[i32; 2]>() @@ -115,6 +161,8 @@ fn unit() { position: Position { col: 1, line: 1 } }) ); + + assert_eq!(ron::to_string(&Value::Unit).unwrap(), "()"); } #[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]