From 304ce25446811af8e8a088984bad1a4f46436f84 Mon Sep 17 00:00:00 2001 From: leaysgur <6259812+leaysgur@users.noreply.github.com> Date: Wed, 11 Sep 2024 07:07:00 +0000 Subject: [PATCH] fix(regular_expression): Keep LegacyOctalEscape raw digits for `to_string` (#5692) Fixes #5690 - Update `CharacterKind` enum from `Octal` to `Octal1`, `Octal2` and `Octal3` - Stylistic refactoring for `impl Display` --- crates/oxc_regular_expression/src/ast.rs | 11 +- .../src/body_parser/parser.rs | 15 +- crates/oxc_regular_expression/src/display.rs | 170 ++++++++++-------- .../src/generated/derive_clone_in.rs | 4 +- tasks/coverage/parser_test262.snap | 5 +- tasks/coverage/semantic_test262.snap | 5 +- 6 files changed, 121 insertions(+), 89 deletions(-) diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs index c7740441e8c69..acb339be6c059 100644 --- a/crates/oxc_regular_expression/src/ast.rs +++ b/crates/oxc_regular_expression/src/ast.rs @@ -174,10 +174,13 @@ pub enum CharacterKind { HexadecimalEscape = 1, Identifier = 2, Null = 3, - Octal = 4, - SingleEscape = 5, - Symbol = 6, - UnicodeEscape = 7, + // To distinguish leading 0 cases like `\00` and `\000` + Octal1 = 4, + Octal2 = 5, + Octal3 = 6, + SingleEscape = 7, + Symbol = 8, + UnicodeEscape = 9, } /// Character class. diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs index 2fff63b5a42a9..6e68b6788ed72 100644 --- a/crates/oxc_regular_expression/src/body_parser/parser.rs +++ b/crates/oxc_regular_expression/src/body_parser/parser.rs @@ -691,12 +691,21 @@ impl<'a> PatternParser<'a> { })); } - // e.g. \18 + // e.g. \1, \00, \000 if !self.state.unicode_mode { if let Some(cp) = self.consume_legacy_octal_escape_sequence() { + let span_end = self.reader.offset(); + // Keep original digits for `to_string()` + // Otherwise `\0022`(octal \002 + symbol 2) will be `\22`(octal \22) + let digits = span_end - span_start - 1; // -1 for '\' + return Ok(Some(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Octal, + span: self.span_factory.create(span_start, span_end), + kind: (match digits { + 3 => ast::CharacterKind::Octal3, + 2 => ast::CharacterKind::Octal2, + _ => ast::CharacterKind::Octal1, + }), value: cp, })); } diff --git a/crates/oxc_regular_expression/src/display.rs b/crates/oxc_regular_expression/src/display.rs index e0efc11b4627f..ce790dc4524f6 100644 --- a/crates/oxc_regular_expression/src/display.rs +++ b/crates/oxc_regular_expression/src/display.rs @@ -16,24 +16,23 @@ impl<'a> Display for RegularExpression<'a> { impl Display for Flags { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut flags = String::with_capacity(8); - macro_rules! if_true_append { - ($flag:ident, $char:literal) => { - if self.$flag { - flags.push($char); - } - }; - } // write flags in the order they are described in the `MDN` // - if_true_append!(has_indices, 'd'); - if_true_append!(global, 'g'); - if_true_append!(ignore_case, 'i'); - if_true_append!(multiline, 'm'); - if_true_append!(dot_all, 's'); - if_true_append!(unicode, 'u'); - if_true_append!(unicode_sets, 'v'); - if_true_append!(sticky, 'y'); + for (v, ch) in [ + (self.has_indices, 'd'), + (self.global, 'g'), + (self.ignore_case, 'i'), + (self.multiline, 'm'), + (self.dot_all, 's'), + (self.unicode, 'u'), + (self.unicode_sets, 'v'), + (self.sticky, 'y'), + ] { + if v { + flags.push(ch); + } + } write!(f, "{flags}") } @@ -60,14 +59,17 @@ impl<'a> Display for Alternative<'a> { None } } + write_join_with(f, "", &self.body, |iter| { let next = iter.next()?; let Some(next) = as_character(next) else { return Some(next.to_string()) }; + let peek = iter.peek().and_then(|it| as_character(it)); let (result, eat) = character_to_string(next, peek); if eat { - _ = iter.next(); + iter.next(); } + Some(result) }) } @@ -208,25 +210,30 @@ impl<'a> Display for CharacterClass<'a> { None } } + write!(f, "[")?; if !self.body.is_empty() { if self.negative { write!(f, "^")?; } + let sep = match self.kind { CharacterClassContentsKind::Union => "", CharacterClassContentsKind::Subtraction => "--", CharacterClassContentsKind::Intersection => "&&", }; + write_join_with(f, sep, &self.body, |iter| { let next = iter.next()?; let Some(next) = as_character(next) else { return Some(next.to_string()) }; + let peek = iter.peek().and_then(|it| as_character(it)); let (result, eat) = character_to_string(next, peek); if eat { - _ = iter.next(); + iter.next(); } + Some(result) })?; } @@ -270,12 +277,14 @@ impl<'a> Display for ClassString<'a> { impl<'a> Display for CapturingGroup<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let body = &self.body; + write!(f, "(")?; + if let Some(name) = &self.name { - write!(f, "(?<{name}>{body})") - } else { - write!(f, "({body})") + write!(f, "?<{name}>")?; } + write!(f, "{}", &self.body)?; + + write!(f, ")") } } @@ -299,12 +308,14 @@ impl<'a> Display for IgnoreGroup<'a> { } write!(f, "(?")?; + if let Some(enabling) = &self.enabling_modifiers { write_flags(f, '\0', enabling)?; } if let Some(disabling) = &self.disabling_modifiers { write_flags(f, '-', disabling)?; } + write!(f, ":{})", self.body) } } @@ -321,35 +332,7 @@ impl<'a> Display for NamedReference<'a> { } } -fn write_join(f: &mut fmt::Formatter<'_>, sep: S, items: I) -> fmt::Result -where - S: AsRef, - E: Display, - I: IntoIterator, -{ - write_join_with(f, sep, items, |iter| iter.next().map(|it| it.to_string())) -} - -fn write_join_with(f: &mut fmt::Formatter<'_>, sep: S, items: I, next: F) -> fmt::Result -where - S: AsRef, - E: Display, - I: IntoIterator, - F: Fn(&mut Peekable) -> Option, -{ - let sep = sep.as_ref(); - let iter = &mut items.into_iter().peekable(); - - if let Some(first) = next(iter) { - write!(f, "{first}")?; - } - - while let Some(it) = next(iter) { - write!(f, "{sep}{it}")?; - } - - Ok(()) -} +// --- fn character_to_string( this: &Character, @@ -378,6 +361,8 @@ fn character_to_string( let ch = char::from_u32(cp).expect("Invalid `Character`!"); let result = match this.kind { + // Not a surrogate, like BMP, or all units in unicode mode + CharacterKind::Symbol => format!("{ch}"), CharacterKind::ControlLetter => match ch { '\n' => r"\cJ".to_string(), '\r' => r"\cM".to_string(), @@ -387,8 +372,16 @@ fn character_to_string( CharacterKind::Identifier => { format!(r"\{ch}") } - // Not a surrogate, like BMP, or all units in unicode mode - CharacterKind::Symbol => format!("{ch}"), + CharacterKind::SingleEscape => match ch { + '\n' => String::from(r"\n"), + '\r' => String::from(r"\r"), + '\t' => String::from(r"\t"), + '\u{b}' => String::from(r"\v"), + '\u{c}' => String::from(r"\f"), + '\u{8}' => String::from(r"\b"), + '\u{2D}' => String::from(r"\-"), + _ => format!(r"\{ch}"), + }, CharacterKind::Null => String::from(r"\0"), CharacterKind::UnicodeEscape => { let hex = &format!("{cp:04X}"); @@ -402,27 +395,58 @@ fn character_to_string( let hex = &format!("{cp:02X}"); format!(r"\x{hex}") } - CharacterKind::Octal => { + CharacterKind::Octal1 => { let octal = format!("{cp:o}"); format!(r"\{octal}") } - CharacterKind::SingleEscape => match ch { - '\n' => String::from(r"\n"), - '\r' => String::from(r"\r"), - '\t' => String::from(r"\t"), - '\u{b}' => String::from(r"\v"), - '\u{c}' => String::from(r"\f"), - '\u{8}' => String::from(r"\b"), - '\u{2D}' => String::from(r"\-"), - _ => format!(r"\{ch}"), - }, + CharacterKind::Octal2 => { + let octal = format!("{cp:02o}"); + format!(r"\{octal}") + } + CharacterKind::Octal3 => { + let octal = format!("{cp:03o}"); + format!(r"\{octal}") + } }; (result, false) } +// --- + +fn write_join(f: &mut fmt::Formatter<'_>, sep: S, items: I) -> fmt::Result +where + S: AsRef, + E: Display, + I: IntoIterator, +{ + write_join_with(f, sep, items, |iter| iter.next().map(|it| it.to_string())) +} + +fn write_join_with(f: &mut fmt::Formatter<'_>, sep: S, items: I, next: F) -> fmt::Result +where + S: AsRef, + E: Display, + I: IntoIterator, + F: Fn(&mut Peekable) -> Option, +{ + let sep = sep.as_ref(); + let iter = &mut items.into_iter().peekable(); + + if let Some(first) = next(iter) { + write!(f, "{first}")?; + } + + while let Some(it) = next(iter) { + write!(f, "{sep}{it}")?; + } + + Ok(()) +} + #[cfg(test)] mod test { + use crate::{Parser, ParserOptions}; use oxc_allocator::Allocator; type Case<'a> = ( @@ -505,23 +529,24 @@ mod test { (r"/\5/", None), (r"/\6/", None), (r"/\7/", None), - // Remove leading zeroes -- - (r"/\00/", Some(r"/\0/")), - (r"/\07/", Some(r"/\7/")), - // -- + (r"/\00/", None), + (r"/\07/", None), + (r"/\30/", None), + (r"/\37/", None), (r"/\40/", None), (r"/\47/", None), (r"/\70/", None), (r"/\77/", None), - // Remove leading zeroes -- - (r"/\000/", Some(r"/\0/")), - (r"/\007/", Some(r"/\7/")), - (r"/\070/", Some(r"/\70/")), - // -- + (r"/\000/", None), + (r"/\007/", None), + (r"/\070/", None), (r"/\300/", None), (r"/\307/", None), (r"/\370/", None), (r"/\377/", None), + (r"/\0111/", None), + (r"/\0022/", None), + (r"/\0003/", None), (r"/(.)\1/", None), // Identity escape from: (r"/\C/", None), @@ -553,7 +578,6 @@ mod test { ]; fn test_display(allocator: &Allocator, (source, expect): &Case) { - use crate::{Parser, ParserOptions}; let expect = expect.unwrap_or(source); let actual = Parser::new(allocator, source, ParserOptions::default()).parse().unwrap(); assert_eq!(expect, actual.to_string()); diff --git a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs index 76890af6bdf89..d41d5daae7fcf 100644 --- a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs +++ b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs @@ -171,7 +171,9 @@ impl<'alloc> CloneIn<'alloc> for CharacterKind { Self::HexadecimalEscape => CharacterKind::HexadecimalEscape, Self::Identifier => CharacterKind::Identifier, Self::Null => CharacterKind::Null, - Self::Octal => CharacterKind::Octal, + Self::Octal1 => CharacterKind::Octal1, + Self::Octal2 => CharacterKind::Octal2, + Self::Octal3 => CharacterKind::Octal3, Self::SingleEscape => CharacterKind::SingleEscape, Self::Symbol => CharacterKind::Symbol, Self::UnicodeEscape => CharacterKind::UnicodeEscape, diff --git a/tasks/coverage/parser_test262.snap b/tasks/coverage/parser_test262.snap index a01087b6d2fa6..190fd4dd3005e 100644 --- a/tasks/coverage/parser_test262.snap +++ b/tasks/coverage/parser_test262.snap @@ -2,11 +2,8 @@ commit: d62fa93c parser_test262 Summary: AST Parsed : 43765/43765 (100.00%) -Positive Passed: 43764/43765 (100.00%) +Positive Passed: 43765/43765 (100.00%) Negative Passed: 4237/4237 (100.00%) -Expect to Parse: tasks/coverage/test262/test/annexB/language/literals/regexp/legacy-octal-escape.js - - × Regular Expression mismatch: \03 \3 × '0'-prefixed octal literals and octal escape sequences are deprecated ╭─[test262/test/annexB/language/expressions/template-literal/legacy-octal-escape-sequence-strict.js:19:4] diff --git a/tasks/coverage/semantic_test262.snap b/tasks/coverage/semantic_test262.snap index 6310120d19f11..900514ade67a0 100644 --- a/tasks/coverage/semantic_test262.snap +++ b/tasks/coverage/semantic_test262.snap @@ -2,7 +2,7 @@ commit: d62fa93c semantic_test262 Summary: AST Parsed : 43765/43765 (100.00%) -Positive Passed: 43564/43765 (99.54%) +Positive Passed: 43565/43765 (99.54%) tasks/coverage/test262/test/annexB/language/function-code/if-decl-else-decl-a-func-block-scoping.js semantic error: Symbol scope ID mismatch: after transform: SymbolId(3): ScopeId(4294967294) @@ -1119,9 +1119,6 @@ semantic error: Symbol scope ID mismatch: after transform: SymbolId(0): ScopeId(4294967294) rebuilt : SymbolId(0): ScopeId(4294967294) -tasks/coverage/test262/test/annexB/language/literals/regexp/legacy-octal-escape.js -semantic error: Regular Expression mismatch: \03 \3 - tasks/coverage/test262/test/language/module-code/eval-rqstd-once.js semantic error: Bindings mismatch: after transform: ScopeId(0): ["dflt1", "dflt2", "dflt3", "global", "ns1", "ns3"]