perf(parser): support peeking over bytes (#4304)

Closes #3291
oxc-project · Jul 30, 2024 · c9c38a1 · c9c38a1
1 parent 732f4e2
commit c9c38a1
Show file tree

Hide file tree

Showing 11 changed files with 116 additions and 76 deletions.
diff --git a/crates/oxc_ast/src/ast_impl/literal.rs b/crates/oxc_ast/src/ast_impl/literal.rs
@@ -108,6 +108,24 @@ impl TryFrom<char> for RegExpFlags {
     }
 }
 
+impl TryFrom<u8> for RegExpFlags {
+    type Error = u8;
+
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        match value {
+            b'g' => Ok(Self::G),
+            b'i' => Ok(Self::I),
+            b'm' => Ok(Self::M),
+            b's' => Ok(Self::S),
+            b'u' => Ok(Self::U),
+            b'y' => Ok(Self::Y),
+            b'd' => Ok(Self::D),
+            b'v' => Ok(Self::V),
+            _ => Err(value),
+        }
+    }
+}
+
 impl fmt::Display for RegExpFlags {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         if self.contains(Self::G) {

diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs
@@ -336,12 +336,12 @@ ascii_byte_handler!(PRD(lexer) {
 // /
 ascii_byte_handler!(SLH(lexer) {
     lexer.consume_char();
-    match lexer.peek() {
-        Some('/') => {
+    match lexer.peek_byte() {
+        Some(b'/') => {
             lexer.consume_char();
             lexer.skip_single_line_comment()
         }
-        Some('*') => {
+        Some(b'*') => {
             lexer.consume_char();
             lexer.skip_multi_line_comment()
         }
@@ -418,9 +418,9 @@ ascii_byte_handler!(QST(lexer) {
         } else {
             Kind::Question2
         }
-    } else if lexer.peek() == Some('.') {
+    } else if lexer.peek_byte() == Some(b'.') {
         // parse `?.1` as `?` `.1`
-        if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) {
+        if lexer.peek_char2().is_some_and(|c| c.is_ascii_digit()) {
             Kind::Question
         } else {
             lexer.consume_char();

diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs
@@ -98,7 +98,7 @@ impl<'a> Lexer<'a> {
     /// Any number of characters can have already been consumed from `self.source` prior to it.
     /// `self.source` should be positioned at start of Unicode character.
     fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
-        let c = self.peek().unwrap();
+        let c = self.peek_char().unwrap();
         if is_identifier_part_unicode(c) {
             self.consume_char();
             self.identifier_tail_after_unicode(start_pos)
@@ -115,7 +115,7 @@ impl<'a> Lexer<'a> {
     pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
         // Identifier contains a Unicode char, so it probably contains more.
         // So just iterate over chars now, instead of bytes.
-        while let Some(c) = self.peek() {
+        while let Some(c) = self.peek_char() {
             if is_identifier_part(c) {
                 self.consume_char();
             } else if c == '\\' {
@@ -177,7 +177,7 @@ impl<'a> Lexer<'a> {
             // Consume chars until reach end of identifier or another escape
             let chunk_start = self.source.position();
             loop {
-                let maybe_char = self.peek();
+                let maybe_char = self.peek_char();
                 if maybe_char.is_some_and(is_identifier_part) {
                     self.consume_char();
                     continue;
@@ -272,7 +272,7 @@ impl<'a> Lexer<'a> {
     fn private_identifier_not_ascii_id(&mut self) -> Kind {
         let b = self.source.peek_byte().unwrap();
         if !b.is_ascii() {
-            let c = self.peek().unwrap();
+            let c = self.peek_char().unwrap();
             if is_identifier_start_unicode(c) {
                 let start_pos = self.source.position();
                 self.consume_char();

diff --git a/crates/oxc_parser/src/lexer/jsx.rs b/crates/oxc_parser/src/lexer/jsx.rs
@@ -61,12 +61,12 @@ impl<'a> Lexer<'a> {
     /// `JSXFragment`
     /// { `JSXChildExpressionopt` }
     fn read_jsx_child(&mut self) -> Kind {
-        match self.peek() {
-            Some('<') => {
+        match self.peek_byte() {
+            Some(b'<') => {
                 self.consume_char();
                 Kind::LAngle
             }
-            Some('{') => {
+            Some(b'{') => {
                 self.consume_char();
                 Kind::LCurly
             }
@@ -122,7 +122,7 @@ impl<'a> Lexer<'a> {
             // Unicode chars are rare in identifiers, so cold branch to keep common path for ASCII
             // as fast as possible
             cold_branch(|| {
-                while let Some(c) = self.peek() {
+                while let Some(c) = self.peek_char() {
                     if c == '-' || is_identifier_part(c) {
                         self.consume_char();
                     } else {

diff --git a/crates/oxc_parser/src/lexer/kind.rs b/crates/oxc_parser/src/lexer/kind.rs
@@ -206,11 +206,11 @@ impl Kind {
         )
     }
 
-    pub fn matches_number_char(self, c: char) -> bool {
+    pub fn matches_number_char(self, c: u8) -> bool {
         match self {
             Decimal => c.is_ascii_digit(),
-            Binary => matches!(c, '0'..='1'),
-            Octal => matches!(c, '0'..='7'),
+            Binary => matches!(c, b'0'..=b'1'),
+            Octal => matches!(c, b'0'..=b'7'),
             Hex => c.is_ascii_hexdigit(),
             _ => unreachable!(),
         }

diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
@@ -251,15 +251,27 @@ impl<'a> Lexer<'a> {
         self.source.next_char().unwrap()
     }
 
+    /// Peek the next byte without advancing the position
+    #[inline]
+    fn peek_byte(&self) -> Option<u8> {
+        self.source.peek_byte()
+    }
+
+    /// Peek the next two bytes without advancing the position
+    #[inline]
+    fn peek_2_bytes(&self) -> Option<[u8; 2]> {
+        self.source.peek_2_bytes()
+    }
+
     /// Peek the next char without advancing the position
     #[inline]
-    fn peek(&self) -> Option<char> {
+    fn peek_char(&self) -> Option<char> {
         self.source.peek_char()
     }
 
     /// Peek the char after the next char without advancing the position
     #[inline]
-    fn peek2(&self) -> Option<char> {
+    fn peek_char2(&self) -> Option<char> {
         self.source.peek_char2()
     }
 
@@ -284,7 +296,7 @@ impl<'a> Lexer<'a> {
     /// Return `IllegalCharacter` Error or `UnexpectedEnd` if EOF
     fn unexpected_err(&mut self) {
         let offset = self.current_offset();
-        match self.peek() {
+        match self.peek_char() {
             Some(c) => self.error(diagnostics::invalid_character(c, offset)),
             None => self.error(diagnostics::unexpected_end(offset)),
         }

diff --git a/crates/oxc_parser/src/lexer/numeric.rs b/crates/oxc_parser/src/lexer/numeric.rs
@@ -6,19 +6,19 @@ use crate::diagnostics;
 impl<'a> Lexer<'a> {
     /// 12.9.3 Numeric Literals with `0` prefix
     pub(super) fn read_zero(&mut self) -> Kind {
-        match self.peek() {
-            Some('b' | 'B') => self.read_non_decimal(Kind::Binary),
-            Some('o' | 'O') => self.read_non_decimal(Kind::Octal),
-            Some('x' | 'X') => self.read_non_decimal(Kind::Hex),
-            Some('e' | 'E') => {
+        match self.peek_byte() {
+            Some(b'b' | b'B') => self.read_non_decimal(Kind::Binary),
+            Some(b'o' | b'O') => self.read_non_decimal(Kind::Octal),
+            Some(b'x' | b'X') => self.read_non_decimal(Kind::Hex),
+            Some(b'e' | b'E') => {
                 self.consume_char();
                 self.read_decimal_exponent()
             }
-            Some('.') => {
+            Some(b'.') => {
                 self.consume_char();
                 self.decimal_literal_after_decimal_point_after_digits()
             }
-            Some('n') => {
+            Some(b'n') => {
                 self.consume_char();
                 self.check_after_numeric_literal(Kind::Decimal)
             }
@@ -42,23 +42,23 @@ impl<'a> Lexer<'a> {
     fn read_non_decimal(&mut self, kind: Kind) -> Kind {
         self.consume_char();
 
-        if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
+        if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
             self.consume_char();
         } else {
             self.unexpected_err();
             return Kind::Undetermined;
         }
 
-        while let Some(c) = self.peek() {
+        while let Some(c) = self.peek_byte() {
             match c {
-                '_' => {
+                b'_' => {
                     self.consume_char();
                     // NOTE: it looks like invalid numeric tokens are still parsed.
                     // This seems to be a waste. It also requires us to put this
                     // call here instead of after we ensure the next character
                     // is a number character
                     self.token.set_has_separator();
-                    if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
+                    if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
                         self.consume_char();
                     } else {
                         self.unexpected_err();
@@ -71,35 +71,33 @@ impl<'a> Lexer<'a> {
                 _ => break,
             }
         }
-        if self.peek() == Some('n') {
-            self.consume_char();
-        }
+        self.next_ascii_char_eq(b'n');
         self.check_after_numeric_literal(kind)
     }
 
     fn read_legacy_octal(&mut self) -> Kind {
         let mut kind = Kind::Octal;
         loop {
-            match self.peek() {
-                Some('0'..='7') => {
+            match self.peek_byte() {
+                Some(b'0'..=b'7') => {
                     self.consume_char();
                 }
-                Some('8'..='9') => {
+                Some(b'8'..=b'9') => {
                     self.consume_char();
                     kind = Kind::Decimal;
                 }
                 _ => break,
             }
         }
 
-        match self.peek() {
+        match self.peek_byte() {
             // allow 08.5 and 09.5
-            Some('.') if kind == Kind::Decimal => {
+            Some(b'.') if kind == Kind::Decimal => {
                 self.consume_char();
                 self.decimal_literal_after_decimal_point_after_digits()
             }
             // allow 08e1 and 09e1
-            Some('e') if kind == Kind::Decimal => {
+            Some(b'e') if kind == Kind::Decimal => {
                 self.consume_char();
                 self.read_decimal_exponent()
             }
@@ -108,12 +106,12 @@ impl<'a> Lexer<'a> {
     }
 
     fn read_decimal_exponent(&mut self) -> Kind {
-        let kind = match self.peek() {
-            Some('-') => {
+        let kind = match self.peek_byte() {
+            Some(b'-') => {
                 self.consume_char();
                 Kind::NegativeExponential
             }
-            Some('+') => {
+            Some(b'+') => {
                 self.consume_char();
                 Kind::PositiveExponential
             }
@@ -124,7 +122,7 @@ impl<'a> Lexer<'a> {
     }
 
     fn read_decimal_digits(&mut self) {
-        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+        if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
             self.consume_char();
         } else {
             self.unexpected_err();
@@ -135,23 +133,23 @@ impl<'a> Lexer<'a> {
     }
 
     fn read_decimal_digits_after_first_digit(&mut self) {
-        while let Some(c) = self.peek() {
-            match c {
-                '_' => {
+        while let Some(b) = self.peek_byte() {
+            match b {
+                b'_' => {
                     self.consume_char();
                     // NOTE: it looks like invalid numeric tokens are still parsed.
                     // This seems to be a waste. It also requires us to put this
                     // call here instead of after we ensure the next character
                     // is an ASCII digit
                     self.token.set_has_separator();
-                    if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+                    if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
                         self.consume_char();
                     } else {
                         self.unexpected_err();
                         return;
                     }
                 }
-                '0'..='9' => {
+                b'0'..=b'9' => {
                     self.consume_char();
                 }
                 _ => break,
@@ -172,16 +170,14 @@ impl<'a> Lexer<'a> {
     }
 
     fn optional_decimal_digits(&mut self) {
-        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+        if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
             self.consume_char();
-        } else {
-            return;
+            self.read_decimal_digits_after_first_digit();
         }
-        self.read_decimal_digits_after_first_digit();
     }
 
     fn optional_exponent(&mut self) -> Option<Kind> {
-        if matches!(self.peek(), Some('e' | 'E')) {
+        if matches!(self.peek_byte(), Some(b'e' | b'E')) {
             self.consume_char();
             return Some(self.read_decimal_exponent());
         }
@@ -191,12 +187,12 @@ impl<'a> Lexer<'a> {
     fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind {
         let offset = self.offset();
         // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
-        let c = self.peek();
+        let c = self.peek_char();
         if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) {
             return kind;
         }
         self.consume_char();
-        while let Some(c) = self.peek() {
+        while let Some(c) = self.peek_char() {
             if is_identifier_start(c) {
                 self.consume_char();
             } else {

diff --git a/crates/oxc_parser/src/lexer/punctuation.rs b/crates/oxc_parser/src/lexer/punctuation.rs
@@ -3,12 +3,12 @@ use super::{Kind, Lexer, Token};
 impl<'a> Lexer<'a> {
     /// Section 12.8 Punctuators
     pub(super) fn read_dot(&mut self) -> Kind {
-        if self.peek() == Some('.') && self.peek2() == Some('.') {
+        if self.peek_2_bytes() == Some([b'.', b'.']) {
             self.consume_char();
             self.consume_char();
             return Kind::Dot3;
         }
-        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+        if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
             self.decimal_literal_after_decimal_point()
         } else {
             Kind::Dot
@@ -25,7 +25,7 @@ impl<'a> Lexer<'a> {
             }
         } else if self.next_ascii_char_eq(b'=') {
             Some(Kind::LtEq)
-        } else if self.peek() == Some('!')
+        } else if self.peek_byte() == Some(b'!')
             // SingleLineHTMLOpenComment `<!--` in script mode
             && self.source_type.is_script()
             && self.remaining().starts_with("!--")

diff --git a/crates/oxc_parser/src/lexer/regex.rs b/crates/oxc_parser/src/lexer/regex.rs
@@ -58,14 +58,16 @@ impl<'a> Lexer<'a> {
         let pattern_end = self.offset() - 1; // -1 to exclude `/`
         let mut flags = RegExpFlags::empty();
 
-        while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
+        while let Some(ch @ (b'$' | b'_' | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9')) =
+            self.peek_byte()
+        {
             self.consume_char();
             let Ok(flag) = RegExpFlags::try_from(ch) else {
-                self.error(diagnostics::reg_exp_flag(ch, self.current_offset()));
+                self.error(diagnostics::reg_exp_flag(ch as char, self.current_offset()));
                 continue;
             };
             if flags.contains(flag) {
-                self.error(diagnostics::reg_exp_flag_twice(ch, self.current_offset()));
+                self.error(diagnostics::reg_exp_flag_twice(ch as char, self.current_offset()));
                 continue;
             }
             flags |= flag;