perf(syntax): optimize is_identifier_name (#5425)

Optimize `oxc_syntax::identifier::is_identifier_name`. Add a fast path for ASCII, which will be the common case. Only fall back to iterating over `char`s and using the more expensive test functions e.g. `is_identifier_start_unicode` if non-ASCII chars are found.
oxc-project · Sep 3, 2024 · aeda84f · aeda84f
1 parent 3ccb065
commit aeda84f
Showing 1 changed file with 132 additions and 2 deletions.
diff --git a/crates/oxc_syntax/src/identifier.rs b/crates/oxc_syntax/src/identifier.rs
@@ -136,7 +136,137 @@ pub fn is_identifier_part_unicode(c: char) -> bool {
     is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
 }
 
+/// Determine if a string is a valid JS identifier.
+#[allow(clippy::missing_panics_doc)]
 pub fn is_identifier_name(name: &str) -> bool {
-    let mut chars = name.chars();
-    chars.next().is_some_and(is_identifier_start) && chars.all(is_identifier_part)
+    // This function contains a fast path for ASCII (common case), iterating over bytes and using
+    // the cheap `is_identifier_start_ascii` and `is_identifier_part_ascii` to test bytes.
+    // Only if a Unicode char is found, fall back to iterating over `char`s, and using the more
+    // expensive `is_identifier_start_unicode` and `is_identifier_part`.
+
+    // Get first byte. Exit if empty string.
+    let bytes = name.as_bytes();
+    let Some(&first_byte) = bytes.first() else { return false };
+
+    let mut chars = if first_byte.is_ascii() {
+        // First byte is ASCII
+        if !is_identifier_start_ascii(first_byte as char) {
+            return false;
+        }
+
+        // `'outer` loop never actually loops - only here to allow breaking out of it when Unicode found
+        #[allow(clippy::never_loop)]
+        let index = 'outer: loop {
+            for (index, &b) in bytes[1..].iter().enumerate() {
+                if b.is_ascii() {
+                    if !is_identifier_part_ascii(b as char) {
+                        return false;
+                    }
+                } else {
+                    // Unicode byte found
+                    break 'outer index;
+                }
+            }
+            // We got to end without finding any non-identifier of Unicode characters
+            return true;
+        };
+
+        // Unicode byte found - search rest of string (from this byte onwards) as Unicode.
+        // `index + 1` because `index` returned from the loop is relative to start of `bytes[1..]`.
+        name[index + 1..].chars()
+    } else {
+        // First char is Unicode.
+        // NB: `unwrap()` cannot fail because we already checked the string is not empty.
+        let mut chars = name.chars();
+        let first_char = chars.next().unwrap();
+        if !is_identifier_start_unicode(first_char) {
+            return false;
+        }
+        // Search rest of string as Unicode
+        chars
+    };
+
+    // A Unicode char was found - search rest of string as Unicode
+    chars.all(is_identifier_part)
+}
+
+#[test]
+fn is_identifier_name_true() {
+    let cases = [
+        // 1 char ASCII
+        "a",
+        "z",
+        "A",
+        "Z",
+        "_",
+        "$",
+        // 1 char Unicode
+        "µ", // 2 bytes
+        "ख", // 3 bytes
+        "𐀀", // 4 bytes
+        // Multiple chars ASCII
+        "az",
+        "AZ",
+        "_a",
+        "$Z",
+        "a0",
+        "A9",
+        "_0",
+        "$9",
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$",
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_$",
+        "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789$",
+        "$abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_",
+        // Multiple chars Unicode
+        "µख𐀀",
+        // ASCII + Unicode, starting with ASCII
+        "AµBखC𐀀D",
+        // ASCII + Unicode, starting with Unicode
+        "µAखB𐀀",
+    ];
+
+    for str in cases {
+        assert!(is_identifier_name(str));
+    }
+}
+
+#[test]
+fn is_identifier_name_false() {
+    let cases = [
+        // Empty string
+        "",
+        // 1 char ASCII
+        "0",
+        "9",
+        "-",
+        "~",
+        "+",
+        // 1 char Unicode
+        "£", // 2 bytes
+        "৸", // 3 bytes
+        "𐄬", // 4 bytes
+        // Multiple chars ASCII
+        "0a",
+        "9a",
+        "-a",
+        "+a",
+        "a-Z",
+        "A+z",
+        "a-",
+        "a+",
+        // Multiple chars Unicode
+        "£৸𐄬",
+        // ASCII + Unicode, starting with ASCII
+        "A£",
+        "A৸",
+        "A𐄬",
+        // ASCII + Unicode, starting with Unicode
+        "£A",
+        "৸A",
+        "𐄬A",
+    ];
+
+    for str in cases {
+        assert!(!is_identifier_name(str));
+    }
 }