Skip to content

Commit

Permalink
perf(syntax): optimize is_identifier_name (#5425)
Browse files Browse the repository at this point in the history
Optimize `oxc_syntax::identifier::is_identifier_name`. Add a fast path for ASCII, which will be the common case. Only fall back to iterating over `char`s and using the more expensive test functions e.g. `is_identifier_start_unicode` if non-ASCII chars are found.
  • Loading branch information
overlookmotel committed Sep 3, 2024
1 parent 3ccb065 commit aeda84f
Showing 1 changed file with 132 additions and 2 deletions.
134 changes: 132 additions & 2 deletions crates/oxc_syntax/src/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,137 @@ pub fn is_identifier_part_unicode(c: char) -> bool {
is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
}

/// Determine if a string is a valid JS identifier.
#[allow(clippy::missing_panics_doc)]
pub fn is_identifier_name(name: &str) -> bool {
let mut chars = name.chars();
chars.next().is_some_and(is_identifier_start) && chars.all(is_identifier_part)
// This function contains a fast path for ASCII (common case), iterating over bytes and using
// the cheap `is_identifier_start_ascii` and `is_identifier_part_ascii` to test bytes.
// Only if a Unicode char is found, fall back to iterating over `char`s, and using the more
// expensive `is_identifier_start_unicode` and `is_identifier_part`.

// Get first byte. Exit if empty string.
let bytes = name.as_bytes();
let Some(&first_byte) = bytes.first() else { return false };

let mut chars = if first_byte.is_ascii() {
// First byte is ASCII
if !is_identifier_start_ascii(first_byte as char) {
return false;
}

// `'outer` loop never actually loops - only here to allow breaking out of it when Unicode found
#[allow(clippy::never_loop)]
let index = 'outer: loop {
for (index, &b) in bytes[1..].iter().enumerate() {
if b.is_ascii() {
if !is_identifier_part_ascii(b as char) {
return false;
}
} else {
// Unicode byte found
break 'outer index;
}
}
// We got to end without finding any non-identifier of Unicode characters
return true;
};

// Unicode byte found - search rest of string (from this byte onwards) as Unicode.
// `index + 1` because `index` returned from the loop is relative to start of `bytes[1..]`.
name[index + 1..].chars()
} else {
// First char is Unicode.
// NB: `unwrap()` cannot fail because we already checked the string is not empty.
let mut chars = name.chars();
let first_char = chars.next().unwrap();
if !is_identifier_start_unicode(first_char) {
return false;
}
// Search rest of string as Unicode
chars
};

// A Unicode char was found - search rest of string as Unicode
chars.all(is_identifier_part)
}

#[test]
fn is_identifier_name_true() {
let cases = [
// 1 char ASCII
"a",
"z",
"A",
"Z",
"_",
"$",
// 1 char Unicode
"µ", // 2 bytes
"ख", // 3 bytes
"𐀀", // 4 bytes
// Multiple chars ASCII
"az",
"AZ",
"_a",
"$Z",
"a0",
"A9",
"_0",
"$9",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$",
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_$",
"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789$",
"$abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_",
// Multiple chars Unicode
"µख𐀀",
// ASCII + Unicode, starting with ASCII
"AµBखC𐀀D",
// ASCII + Unicode, starting with Unicode
"µAखB𐀀",
];

for str in cases {
assert!(is_identifier_name(str));
}
}

#[test]
fn is_identifier_name_false() {
let cases = [
// Empty string
"",
// 1 char ASCII
"0",
"9",
"-",
"~",
"+",
// 1 char Unicode
"£", // 2 bytes
"৸", // 3 bytes
"𐄬", // 4 bytes
// Multiple chars ASCII
"0a",
"9a",
"-a",
"+a",
"a-Z",
"A+z",
"a-",
"a+",
// Multiple chars Unicode
"£৸𐄬",
// ASCII + Unicode, starting with ASCII
"A£",
"A৸",
"A𐄬",
// ASCII + Unicode, starting with Unicode
"£A",
"৸A",
"𐄬A",
];

for str in cases {
assert!(!is_identifier_name(str));
}
}

0 comments on commit aeda84f

Please sign in to comment.