Use Unicode XID_(Start|Continue) for identifiers

closes #121
sharkdp · Jul 23, 2023 · 2e95113 · 2e95113
1 parent b2be7f0
commit 2e95113
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 11 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/modules/math/constants.nbt b/modules/math/constants.nbt
@@ -22,7 +22,31 @@ let quintillion =  10^18
 
 let googol =  10^100
 
-### Misc
+### Fractions
 
-let ½ = 1/2
-let ¼ = 1/4
+let ½ = 1 / 2
+
+let ⅓ = 1 / 3
+let ⅔ = 2 / 3
+
+let ¼ = 1 / 4
+let ¾ = 3 / 4
+
+let ⅕ = 1 / 5
+let ⅖ = 2 / 5
+let ⅗ = 3 / 5
+let ⅘ = 4 / 5
+
+let ⅙ = 1 / 6
+let ⅚ = 5 / 6
+
+let ⅐ = 1 / 7
+
+let ⅛ = 1 / 8
+let ⅜ = 3 / 8
+let ⅝ = 5 / 8
+let ⅞ = 7 / 8
+
+let ⅑ = 1 / 9
+
+let ⅒ = 1 / 10
diff --git a/numbat/Cargo.toml b/numbat/Cargo.toml
@@ -21,6 +21,7 @@ strsim = "0.10.0"
 pretty_dtoa = "0.3"
 numbat-exchange-rates = { version = "0.1.0", path = "../numbat-exchange-rates" }
 heck = { version = "0.4.1", features = ["unicode"] }
+unicode-ident = "1.0.11"
 
 [dev-dependencies]
 approx = "0.5"

diff --git a/numbat/src/tokenizer.rs b/numbat/src/tokenizer.rs
@@ -114,15 +114,45 @@ fn is_exponent_char(c: char) -> bool {
     matches!(c, '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹')
 }
 
+/// Returns `true` if `c` is one of the single-character fractions enumerated
+/// in the `matches!` below (for example `½`, `⅓` or `⅞`).
+///
+/// `is_identifier_start` accepts these characters as the first character of
+/// an identifier.
+fn is_numerical_fraction_char(c: char) -> bool {
+    matches!(
+        c,
+        '¼' | '½'
+            | '¾'
+            | '⅐'
+            | '⅑'
+            | '⅒'
+            | '⅓'
+            | '⅔'
+            | '⅕'
+            | '⅖'
+            | '⅗'
+            | '⅘'
+            | '⅙'
+            | '⅚'
+            | '⅛'
+            | '⅜'
+            | '⅝'
+            | '⅞'
+    )
+}
+
 /// Returns `true` for the characters this tokenizer treats as currency symbols.
 fn is_currency_char(c: char) -> bool {
     // U+20A0..=U+20CF is the range described at
     // https://en.wikipedia.org/wiki/Currency_Symbols_(Unicode_block)
     matches!(c, '\u{20A0}'..='\u{20CF}' | '£' | '¥' | '$' | '฿')
 }
 
-fn is_identifier_char(c: char) -> bool {
-    (c.is_alphanumeric() || c == '_' || is_currency_char(c)) && !is_exponent_char(c)
+/// Returns `true` if `c` is accepted as the first character of an identifier.
+///
+/// A character qualifies when at least one of the following holds:
+/// - `unicode_ident::is_xid_start` reports it as `XID_Start`,
+/// - it is a numerical fraction character (`is_numerical_fraction_char`),
+/// - it is a currency symbol (`is_currency_char`),
+/// - it is the underscore `_`.
+fn is_identifier_start(c: char) -> bool {
+    unicode_ident::is_xid_start(c)
+        || is_numerical_fraction_char(c)
+        || is_currency_char(c)
+        || c == '_'
+}
+
+/// Returns `true` if `c` is accepted after the first character of an identifier.
+///
+/// `XID_Continue` characters (per `unicode_ident::is_xid_continue`) and currency
+/// symbols qualify. Characters matched by `is_exponent_char` and the middle dot
+/// `·` are always rejected, even if one of the first two checks accepts them.
+///
+/// NOTE(review): `·` is presumably excluded because it is tokenized as an
+/// operator elsewhere in this file; confirm against the rest of the tokenizer.
+fn is_identifier_continue(c: char) -> bool {
+    (unicode_ident::is_xid_continue(c) || is_currency_char(c)) && !is_exponent_char(c) && c != '·'
+}
 
 impl Tokenizer {
@@ -294,7 +324,7 @@ impl Tokenizer {
                 if !has_advanced
                     || self
                         .peek()
-                        .map(|c| is_identifier_char(c) || c == '.')
+                        .map(|c| is_identifier_continue(c) || c == '.')
                         .unwrap_or(false)
                 {
                     return tokenizer_error(
@@ -358,7 +388,9 @@ impl Tokenizer {
                     );
                 }
             }
-            '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹' => TokenKind::UnicodeExponent,
+            '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹' => {
+                TokenKind::UnicodeExponent
+            }
             '°' => TokenKind::Identifier, // '°' is not an alphanumeric character, so we treat it as a special case here
             '"' => {
                 while self.peek().map(|c| c != '"').unwrap_or(false) {
@@ -379,8 +411,8 @@ impl Tokenizer {
                 }
             }
             '…' => TokenKind::Ellipsis,
-            c if is_identifier_char(c) => {
-                while self.peek().map(is_identifier_char).unwrap_or(false) {
+            c if is_identifier_start(c) => {
+                while self.peek().map(is_identifier_continue).unwrap_or(false) {
                     self.advance();
                 }