Skip to content

Commit

Permalink
Use Unicode XID_(Start|Continue) for identifiers
Browse files Browse the repository at this point in the history
closes #121
  • Loading branch information
sharkdp committed Jul 23, 2023
1 parent b2be7f0 commit 2e95113
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 11 deletions.
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 27 additions & 3 deletions modules/math/constants.nbt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,31 @@ let quintillion = 10^18

let googol = 10^100

### Misc
### Fractions

let ½ = 1/2
let ¼ = 1/4
let ½ = 1 / 2

let ⅓ = 1 / 3
let ⅔ = 2 / 3

let ¼ = 1 / 4
let ¾ = 3 / 4

let ⅕ = 1 / 5
let ⅖ = 2 / 5
let ⅗ = 3 / 5
let ⅘ = 4 / 5

let ⅙ = 1 / 6
let ⅚ = 5 / 6

let ⅐ = 1 / 7

let ⅛ = 1 / 8
let ⅜ = 3 / 8
let ⅝ = 5 / 8
let ⅞ = 7 / 8

let ⅑ = 1 / 9

let ⅒ = 1 / 10
1 change: 1 addition & 0 deletions numbat/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ strsim = "0.10.0"
pretty_dtoa = "0.3"
numbat-exchange-rates = { version = "0.1.0", path = "../numbat-exchange-rates" }
heck = { version = "0.4.1", features = ["unicode"] }
unicode-ident = "1.0.11"

[dev-dependencies]
approx = "0.5"
Expand Down
44 changes: 38 additions & 6 deletions numbat/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,15 +114,45 @@ fn is_exponent_char(c: char) -> bool {
matches!(c, '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹')
}

/// Returns `true` if `c` is one of the precomposed "vulgar fraction" glyphs.
///
/// The accepted characters form two contiguous code point runs:
///
/// * U+00BC..=U+00BE: `¼ ½ ¾`
/// * U+2150..=U+215E: `⅐ ⅑ ⅒ ⅓ ⅔ ⅕ ⅖ ⅗ ⅘ ⅙ ⅚ ⅛ ⅜ ⅝ ⅞`
///
/// Neighbouring code points such as `⅟` (U+215F) are *not* accepted.
fn is_numerical_fraction_char(c: char) -> bool {
    matches!(c, '\u{00BC}'..='\u{00BE}' | '\u{2150}'..='\u{215E}')
}

/// Returns `true` if `c` is treated as a currency symbol by the tokenizer.
///
/// Accepted characters are every code point in U+20A0..=U+20CF (the Unicode
/// "Currency Symbols" block, see
/// https://en.wikipedia.org/wiki/Currency_Symbols_(Unicode_block)) plus the
/// four symbols that live outside that block: `£`, `¥`, `$` and `฿`.
fn is_currency_char(c: char) -> bool {
    matches!(c, '\u{20A0}'..='\u{20CF}' | '£' | '¥' | '$' | '฿')
}

fn is_identifier_char(c: char) -> bool {
(c.is_alphanumeric() || c == '_' || is_currency_char(c)) && !is_exponent_char(c)
/// Returns `true` if `c` may be the first character of an identifier.
///
/// An identifier may begin with an underscore, with any character that
/// `unicode_ident::is_xid_start` accepts, with a precomposed fraction glyph
/// (see `is_numerical_fraction_char`), or with a currency symbol (see
/// `is_currency_char`).
fn is_identifier_start(c: char) -> bool {
    match c {
        '_' => true,
        _ => {
            unicode_ident::is_xid_start(c)
                || is_numerical_fraction_char(c)
                || is_currency_char(c)
        }
    }
}

/// Returns `true` if `c` may appear after the first character of an identifier.
///
/// A character qualifies when `unicode_ident::is_xid_continue` accepts it or it
/// is a currency symbol (see `is_currency_char`), with two explicit exclusions
/// that always win: superscript exponent characters (see `is_exponent_char`)
/// and the middle dot `·`.
fn is_identifier_continue(c: char) -> bool {
    // Exclusions take precedence over everything else.
    // NOTE(review): '·' is presumably reserved for use as an operator — confirm.
    if c == '·' || is_exponent_char(c) {
        return false;
    }

    unicode_ident::is_xid_continue(c) || is_currency_char(c)
}

impl Tokenizer {
Expand Down Expand Up @@ -294,7 +324,7 @@ impl Tokenizer {
if !has_advanced
|| self
.peek()
.map(|c| is_identifier_char(c) || c == '.')
.map(|c| is_identifier_continue(c) || c == '.')
.unwrap_or(false)
{
return tokenizer_error(
Expand Down Expand Up @@ -358,7 +388,9 @@ impl Tokenizer {
);
}
}
'¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹' => TokenKind::UnicodeExponent,
'¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹' => {
TokenKind::UnicodeExponent
}
'°' => TokenKind::Identifier, // '°' is not an alphanumeric character, so we treat it as a special case here
'"' => {
while self.peek().map(|c| c != '"').unwrap_or(false) {
Expand All @@ -379,8 +411,8 @@ impl Tokenizer {
}
}
'…' => TokenKind::Ellipsis,
c if is_identifier_char(c) => {
while self.peek().map(is_identifier_char).unwrap_or(false) {
c if is_identifier_start(c) => {
while self.peek().map(is_identifier_continue).unwrap_or(false) {
self.advance();
}

Expand Down

0 comments on commit 2e95113

Please sign in to comment.