Skip to content

Commit

Permalink
Treat ambiguous Modifier_Letters as narrow (#63)
Browse files Browse the repository at this point in the history
* Treat ambiguous `Modifier_Letter`s as narrow

This matches the behavior of common fonts.

Affects 6 characters:
https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5B%3AEast_Asian_Width%3DAmbiguous%3A%5D-%5B%5B%3AScript%3D%2FLatin%7CGreek%7CCyrillic%2F%3A%5D-%5B%5B%3ABlock%3DNumber+Forms%3A%5D%26%5B%3Asubhead%3DRoman+numerals%3A%5D%5D%5D%5D%26%5B%3AModifier_Letter%3A%5D

* Simplify derivation of ambiguous

Use `Letter` general category instead of script and block.
This also changes `ℓ` (U+2113 SCRIPT SMALL L) to narrow, matching common fonts.
  • Loading branch information
Jules-Bertholet committed Jun 17, 2024
1 parent 8e40640 commit 2517d68
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 32 deletions.
17 changes: 2 additions & 15 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
# - NormalizationTest.txt (for tests only)
# - PropList.txt
# - ReadMe.txt
# - Scripts.txt
# - UnicodeData.txt
# - auxiliary/GraphemeBreakProperty.txt
# - emoji/emoji-data.txt
Expand Down Expand Up @@ -430,22 +429,10 @@ def load_east_asian_widths() -> list[EastAsianWidth]:
# Catch any leftover codepoints and assign them implicit Neutral/narrow width.
width_map.append(EastAsianWidth.NARROW)

# Characters from alphabetic scripts are narrow
load_property(
"Scripts.txt",
r"(?:Latin|Greek|Cyrillic)",
lambda cp: (
operator.setitem(width_map, cp, EastAsianWidth.NARROW)
if width_map[cp] == EastAsianWidth.AMBIGUOUS
and not (0x2160 <= cp <= 0x217F) # Roman numerals remain ambiguous
else None
),
)

# Ambiguous `Modifier_Symbol`s are narrow
# Ambiguous `Letter`s and `Modifier_Symbol`s are narrow
load_property(
"extracted/DerivedGeneralCategory.txt",
"Sk",
r"(?:Lu|Ll|Lt|Lm|Lo|Sk)",
lambda cp: (
operator.setitem(width_map, cp, EastAsianWidth.NARROW)
if width_map[cp] == EastAsianWidth.AMBIGUOUS
Expand Down
3 changes: 1 addition & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,7 @@
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
//! - Does not have a [`General_Category`] of `Letter` or `Modifier_Symbol`.
//! 7. All other characters have width 1.
//!
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
Expand Down
24 changes: 9 additions & 15 deletions src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1022,17 +1022,17 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([
],
#[cfg(feature = "cjk")]
[
0x00, 0x9D, 0x02, 0x02, 0x02, 0x9E, 0x9F, 0xA0, 0x02, 0x04, 0x02, 0x05, 0x06, 0x07, 0x08,
0x00, 0x9D, 0x02, 0x02, 0x02, 0x02, 0x9E, 0x9F, 0x02, 0x04, 0x02, 0x05, 0x06, 0x07, 0x08,
0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x02, 0x02, 0x1E, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0x02, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x02, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x02, 0x2A,
0x02, 0x02, 0x02, 0x02,
],
#[cfg(feature = "cjk")]
[
0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0x2E, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE,
0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAF, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xB0, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x2E, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD,
0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAE, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xAF, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
0x39, 0x39, 0x39, 0x39,
],
Expand All @@ -1042,23 +1042,23 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x4C, 0x02, 0x02, 0x02, 0x02, 0x02,
0xB1, 0x4E, 0x4F, 0xB2,
0xB0, 0x4E, 0x4F, 0xB1,
],
#[cfg(feature = "cjk")]
[
0x85, 0x86, 0x75, 0x02, 0x02, 0x87, 0x02, 0x02, 0x02, 0x88, 0x02, 0x02, 0x02, 0x02, 0x02,
0x02, 0x02, 0x89, 0x8A, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0x02, 0x02, 0x8B, 0x8C, 0xB3, 0xB4, 0x8E, 0x02, 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
0x02, 0x02, 0x8B, 0x8C, 0xB2, 0xB3, 0x8E, 0x02, 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
0x96, 0x02, 0x97, 0x02, 0x02, 0x98, 0x99, 0x9A, 0x9B, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0x02, 0x02, 0x02, 0x02,
],
]);

#[cfg(feature = "cjk")]
const WIDTH_LEAVES_LEN: usize = 181;
const WIDTH_LEAVES_LEN: usize = 180;
#[cfg(not(feature = "cjk"))]
const WIDTH_LEAVES_LEN: usize = 157;
/// Autogenerated. 181 sub-table(s). Consult [`lookup_width`] for layout info.
/// Autogenerated. 180 sub-table(s). Consult [`lookup_width`] for layout info.
static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
[
0x55, 0x55, 0x75, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
Expand Down Expand Up @@ -1852,12 +1852,6 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
0x55, 0x55,
],
#[cfg(feature = "cjk")]
[
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55, 0x95, 0xA9, 0x59, 0x56, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
#[cfg(feature = "cjk")]
[
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x55,
Expand All @@ -1883,7 +1877,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
],
#[cfg(feature = "cjk")]
[
0x95, 0x59, 0x59, 0x55, 0x95, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x95, 0x59, 0x59, 0x55, 0x55, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x56, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA,
0x5A, 0x55,
],
Expand Down
8 changes: 8 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,14 @@ fn test_default_ignorable() {
assert_width!('\u{E0000}', Some(0), Some(0));
}

#[test]
fn test_ambiguous() {
// Regression test for the resolution of East_Asian_Width=Ambiguous characters.
// NOTE(review): the two numeric arguments are presumably the expected
// (non-CJK, CJK) widths — confirm against the `assert_width!` definition.

// U+00B7 MIDDLE DOT: ambiguous punctuation, expected to stay wide in the CJK column.
assert_width!("\u{B7}", 1, 2);
// U+0387 GREEK ANO TELEIA: called out explicitly in the crate docs as being
// treated like an ambiguous character.
assert_width!("\u{0387}", 1, 2);
// U+00A8 DIAERESIS: a `Modifier_Symbol`, expected to be narrow in both columns.
assert_width!("\u{A8}", 1, 1);
// U+02C9 MODIFIER LETTER MACRON: a `Letter` (modifier letter), expected to be
// narrow in both columns.
assert_width!("\u{02C9}", 1, 1);
}

#[test]
fn test_jamo() {
assert_width!('\u{1100}', Some(2), Some(2));
Expand Down

0 comments on commit 2517d68

Please sign in to comment.