Skip to content

Commit

Permalink
Merge pull request #68 from unicode-rs/unicode-11
Browse files Browse the repository at this point in the history
Update to Unicode 11
  • Loading branch information
Manishearth committed Oct 30, 2019
2 parents 7be58ca + df71866 commit b159d9e
Show file tree
Hide file tree
Showing 8 changed files with 2,284 additions and 2,394 deletions.
37 changes: 26 additions & 11 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,21 @@
# Surrogate code points (U+D800..U+DFFF) cannot be represented as Rust `char`s.
surrogate_codepoints = (0xd800, 0xdfff)

# The Unicode version the generated tables target.
UNICODE_VERSION = (11, 0, 0)

# Dotted-string form of the version (e.g. "11.0.0"), used to build download URLs.
UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION

def is_surrogate(n):
    """Return True when codepoint n lies in the UTF-16 surrogate range."""
    low, high = surrogate_codepoints
    return low <= n <= high

def fetch(f):
    """Download Unicode data file `f` into the current directory if absent.

    Emoji data lives under /Public/emoji/<major>.<minor>/, everything else
    under /Public/<version>/ucd/.  Writes an error to stderr (and the caller
    is expected to abort) when the download did not produce the file.

    NOTE(review): the scraped diff also retained the pre-commit hunk line
    `curl -O http://www.unicode.org/Public/10.0.0/ucd/%s`, which would have
    unconditionally fetched stale Unicode 10.0.0 data; this reconstruction
    keeps only the added, version-parameterized logic.
    """
    if not os.path.exists(os.path.basename(f)):
        if "emoji" in f:
            # Emoji data files are published per major.minor, not major.minor.patch.
            os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
                      % (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
        else:
            os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
                      % (UNICODE_VERSION_NUMBER, f))

    if not os.path.exists(os.path.basename(f)):
        sys.stderr.write("cannot load %s" % f)
Expand Down Expand Up @@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
pub use self::%sCat::*;
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, PartialEq, Eq)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum %sCat {
""" % (name, Name, Name))

Expand Down Expand Up @@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
with open(r, "w") as rf:
# write the file's preamble
rf.write(preamble)

# download and parse all the data
fetch("ReadMe.txt")
with open("ReadMe.txt") as readme:
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
unicode_version = re.search(pattern, readme.read()).groups()
rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-segmentation is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % unicode_version)
""" % UNICODE_VERSION)

# download and parse all the data
gencats = load_gencats("UnicodeData.txt")
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])

Expand All @@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
grapheme_table = []
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
grapheme_table.sort(key=lambda w: w[0])
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
# Sanity check: the merged grapheme table must contain non-overlapping,
# strictly increasing ranges; an overlap means the Extended_Pictographic
# entries collide with the GraphemeBreakProperty entries and must be
# stored separately instead of merged.
last = -1
for chars in grapheme_table:
    if chars[0] <= last:
        # BUG FIX: `raise "<string>"` is a TypeError in Python 3 (string
        # exceptions are not allowed); raise a real Exception so the
        # diagnostic message actually reaches the user.
        raise Exception("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
    last = chars[1]
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
rf.write("\n")

word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
Expand All @@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, list(word_cats.keys()), "word")

# There are some emoji which are also ALetter, so this needs to be stored separately
# For efficiency, we could still merge the two tables and produce an ALetterEP state
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
Expand Down
6 changes: 3 additions & 3 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def create_grapheme_data(f):
stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)

Expand All @@ -187,7 +187,7 @@ def create_words_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
Expand All @@ -201,7 +201,7 @@ def create_sentence_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

if __name__ == "__main__":
Expand Down
23 changes: 13 additions & 10 deletions src/grapheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,8 @@ enum GraphemeState {
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
Regional,
// The codepoint after is in the E_Modifier category, so whether it's a boundary
// depends on pre-context according to GB10.
// The codepoint after is Extended_Pictographic,
// so whether it's a boundary depends on pre-context according to GB11.
Emoji,
}

Expand Down Expand Up @@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
(GC_Extend, GC_E_Modifier) => Emoji, // GB10
(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
(GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
}
Expand Down Expand Up @@ -415,10 +411,17 @@ impl GraphemeCursor {

fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
use tables::grapheme as gr;
for ch in chunk.chars().rev() {
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
if gr::grapheme_category(ch) != gr::GC_ZWJ {
self.decide(true);
return;
}
}
for ch in iter {
match gr::grapheme_category(ch) {
gr::GC_Extend => (),
gr::GC_E_Base | gr::GC_E_Base_GAZ => {
gr::GC_Extended_Pictographic => {
self.decide(false);
return;
}
Expand Down Expand Up @@ -484,7 +487,7 @@ impl GraphemeCursor {
let mut need_pre_context = true;
match self.cat_after.unwrap() {
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
//!
//! let s = "The quick (\"brown\") fox";
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//! assert_eq!(w, b);
//! }
//! ```
Expand Down Expand Up @@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
Expand Down
Loading

0 comments on commit b159d9e

Please sign in to comment.