Skip to content

Commit

Permalink
Merge pull request #68 from unicode-rs/unicode-11
Browse files Browse the repository at this point in the history
Update to Unicode 11
  • Loading branch information
Manishearth committed Oct 30, 2019
2 parents 7be58ca + df71866 commit b159d9e
Show file tree
Hide file tree
Showing 8 changed files with 2,284 additions and 2,394 deletions.
37 changes: 26 additions & 11 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,21 @@
# Surrogate code points (U+D800..U+DFFF) cannot be represented as Rust `char`s.
surrogate_codepoints = (0xd800, 0xdfff)

# The Unicode version the generated tables target.
UNICODE_VERSION = (11, 0, 0)

# Dotted-string form of the version (e.g. "11.0.0"), used to build download URLs.
UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION

def is_surrogate(n):
    """Return True when codepoint n lies in the UTF-16 surrogate range."""
    low, high = surrogate_codepoints
    return low <= n <= high

def fetch(f):
    """Download Unicode data file `f` into the current directory if absent.

    Emoji data lives under /Public/emoji/<major>.<minor>/, everything else
    under /Public/<version>/ucd/.  Writes an error to stderr (and the caller
    is expected to abort) when the download did not produce the file.

    NOTE(review): the scraped diff also retained the pre-commit hunk line
    `curl -O http://www.unicode.org/Public/10.0.0/ucd/%s`, which would have
    unconditionally fetched stale Unicode 10.0.0 data; this reconstruction
    keeps only the added, version-parameterized logic.
    """
    if not os.path.exists(os.path.basename(f)):
        if "emoji" in f:
            # Emoji data files are published per major.minor, not major.minor.patch.
            os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
                      % (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
        else:
            os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
                      % (UNICODE_VERSION_NUMBER, f))

    if not os.path.exists(os.path.basename(f)):
        sys.stderr.write("cannot load %s" % f)
Expand Down Expand Up @@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
pub use self::%sCat::*;
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, PartialEq, Eq)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum %sCat {
""" % (name, Name, Name))

Expand Down Expand Up @@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
with open(r, "w") as rf:
# write the file's preamble
rf.write(preamble)

# download and parse all the data
fetch("ReadMe.txt")
with open("ReadMe.txt") as readme:
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
unicode_version = re.search(pattern, readme.read()).groups()
rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-segmentation is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % unicode_version)
""" % UNICODE_VERSION)

# download and parse all the data
gencats = load_gencats("UnicodeData.txt")
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])

Expand All @@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
grapheme_table = []
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
grapheme_table.sort(key=lambda w: w[0])
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
# Sanity check: the merged grapheme table must contain non-overlapping,
# strictly increasing ranges; an overlap means the Extended_Pictographic
# entries collide with the GraphemeBreakProperty entries and must be
# stored separately instead of merged.
last = -1
for chars in grapheme_table:
    if chars[0] <= last:
        # BUG FIX: `raise "<string>"` is a TypeError in Python 3 (string
        # exceptions are not allowed); raise a real Exception so the
        # diagnostic message actually reaches the user.
        raise Exception("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
    last = chars[1]
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
rf.write("\n")

word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
Expand All @@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, list(word_cats.keys()), "word")

# There are some emoji which are also ALetter, so this needs to be stored separately
# For efficiency, we could still merge the two tables and produce an ALetterEP state
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
Expand Down
6 changes: 3 additions & 3 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def create_grapheme_data(f):
stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)

Expand All @@ -187,7 +187,7 @@ def create_words_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
Expand All @@ -201,7 +201,7 @@ def create_sentence_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

if __name__ == "__main__":
Expand Down
23 changes: 13 additions & 10 deletions src/grapheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,8 @@ enum GraphemeState {
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
Regional,
// The codepoint after is in the E_Modifier category, so whether it's a boundary
// depends on pre-context according to GB10.
// The codepoint after is Extended_Pictographic,
// so whether it's a boundary depends on pre-context according to GB11.
Emoji,
}

Expand Down Expand Up @@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
(GC_Extend, GC_E_Modifier) => Emoji, // GB10
(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
(GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
}
Expand Down Expand Up @@ -415,10 +411,17 @@ impl GraphemeCursor {

fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
use tables::grapheme as gr;
for ch in chunk.chars().rev() {
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
if gr::grapheme_category(ch) != gr::GC_ZWJ {
self.decide(true);
return;
}
}
for ch in iter {
match gr::grapheme_category(ch) {
gr::GC_Extend => (),
gr::GC_E_Base | gr::GC_E_Base_GAZ => {
gr::GC_Extended_Pictographic => {
self.decide(false);
return;
}
Expand Down Expand Up @@ -484,7 +487,7 @@ impl GraphemeCursor {
let mut need_pre_context = true;
match self.cat_after.unwrap() {
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
//!
//! let s = "The quick (\"brown\") fox";
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//! assert_eq!(w, b);
//! }
//! ```
Expand Down Expand Up @@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
Expand Down
Loading

0 comments on commit b159d9e

Please sign in to comment.