Renamed UnicodeSegmentTokenizer to UnicodeWordTokenizer. (#75)
rth committed Jun 13, 2020
1 parent 172838c commit 727712c
Showing 13 changed files with 49 additions and 50 deletions.
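For users of the Python package, only the class name changes; the constructor argument and the `tokenize` method behave exactly as before. A minimal migration sketch, based on the tests updated in this commit:

```python
# The class was renamed; its behaviour is unchanged.
# Old name (removed in this commit): UnicodeSegmentTokenizer
from vtext.tokenize import UnicodeWordTokenizer

# Same constructor argument and output as exercised in
# python/vtext/tests/test_tokenize.py below.
tokenizer = UnicodeWordTokenizer(word_bounds=True)
assert tokenizer.tokenize("Today, tomorrow") == ["Today", ",", "tomorrow"]

tokenizer = UnicodeWordTokenizer(word_bounds=False)
assert tokenizer.tokenize("Today, tomorrow") == ["Today", "tomorrow"]
```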
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Added Python 3.8 wheel generation [#65](https://github.com/rth/vtext/pull/65)
- Tokenizers can now be pickled in Python [#73](https://github.com/rth/vtext/pull/73)
- Only Python 3.6+ is now supported in the Python package.
+- Renamed `UnicodeSegmentTokenizer` to `UnicodeWordTokenizer`.

### Contributors

2 changes: 0 additions & 2 deletions README.md
@@ -10,8 +10,6 @@ NLP in Rust with Python bindings
This package aims to provide a high performance toolkit for ingesting textual data for
machine learning applications.

-The API is currently unstable.
-
### Features

- Tokenization: Regexp tokenizer, Unicode segmentation + language specific rules
10 changes: 5 additions & 5 deletions benchmarks/bench_tokenizers.py
@@ -4,7 +4,7 @@
import re

from vtext.tokenize import RegexpTokenizer
-from vtext.tokenize import UnicodeSegmentTokenizer
+from vtext.tokenize import UnicodeWordTokenizer
from vtext.tokenize import VTextTokenizer
from vtext.tokenize import CharacterTokenizer

@@ -50,12 +50,12 @@ def pyre_tokenizer(txt):
RegexpTokenizer(pattern=token_regexp).tokenize,
),
(
"UnicodeSegmentTokenizer(word_bounds=False)",
UnicodeSegmentTokenizer(word_bounds=False).tokenize,
"UnicodeWordTokenizer(word_bounds=False)",
UnicodeWordTokenizer(word_bounds=False).tokenize,
),
(
"UnicodeSegmentTokenizer(word_bounds=True)",
UnicodeSegmentTokenizer(word_bounds=True).tokenize,
"UnicodeWordTokenizer(word_bounds=True)",
UnicodeWordTokenizer(word_bounds=True).tokenize,
),
("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
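The benchmark registers each tokenizer as a `(label, callable)` pair. As an illustration only (the full benchmark harness and corpus are outside the hunks shown here), one of these callables can be timed with the standard library:

```python
from timeit import timeit

from vtext.tokenize import UnicodeWordTokenizer

# Illustrative timing of the renamed tokenizer on a small snippet;
# the actual benchmark script uses its own corpus and reporting.
tokenize = UnicodeWordTokenizer(word_bounds=True).tokenize
elapsed = timeit(lambda: tokenize("Today, tomorrow"), number=10_000)
print(f"UnicodeWordTokenizer(word_bounds=True): {elapsed:.4f}s for 10,000 calls")
```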
2 changes: 1 addition & 1 deletion doc/python-api.rst
@@ -21,7 +21,7 @@ vtext.tokenize
:toctree: generated/

tokenize.RegexpTokenizer
-tokenize.UnicodeSegmentTokenizer
+tokenize.UnicodeWordTokenizer
tokenize.VTextTokenizer
tokenize.CharacterTokenizer

4 changes: 2 additions & 2 deletions evaluation/eval_tokenization.py
@@ -7,7 +7,7 @@
import pandas as pd
import numpy as np

-from vtext.tokenize import UnicodeSegmentTokenizer, VTextTokenizer
+from vtext.tokenize import UnicodeWordTokenizer, VTextTokenizer

try:
import sacremoses
@@ -70,7 +70,7 @@ def whitespace_split(x):
("regexp", lambda lang: re.compile(r"\b\w\w+\b").findall),
(
"unicode-segmentation",
-lambda lang: UnicodeSegmentTokenizer(word_bounds=True).tokenize,
+lambda lang: UnicodeWordTokenizer(word_bounds=True).tokenize,
),
("vtext", lambda lang: VTextTokenizer(lang).tokenize),
]
2 changes: 1 addition & 1 deletion python/src/lib.rs
@@ -184,7 +184,7 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<vectorize::_HashingVectorizerWrapper>()?;
m.add_class::<vectorize::_CountVectorizerWrapper>()?;
m.add_class::<tokenize::BaseTokenizer>()?;
-m.add_class::<tokenize::UnicodeSegmentTokenizer>()?;
+m.add_class::<tokenize::UnicodeWordTokenizer>()?;
m.add_class::<tokenize_sentence::UnicodeSentenceTokenizer>()?;
m.add_class::<tokenize_sentence::PunctuationTokenizer>()?;
m.add_class::<tokenize::RegexpTokenizer>()?;
14 changes: 7 additions & 7 deletions python/src/tokenize.rs
@@ -33,22 +33,22 @@ impl BaseTokenizer {
/// ----------
/// - `Unicode® Standard Annex #29 <http://www.unicode.org/reports/tr29/>`_
#[pyclass(extends=BaseTokenizer, module="vtext.tokenize")]
-pub struct UnicodeSegmentTokenizer {
-inner: vtext::tokenize::UnicodeSegmentTokenizer,
+pub struct UnicodeWordTokenizer {
+inner: vtext::tokenize::UnicodeWordTokenizer,
}

#[pymethods]
-impl UnicodeSegmentTokenizer {
+impl UnicodeWordTokenizer {
#[new]
#[args(word_bounds = true)]
fn new(word_bounds: bool) -> (Self, BaseTokenizer) {
-let tokenizer = vtext::tokenize::UnicodeSegmentTokenizerParams::default()
+let tokenizer = vtext::tokenize::UnicodeWordTokenizerParams::default()
.word_bounds(word_bounds)
.build()
.unwrap();

(
-UnicodeSegmentTokenizer { inner: tokenizer },
+UnicodeWordTokenizer { inner: tokenizer },
BaseTokenizer::new(),
)
}
@@ -80,7 +80,7 @@ impl UnicodeSegmentTokenizer {
/// -------
/// params : mapping of string to any
/// Parameter names mapped to their values.
-fn get_params(&self) -> PyResult<UnicodeSegmentTokenizerParams> {
+fn get_params(&self) -> PyResult<UnicodeWordTokenizerParams> {
Ok(self.inner.params.clone())
}

@@ -89,7 +89,7 @@
}

pub fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-let mut params: UnicodeSegmentTokenizerParams = deserialize_params(py, state)?;
+let mut params: UnicodeWordTokenizerParams = deserialize_params(py, state)?;
self.inner = params.build().unwrap();
Ok(())
}
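The `get_params` and `__setstate__` bindings above are what back pickling of tokenizers on the Python side (added in #73 according to the changelog). A short sketch of the expected round trip with the renamed class, assuming pickling behaves as announced:

```python
import pickle

from vtext.tokenize import UnicodeWordTokenizer

tokenizer = UnicodeWordTokenizer(word_bounds=True)

# Round trip through pickle; the restored tokenizer should behave identically.
restored = pickle.loads(pickle.dumps(tokenizer))
assert restored.tokenize("Today, tomorrow") == tokenizer.tokenize("Today, tomorrow")
```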
4 changes: 2 additions & 2 deletions python/vtext/tests/test_common.py
@@ -9,7 +9,7 @@
from vtext.tokenize import (
CharacterTokenizer,
RegexpTokenizer,
-UnicodeSegmentTokenizer,
+UnicodeWordTokenizer,
VTextTokenizer,
)
from vtext.tokenize_sentence import UnicodeSentenceTokenizer, PunctuationTokenizer
@@ -19,7 +19,7 @@
TOKENIZERS = [
CharacterTokenizer,
RegexpTokenizer,
-UnicodeSegmentTokenizer,
+UnicodeWordTokenizer,
VTextTokenizer,
]

16 changes: 8 additions & 8 deletions python/vtext/tests/test_tokenize.py
@@ -9,7 +9,7 @@
import hypothesis.strategies as st

from vtext.tokenize import (
-UnicodeSegmentTokenizer,
+UnicodeWordTokenizer,
RegexpTokenizer,
CharacterTokenizer,
VTextTokenizer,
@@ -19,7 +19,7 @@
TOKENIZERS = [
RegexpTokenizer,
CharacterTokenizer,
-UnicodeSegmentTokenizer,
+UnicodeWordTokenizer,
VTextTokenizer,
]

@@ -31,17 +31,17 @@ def _pytest_ids(x):

def test_unicode_segment_tokenize():

-tokenizer = UnicodeSegmentTokenizer(word_bounds=False)
+tokenizer = UnicodeWordTokenizer(word_bounds=False)
assert tokenizer.tokenize("Today, tomorrow") == ["Today", "tomorrow"]

-tokenizer = UnicodeSegmentTokenizer(word_bounds=True)
+tokenizer = UnicodeWordTokenizer(word_bounds=True)
assert tokenizer.tokenize("Today, tomorrow") == ["Today", ",", "tomorrow"]

with pytest.raises(TypeError):
-UnicodeSegmentTokenizer(word_bounds=1)
+UnicodeWordTokenizer(word_bounds=1)

with pytest.raises(TypeError):
-UnicodeSegmentTokenizer().tokenize(2)
+UnicodeWordTokenizer().tokenize(2)


def test_regexp_tokenize():
@@ -72,7 +72,7 @@ def test_character_tokenizer():
[
RegexpTokenizer(),
CharacterTokenizer(),
-UnicodeSegmentTokenizer(),
+UnicodeWordTokenizer(),
VTextTokenizer("en"),
VTextTokenizer("fr"),
],
@@ -87,7 +87,7 @@ def test_tokenize_edge_cases(tokenizer, txt):
[
(RegexpTokenizer(), {"pattern": r"\b\w\w+\b"}),
(CharacterTokenizer(), {"window_size": 4}),
-(UnicodeSegmentTokenizer(), {"word_bounds": True}),
+(UnicodeWordTokenizer(), {"word_bounds": True}),
(VTextTokenizer("en"), {"lang": "en"}),
(VTextTokenizer("fr"), {"lang": "fr"}),
],
4 changes: 2 additions & 2 deletions python/vtext/tokenize.py
@@ -5,15 +5,15 @@
# modified, or distributed except according to those terms.

from ._lib import BaseTokenizer
-from ._lib import UnicodeSegmentTokenizer
+from ._lib import UnicodeWordTokenizer
from ._lib import RegexpTokenizer
from ._lib import VTextTokenizer
from ._lib import CharacterTokenizer


__all__ = [
"BaseTokenizer",
"UnicodeSegmentTokenizer",
"UnicodeWordTokenizer",
"RegexpTokenizer",
"VTextTokenizer",
"CharacterTokenizer",
32 changes: 16 additions & 16 deletions src/tokenize/mod.rs
@@ -29,11 +29,11 @@ which would remove all punctuation. A more general approach is to apply unicode
```rust
# let s = "The “brown” fox can't jump 32.3 feet, right?";
# use vtext::tokenize::*;
-let tokenizer = UnicodeSegmentTokenizer::default();
+let tokenizer = UnicodeWordTokenizer::default();
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
assert_eq!(tokens, &["The", "“", "brown", "”", "fox", "can't", "jump", "32.3", "feet", ",", "right", "?"]);
```
-Here `UnicodeSegmentTokenizer` object is a thin wrapper around the
+Here `UnicodeWordTokenizer` object is a thin wrapper around the
[unicode-segmentation](https://github.com/unicode-rs/unicode-segmentation) crate.
This approach produces better results, however for instance the word "can't" should be tokenized
@@ -133,43 +133,43 @@ impl fmt::Debug for RegexpTokenizer {
///
/// * [Unicode® Standard Annex #29](http://www.unicode.org/reports/tr29/)
#[derive(Debug, Clone)]
-pub struct UnicodeSegmentTokenizer {
-pub params: UnicodeSegmentTokenizerParams,
+pub struct UnicodeWordTokenizer {
+pub params: UnicodeWordTokenizerParams,
}

/// Builder for the unicode segmentation tokenizer
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))]
-pub struct UnicodeSegmentTokenizerParams {
+pub struct UnicodeWordTokenizerParams {
word_bounds: bool,
}

-impl UnicodeSegmentTokenizerParams {
-pub fn word_bounds(&mut self, value: bool) -> UnicodeSegmentTokenizerParams {
+impl UnicodeWordTokenizerParams {
+pub fn word_bounds(&mut self, value: bool) -> UnicodeWordTokenizerParams {
self.word_bounds = value;
self.clone()
}
-pub fn build(&mut self) -> Result<UnicodeSegmentTokenizer, VTextError> {
-Ok(UnicodeSegmentTokenizer {
+pub fn build(&mut self) -> Result<UnicodeWordTokenizer, VTextError> {
+Ok(UnicodeWordTokenizer {
params: self.clone(),
})
}
}

-impl Default for UnicodeSegmentTokenizerParams {
-fn default() -> UnicodeSegmentTokenizerParams {
-UnicodeSegmentTokenizerParams { word_bounds: true }
+impl Default for UnicodeWordTokenizerParams {
+fn default() -> UnicodeWordTokenizerParams {
+UnicodeWordTokenizerParams { word_bounds: true }
}
}

-impl Default for UnicodeSegmentTokenizer {
+impl Default for UnicodeWordTokenizer {
/// Create a new instance
-fn default() -> UnicodeSegmentTokenizer {
-UnicodeSegmentTokenizerParams::default().build().unwrap()
+fn default() -> UnicodeWordTokenizer {
+UnicodeWordTokenizerParams::default().build().unwrap()
}
}

-impl Tokenizer for UnicodeSegmentTokenizer {
+impl Tokenizer for UnicodeWordTokenizer {
/// Tokenize a string
fn tokenize<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
if self.params.word_bounds {
6 changes: 3 additions & 3 deletions src/tokenize/tests.rs
@@ -20,7 +20,7 @@ fn test_regexp_tokenizer() {
fn test_unicode_tokenizer() {
let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";

-let tokenizer = UnicodeSegmentTokenizerParams::default()
+let tokenizer = UnicodeWordTokenizerParams::default()
.word_bounds(false)
.build()
.unwrap();
@@ -30,7 +30,7 @@ fn test_unicode_tokenizer() {
];
assert_eq!(tokens, b);

-let tokenizer = UnicodeSegmentTokenizer::default();
+let tokenizer = UnicodeWordTokenizer::default();
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
let b: &[_] = &[
"The", "quick", "(", "\"", "brown", "\"", ")", "fox", "can't", "jump", "32.3", "feet", ",",
@@ -121,6 +121,6 @@ fn test_character_tokenizer() {

#[test]
fn test_tokenizer_defaults() {
-let tokenizer = UnicodeSegmentTokenizer::default();
+let tokenizer = UnicodeWordTokenizer::default();
assert_eq!(tokenizer.params.word_bounds, true);
}
2 changes: 1 addition & 1 deletion src/vectorize/tests.rs
@@ -151,7 +151,7 @@ fn test_dispatch_tokenizer() {
.build()
.unwrap();

-let tokenizer = UnicodeSegmentTokenizerParams::default()
+let tokenizer = UnicodeWordTokenizerParams::default()
.word_bounds(false)
.build()
.unwrap();
