Renamed UnicodeSegmentTokenizer to UnicodeWordTokenizer. (#75)
rth committed Jun 13, 2020
1 parent 172838c commit 727712c
Showing 13 changed files with 49 additions and 50 deletions.
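For users of the Python package, only the class name changes; the constructor argument and the `tokenize` method behave exactly as before. A minimal migration sketch, based on the tests updated in this commit:

```python
# The class was renamed; its behaviour is unchanged.
# Old name (removed in this commit): UnicodeSegmentTokenizer
from vtext.tokenize import UnicodeWordTokenizer

# Same constructor argument and output as exercised in
# python/vtext/tests/test_tokenize.py below.
tokenizer = UnicodeWordTokenizer(word_bounds=True)
assert tokenizer.tokenize("Today, tomorrow") == ["Today", ",", "tomorrow"]

tokenizer = UnicodeWordTokenizer(word_bounds=False)
assert tokenizer.tokenize("Today, tomorrow") == ["Today", "tomorrow"]
```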
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Added Python 3.8 wheel generation [#65](https://github.com/rth/vtext/pull/65)
- Tokenizers can now be pickled in Python [#73](https://github.com/rth/vtext/pull/73)
- Only Python 3.6+ is now supported in the Python package.
+- Renamed `UnicodeSegmentTokenizer` to `UnicodeWordTokenizer`.

### Contributors

2 changes: 0 additions & 2 deletions README.md
@@ -10,8 +10,6 @@ NLP in Rust with Python bindings
This package aims to provide a high performance toolkit for ingesting textual data for
machine learning applications.

-The API is currently unstable.
-
### Features

- Tokenization: Regexp tokenizer, Unicode segmentation + language specific rules
10 changes: 5 additions & 5 deletions benchmarks/bench_tokenizers.py
@@ -4,7 +4,7 @@
import re

from vtext.tokenize import RegexpTokenizer
-from vtext.tokenize import UnicodeSegmentTokenizer
+from vtext.tokenize import UnicodeWordTokenizer
from vtext.tokenize import VTextTokenizer
from vtext.tokenize import CharacterTokenizer

@@ -50,12 +50,12 @@ def pyre_tokenizer(txt):
RegexpTokenizer(pattern=token_regexp).tokenize,
),
(
"UnicodeSegmentTokenizer(word_bounds=False)",
UnicodeSegmentTokenizer(word_bounds=False).tokenize,
"UnicodeWordTokenizer(word_bounds=False)",
UnicodeWordTokenizer(word_bounds=False).tokenize,
),
(
"UnicodeSegmentTokenizer(word_bounds=True)",
UnicodeSegmentTokenizer(word_bounds=True).tokenize,
"UnicodeWordTokenizer(word_bounds=True)",
UnicodeWordTokenizer(word_bounds=True).tokenize,
),
("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
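The benchmark registers each tokenizer as a `(label, callable)` pair. As an illustration only (the full benchmark harness and corpus are outside the hunks shown here), one of these callables can be timed with the standard library:

```python
from timeit import timeit

from vtext.tokenize import UnicodeWordTokenizer

# Illustrative timing of the renamed tokenizer on a small snippet;
# the actual benchmark script uses its own corpus and reporting.
tokenize = UnicodeWordTokenizer(word_bounds=True).tokenize
elapsed = timeit(lambda: tokenize("Today, tomorrow"), number=10_000)
print(f"UnicodeWordTokenizer(word_bounds=True): {elapsed:.4f}s for 10,000 calls")
```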
2 changes: 1 addition & 1 deletion doc/python-api.rst
@@ -21,7 +21,7 @@ vtext.tokenize
:toctree: generated/

tokenize.RegexpTokenizer
-tokenize.UnicodeSegmentTokenizer
+tokenize.UnicodeWordTokenizer
tokenize.VTextTokenizer
tokenize.CharacterTokenizer

4 changes: 2 additions & 2 deletions evaluation/eval_tokenization.py
@@ -7,7 +7,7 @@
import pandas as pd
import numpy as np

-from vtext.tokenize import UnicodeSegmentTokenizer, VTextTokenizer
+from vtext.tokenize import UnicodeWordTokenizer, VTextTokenizer

try:
import sacremoses
@@ -70,7 +70,7 @@ def whitespace_split(x):
("regexp", lambda lang: re.compile(r"\b\w\w+\b").findall),
(
"unicode-segmentation",
-lambda lang: UnicodeSegmentTokenizer(word_bounds=True).tokenize,
+lambda lang: UnicodeWordTokenizer(word_bounds=True).tokenize,
),
("vtext", lambda lang: VTextTokenizer(lang).tokenize),
]
2 changes: 1 addition & 1 deletion python/src/lib.rs
@@ -184,7 +184,7 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<vectorize::_HashingVectorizerWrapper>()?;
m.add_class::<vectorize::_CountVectorizerWrapper>()?;
m.add_class::<tokenize::BaseTokenizer>()?;
-m.add_class::<tokenize::UnicodeSegmentTokenizer>()?;
+m.add_class::<tokenize::UnicodeWordTokenizer>()?;
m.add_class::<tokenize_sentence::UnicodeSentenceTokenizer>()?;
m.add_class::<tokenize_sentence::PunctuationTokenizer>()?;
m.add_class::<tokenize::RegexpTokenizer>()?;
14 changes: 7 additions & 7 deletions python/src/tokenize.rs
@@ -33,22 +33,22 @@ impl BaseTokenizer {
/// ----------
/// - `Unicode® Standard Annex #29 <http://www.unicode.org/reports/tr29/>`_
#[pyclass(extends=BaseTokenizer, module="vtext.tokenize")]
-pub struct UnicodeSegmentTokenizer {
-inner: vtext::tokenize::UnicodeSegmentTokenizer,
+pub struct UnicodeWordTokenizer {
+inner: vtext::tokenize::UnicodeWordTokenizer,
}

#[pymethods]
-impl UnicodeSegmentTokenizer {
+impl UnicodeWordTokenizer {
#[new]
#[args(word_bounds = true)]
fn new(word_bounds: bool) -> (Self, BaseTokenizer) {
-let tokenizer = vtext::tokenize::UnicodeSegmentTokenizerParams::default()
+let tokenizer = vtext::tokenize::UnicodeWordTokenizerParams::default()
.word_bounds(word_bounds)
.build()
.unwrap();

(
-UnicodeSegmentTokenizer { inner: tokenizer },
+UnicodeWordTokenizer { inner: tokenizer },
BaseTokenizer::new(),
)
}
@@ -80,7 +80,7 @@ impl UnicodeSegmentTokenizer {
/// -------
/// params : mapping of string to any
/// Parameter names mapped to their values.
-fn get_params(&self) -> PyResult<UnicodeSegmentTokenizerParams> {
+fn get_params(&self) -> PyResult<UnicodeWordTokenizerParams> {
Ok(self.inner.params.clone())
}

@@ -89,7 +89,7 @@
}

pub fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-let mut params: UnicodeSegmentTokenizerParams = deserialize_params(py, state)?;
+let mut params: UnicodeWordTokenizerParams = deserialize_params(py, state)?;
self.inner = params.build().unwrap();
Ok(())
}
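The `get_params` and `__setstate__` bindings above are what back pickling of tokenizers on the Python side (added in #73 according to the changelog). A short sketch of the expected round trip with the renamed class, assuming pickling behaves as announced:

```python
import pickle

from vtext.tokenize import UnicodeWordTokenizer

tokenizer = UnicodeWordTokenizer(word_bounds=True)

# Round trip through pickle; the restored tokenizer should behave identically.
restored = pickle.loads(pickle.dumps(tokenizer))
assert restored.tokenize("Today, tomorrow") == tokenizer.tokenize("Today, tomorrow")
```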
4 changes: 2 additions & 2 deletions python/vtext/tests/test_common.py
@@ -9,7 +9,7 @@
from vtext.tokenize import (
CharacterTokenizer,
RegexpTokenizer,
-UnicodeSegmentTokenizer,
+UnicodeWordTokenizer,
VTextTokenizer,
)
from vtext.tokenize_sentence import UnicodeSentenceTokenizer, PunctuationTokenizer
@@ -19,7 +19,7 @@
TOKENIZERS = [
CharacterTokenizer,
RegexpTokenizer,
-UnicodeSegmentTokenizer,
+UnicodeWordTokenizer,
VTextTokenizer,
]

16 changes: 8 additions & 8 deletions python/vtext/tests/test_tokenize.py
@@ -9,7 +9,7 @@
import hypothesis.strategies as st

from vtext.tokenize import (
-UnicodeSegmentTokenizer,
+UnicodeWordTokenizer,
RegexpTokenizer,
CharacterTokenizer,
VTextTokenizer,
@@ -19,7 +19,7 @@
TOKENIZERS = [
RegexpTokenizer,
CharacterTokenizer,
-UnicodeSegmentTokenizer,
+UnicodeWordTokenizer,
VTextTokenizer,
]

@@ -31,17 +31,17 @@ def _pytest_ids(x):

def test_unicode_segment_tokenize():

-tokenizer = UnicodeSegmentTokenizer(word_bounds=False)
+tokenizer = UnicodeWordTokenizer(word_bounds=False)
assert tokenizer.tokenize("Today, tomorrow") == ["Today", "tomorrow"]

-tokenizer = UnicodeSegmentTokenizer(word_bounds=True)
+tokenizer = UnicodeWordTokenizer(word_bounds=True)
assert tokenizer.tokenize("Today, tomorrow") == ["Today", ",", "tomorrow"]

with pytest.raises(TypeError):
-UnicodeSegmentTokenizer(word_bounds=1)
+UnicodeWordTokenizer(word_bounds=1)

with pytest.raises(TypeError):
-UnicodeSegmentTokenizer().tokenize(2)
+UnicodeWordTokenizer().tokenize(2)


def test_regexp_tokenize():
@@ -72,7 +72,7 @@ def test_character_tokenizer():
[
RegexpTokenizer(),
CharacterTokenizer(),
-UnicodeSegmentTokenizer(),
+UnicodeWordTokenizer(),
VTextTokenizer("en"),
VTextTokenizer("fr"),
],
@@ -87,7 +87,7 @@ def test_tokenize_edge_cases(tokenizer, txt):
[
(RegexpTokenizer(), {"pattern": r"\b\w\w+\b"}),
(CharacterTokenizer(), {"window_size": 4}),
-(UnicodeSegmentTokenizer(), {"word_bounds": True}),
+(UnicodeWordTokenizer(), {"word_bounds": True}),
(VTextTokenizer("en"), {"lang": "en"}),
(VTextTokenizer("fr"), {"lang": "fr"}),
],
4 changes: 2 additions & 2 deletions python/vtext/tokenize.py
@@ -5,15 +5,15 @@
# modified, or distributed except according to those terms.

from ._lib import BaseTokenizer
-from ._lib import UnicodeSegmentTokenizer
+from ._lib import UnicodeWordTokenizer
from ._lib import RegexpTokenizer
from ._lib import VTextTokenizer
from ._lib import CharacterTokenizer


__all__ = [
"BaseTokenizer",
"UnicodeSegmentTokenizer",
"UnicodeWordTokenizer",
"RegexpTokenizer",
"VTextTokenizer",
"CharacterTokenizer",
32 changes: 16 additions & 16 deletions src/tokenize/mod.rs
@@ -29,11 +29,11 @@ which would remove all punctuation. A more general approach is to apply unicode
```rust
# let s = "The “brown” fox can't jump 32.3 feet, right?";
# use vtext::tokenize::*;
-let tokenizer = UnicodeSegmentTokenizer::default();
+let tokenizer = UnicodeWordTokenizer::default();
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
assert_eq!(tokens, &["The", "“", "brown", "”", "fox", "can't", "jump", "32.3", "feet", ",", "right", "?"]);
```
-Here `UnicodeSegmentTokenizer` object is a thin wrapper around the
+Here `UnicodeWordTokenizer` object is a thin wrapper around the
[unicode-segmentation](https://github.com/unicode-rs/unicode-segmentation) crate.
This approach produces better results, however for instance the word "can't" should be tokenized
@@ -133,43 +133,43 @@ impl fmt::Debug for RegexpTokenizer {
///
/// * [Unicode® Standard Annex #29](http://www.unicode.org/reports/tr29/)
#[derive(Debug, Clone)]
-pub struct UnicodeSegmentTokenizer {
-pub params: UnicodeSegmentTokenizerParams,
+pub struct UnicodeWordTokenizer {
+pub params: UnicodeWordTokenizerParams,
}

/// Builder for the unicode segmentation tokenizer
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))]
-pub struct UnicodeSegmentTokenizerParams {
+pub struct UnicodeWordTokenizerParams {
word_bounds: bool,
}

-impl UnicodeSegmentTokenizerParams {
-pub fn word_bounds(&mut self, value: bool) -> UnicodeSegmentTokenizerParams {
+impl UnicodeWordTokenizerParams {
+pub fn word_bounds(&mut self, value: bool) -> UnicodeWordTokenizerParams {
self.word_bounds = value;
self.clone()
}
-pub fn build(&mut self) -> Result<UnicodeSegmentTokenizer, VTextError> {
-Ok(UnicodeSegmentTokenizer {
+pub fn build(&mut self) -> Result<UnicodeWordTokenizer, VTextError> {
+Ok(UnicodeWordTokenizer {
params: self.clone(),
})
}
}

-impl Default for UnicodeSegmentTokenizerParams {
-fn default() -> UnicodeSegmentTokenizerParams {
-UnicodeSegmentTokenizerParams { word_bounds: true }
+impl Default for UnicodeWordTokenizerParams {
+fn default() -> UnicodeWordTokenizerParams {
+UnicodeWordTokenizerParams { word_bounds: true }
}
}

-impl Default for UnicodeSegmentTokenizer {
+impl Default for UnicodeWordTokenizer {
/// Create a new instance
-fn default() -> UnicodeSegmentTokenizer {
-UnicodeSegmentTokenizerParams::default().build().unwrap()
+fn default() -> UnicodeWordTokenizer {
+UnicodeWordTokenizerParams::default().build().unwrap()
}
}

-impl Tokenizer for UnicodeSegmentTokenizer {
+impl Tokenizer for UnicodeWordTokenizer {
/// Tokenize a string
fn tokenize<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
if self.params.word_bounds {
6 changes: 3 additions & 3 deletions src/tokenize/tests.rs
@@ -20,7 +20,7 @@ fn test_regexp_tokenizer() {
fn test_unicode_tokenizer() {
let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";

-let tokenizer = UnicodeSegmentTokenizerParams::default()
+let tokenizer = UnicodeWordTokenizerParams::default()
.word_bounds(false)
.build()
.unwrap();
@@ -30,7 +30,7 @@ fn test_unicode_tokenizer() {
];
assert_eq!(tokens, b);

-let tokenizer = UnicodeSegmentTokenizer::default();
+let tokenizer = UnicodeWordTokenizer::default();
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
let b: &[_] = &[
"The", "quick", "(", "\"", "brown", "\"", ")", "fox", "can't", "jump", "32.3", "feet", ",",
@@ -121,6 +121,6 @@ fn test_character_tokenizer() {

#[test]
fn test_tokenizer_defaults() {
-let tokenizer = UnicodeSegmentTokenizer::default();
+let tokenizer = UnicodeWordTokenizer::default();
assert_eq!(tokenizer.params.word_bounds, true);
}
2 changes: 1 addition & 1 deletion src/vectorize/tests.rs
@@ -151,7 +151,7 @@ fn test_dispatch_tokenizer() {
.build()
.unwrap();

-let tokenizer = UnicodeSegmentTokenizerParams::default()
+let tokenizer = UnicodeWordTokenizerParams::default()
.word_bounds(false)
.build()
.unwrap();
