diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 8ca52a1e..1ab9356e 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -103,8 +103,7 @@ def tokenize(text, tokenizer.load(bpe_json, pretty=True) except LookupError: if (bpe_trainfile is None): - raise LookupError("bpe train file not found, using %s." % - bpe_trainfile) + raise LookupError("bpe train file not found, using %s." % bpe_trainfile) trainer = BpeTrainer( special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) tokenizer.train(files=[bpe_trainfile], trainer=trainer) diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index c1f8c698..7d254db7 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -61,11 +61,12 @@ def test_Tokenizer(): ] for tok in ['nltk', 'spacy']: tokenizer = get_tokenizer("pure_text", - text_params={"tokenizer": tok, "stop_words":set(",?")}) + text_params={"tokenizer": tok, "stopwords": set(",?")}) tokens = tokenizer(items) ret = next(tokens) assert ret == ans + def test_TokenizerBPE(): items = ['The stationery store has $600$ exercise books, and after selling some,\ there are still $4$ packs left, $25$ each, how many are sold?'] @@ -79,11 +80,14 @@ def test_TokenizerBPE(): ' ', 'are', ' ', 's', 'o', 'l', 'd'] ] tokenizer = get_tokenizer("pure_text", - text_params={"tokenizer": 'bpe', "bpe_trainfile":"../../static/test_data/standard_luna_data.json", "stop_words":set(",?")}) + text_params={"tokenizer": 'bpe', + "bpe_trainfile": "../../static/test_data/standard_luna_data.json", + "stopwords": set(",?")}) tokens = tokenizer(items) ret = next(tokens) assert ret == ans + def test_SpaceTokenizer(): items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] tokenizer = get_tokenizer("space", stop_words=[])