Skip to content

Commit

Permalink
modified: EduNLP/SIF/tokenization/text/tokenization.py
Browse files Browse the repository at this point in the history
	modified:   tests/test_tokenizer/test_tokenizer.py
  • Loading branch information
KINGNEWBLUSH committed Mar 11, 2024
1 parent aea99a2 commit 970c1b9
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
3 changes: 1 addition & 2 deletions EduNLP/SIF/tokenization/text/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,7 @@ def tokenize(text,
tokenizer.load(bpe_json, pretty=True)
except LookupError:
if (bpe_trainfile is None):
raise LookupError("bpe train file not found, using %s." %
bpe_trainfile)
raise LookupError("bpe train file not found, using %s." %bpe_trainfile)
trainer = BpeTrainer(
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files=[bpe_trainfile], trainer=trainer)
Expand Down
8 changes: 6 additions & 2 deletions tests/test_tokenizer/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,12 @@ def test_Tokenizer():
]
for tok in ['nltk', 'spacy']:
tokenizer = get_tokenizer("pure_text",
text_params={"tokenizer": tok, "stop_words":set(",?")})
text_params={"tokenizer": tok, "stopwords": set(",?")})
tokens = tokenizer(items)
ret = next(tokens)
assert ret == ans


def test_TokenizerBPE():
items = ['The stationery store has $600$ exercise books, and after selling some,\
there are still $4$ packs left, $25$ each, how many are sold?']
Expand All @@ -79,11 +80,14 @@ def test_TokenizerBPE():
' ', 'are', ' ', 's', 'o', 'l', 'd']
]
tokenizer = get_tokenizer("pure_text",
text_params={"tokenizer": 'bpe', "bpe_trainfile":"../../static/test_data/standard_luna_data.json", "stop_words":set(",?")})
text_params={"tokenizer": 'bpe',
"bpe_trainfile": "../../static/test_data/standard_luna_data.json",
"stopwords": set(",?")})
tokens = tokenizer(items)
ret = next(tokens)
assert ret == ans


def test_SpaceTokenizer():
items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?']
tokenizer = get_tokenizer("space", stop_words=[])
Expand Down

0 comments on commit 970c1b9

Please sign in to comment.