Merge pull request #5 from todd-cook/fix_tokenizer_imports
Fix broken tokenizer imports
diyclassics committed Jan 26, 2022
2 parents cd6bea9 + 9d79caa commit c36aaf9
Showing 7 changed files with 17 additions and 22 deletions.
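In summary, the PR moves these scripts from CLTK's pre-1.0 tokenizer modules (cltk.tokenize.*) to the module layout introduced in CLTK 1.x (cltk.tokenizers.lat.lat), where the Latin classes are language-specific and their constructors no longer take a 'latin' argument. A minimal before/after sketch of the change, assuming the 1.x classes keep the same tokenize() interface as the old ones (returning lists of strings):

# Old API (cltk==0.1.121), as the scripts previously imported it:
#   from cltk.tokenize.word import WordTokenizer
#   from cltk.tokenize.latin.sentence import SentenceTokenizer
#   word_tokenizer = WordTokenizer('latin')

# New API (cltk==1.0.15): language-specific classes, no language argument.
from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer

word_tokenizer = WordTokenizer()
sent_tokenizer = SentenceTokenizer()  # assumes the Latin Punkt sentence model is available locally

tokens = word_tokenizer.tokenize("Gallia est omnis divisa in partes tres")
sents = sent_tokenizer.tokenize("Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae.")
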
@@ -1,6 +1,6 @@
 import argparse, sys
-from cltk.tokenize.word import WordTokenizer
-from cltk.tokenize.latin.sentence import SentenceTokenizer
+from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
+from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer
 from tensor2tensor.data_generators import text_encoder
 import numpy as np
 import torch
@@ -249,7 +249,7 @@ def tokenize(self, text):
 def convert_to_toks(sents):
 
     sent_tokenizer = SentenceTokenizer()
-    word_tokenizer = WordTokenizer('latin')
+    word_tokenizer = WordTokenizer()
 
     all_sents=[]
 
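The surrounding code is unchanged by these hunks: convert_to_toks here (and read_file in the next file) still sentence-splits the input and then word-tokenizes each sentence; only the tokenizer construction changes. A rough sketch of that pattern under the new API, for illustration only since the full function bodies are not part of these hunks:

def convert_to_toks(sents):
    # Illustrative sketch: build the CLTK 1.x tokenizers once, then
    # split each input text into word-tokenized sentences.
    sent_tokenizer = SentenceTokenizer()
    word_tokenizer = WordTokenizer()

    all_sents = []
    for text in sents:
        for sent in sent_tokenizer.tokenize(text):
            all_sents.append(word_tokenizer.tokenize(sent))
    return all_sents
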
@@ -1,6 +1,6 @@
 import argparse, sys
-from cltk.tokenize.word import WordTokenizer
-from cltk.tokenize.latin.sentence import SentenceTokenizer
+from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
+from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer
 from tensor2tensor.data_generators import text_encoder
 import numpy as np
 import torch
@@ -64,7 +64,7 @@ def tokenize(self, text):
 
 def read_file(filename):
     sent_tokenizer = SentenceTokenizer()
-    word_tokenizer = WordTokenizer('latin')
+    word_tokenizer = WordTokenizer()
 
     all_sents=[]
     with open(filename, encoding="utf-8") as file:
7 changes: 3 additions & 4 deletions case_studies/infilling/scripts/check_dupes_emendations.py
@@ -1,8 +1,7 @@
 import sys, re
-from cltk.tokenize.word import WordTokenizer
-from cltk.tokenize.latin.sentence import SentenceTokenizer
+from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
 
-word_tokenizer = WordTokenizer('latin')
+word_tokenizer = WordTokenizer()
 
 def filt(text):
     text=re.sub("<", "", text)
@@ -59,4 +58,4 @@ def search(filename, tests):
         print("dupe\t%s\t%s\t%s\t%s" % (origq, left, target, right))
 
 tests=read_test(sys.argv[1])
-search(sys.argv[2], tests)
\ No newline at end of file
+search(sys.argv[2], tests)
8 changes: 2 additions & 6 deletions case_studies/infilling/scripts/predict_word.py
@@ -5,11 +5,9 @@
 from tensor2tensor.data_generators import text_encoder
 import torch
 import numpy as np
-from cltk.tokenize.word import WordTokenizer
-from cltk.tokenize.latin.sentence import SentenceTokenizer
+from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
 from torch import nn
 import random
-from random import randrange
 
 random.seed(1)
 
@@ -20,7 +18,7 @@ def __init__(self, encoder):
         self.vocab={}
         self.reverseVocab={}
         self.encoder=encoder
-        self.word_tokenizer = WordTokenizer('latin')
+        self.word_tokenizer = WordTokenizer()
 
         self.vocab["[PAD]"]=0
         self.vocab["[UNK]"]=1
@@ -241,5 +239,3 @@ def proc_file(filename, wp_tokenizer, model):
 model.to(device)
 
 proc_file(dataFile, wp_tokenizer, model)
-
-
4 changes: 2 additions & 2 deletions case_studies/wsd/scripts/create_wsd_data.py
@@ -1,8 +1,8 @@
 import sys, re
-from cltk.tokenize.word import WordTokenizer
+from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
 from unidecode import unidecode
 
-word_tokenizer = WordTokenizer('latin')
+word_tokenizer = WordTokenizer()
 
 def read_lemmas(filename):
     lemmadict={}
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
 pygame==2.0.0.dev6
 beautifulsoup4==4.9.1
-cltk==0.1.121
+cltk==1.0.15
 future==0.18.2
 numpy==1.18.5
 tensor2tensor==1.15.7
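Pinning cltk==1.0.15 is what makes the new cltk.tokenizers.lat.lat import paths above resolvable, since the old cltk.tokenize modules are gone in the 1.x releases. A quick sanity check after installing the updated requirements (hypothetical snippet, not part of the repository):

# Verify that the pinned CLTK release exposes the 1.x tokenizer classes used above.
from importlib import metadata

from cltk.tokenizers.lat.lat import LatinWordTokenizer, LatinPunktSentenceTokenizer

print(metadata.version("cltk"))  # expected: 1.0.15
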
6 changes: 3 additions & 3 deletions scripts/gen_berts.py
@@ -1,6 +1,6 @@
 import argparse, sys
-from cltk.tokenize.word import WordTokenizer
-from cltk.tokenize.latin.sentence import SentenceTokenizer
+from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer
+from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer
 from tensor2tensor.data_generators import text_encoder
 import numpy as np
 import torch
@@ -235,7 +235,7 @@ def tokenize(self, text):
 def convert_to_toks(sents):
 
     sent_tokenizer = SentenceTokenizer()
-    word_tokenizer = WordTokenizer('latin')
+    word_tokenizer = WordTokenizer()
 
     all_sents=[]
 
