Skip to content

Commit

Permalink
Start of refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
tiberiuichim committed Aug 21, 2017
1 parent faf72a6 commit ed2fde0
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 69 deletions.
47 changes: 38 additions & 9 deletions src/eea.corpus/eea/corpus/corpus.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
from eea.corpus.async import queue
from eea.corpus.processing import build_pipeline
from eea.corpus.utils import corpus_base_path
from eea.corpus.utils import is_safe_to_save
from eea.corpus.utils import metadata
from rq.decorators import job
from textacy import fileio
import json
import logging
import os.path
import textacy


logger = logging.getLogger('eea.corpus')


def save_corpus_metadata(corpus, file_name, corpus_id, text_column, **kw):
def save_corpus_metadata(stats, file_name, corpus_id, text_column, **kw):
cpath = corpus_base_path(file_name) # corpus_id
meta_name = "{0}_eea.json".format(corpus_id)
meta_path = os.path.join(cpath, meta_name)
Expand All @@ -24,24 +22,55 @@ def save_corpus_metadata(corpus, file_name, corpus_id, text_column, **kw):
info = {
'title': title,
'description': description,
'metadata': metadata(corpus),
'statistics': stats,
'text_column': text_column,
'kw': kw,
}
with open(meta_path, 'w') as f:
json.dump(info, f)


class DocStream:
    """Pass-through iterable over a document stream that tallies
    document, sentence and token counts as the stream is consumed.

    The counters are only meaningful after the stream has been
    (at least partially) iterated.
    """

    def __init__(self, docs):
        self.docs = docs
        self.n_tokens = 0
        self.n_sents = 0
        self.n_docs = 0

    def __iter__(self):
        for item in self.docs:
            self.n_docs += 1
            self.n_sents += item.n_sents
            self.n_tokens += item.n_tokens
            yield item

    def get_statistics(self):
        """Return the accumulated counts as a plain, JSON-serializable dict."""
        stats = {
            'docs': self.n_docs,
            'sentences': self.n_sents,
            'tokens': self.n_tokens,
            'lang': 'en',
        }
        return stats


@job(queue=queue)
def build_corpus(pipeline, corpus_id, file_name, text_column, **kw):
    """ Async job to build a corpus using the provided pipeline

    Runs the processing pipeline over the uploaded file's text column,
    streams the resulting documents to ``<corpus_id>_docs.json`` inside
    the corpus base path and finally writes a ``<corpus_id>_eea.json``
    metadata file containing the corpus statistics.

    :param pipeline: pipeline definition passed to ``build_pipeline``
    :param corpus_id: identifier used to name the output files
    :param file_name: uploaded source file; determines the output folder
    :param text_column: column of the source file holding the text
    :param kw: extra metadata (e.g. title/description) forwarded to
        ``save_corpus_metadata``
    """
    cpath = corpus_base_path(file_name)
    fname = os.path.join(cpath, '%s_docs.json' % corpus_id)
    logger.info('Creating corpus for %s at %s', file_name, cpath)

    docs = build_pipeline(file_name, text_column, pipeline, preview_mode=False)

    # Wrap the stream so doc/sentence/token counts are accumulated while
    # the docs are being serialized below. Keep the DocStream in its own
    # variable: rebinding ``docs`` to the generator would make the later
    # ``get_statistics()`` call fail with AttributeError.
    stream = DocStream(docs)
    serializable = ({'text': doc.text, 'metadata': doc.metadata}
                    for doc in stream)

    fileio.write_json_lines(
        serializable, fname, mode='wb',
        ensure_ascii=False, separators=(',', ':')
    )
    # Statistics are only complete once write_json_lines has consumed
    # the whole stream.
    save_corpus_metadata(
        stream.get_statistics(), file_name, corpus_id, text_column, **kw
    )
4 changes: 2 additions & 2 deletions src/eea.corpus/eea/corpus/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_build_corpus(self, build_pipeline, corpus_base_path, tmpdir):
build_corpus(pipeline, corpus_id, file_name, text_column, **kw)

assert path.join('test_eea.json').exists()
assert path.join('test_spacy_docs.bin').exists()
assert path.join('test_docs.json').exists()
assert path.join('test_metadatas.json').exists()
assert path.join('test_info.json').exists()

Expand All @@ -38,7 +38,7 @@ def test_build_corpus(self, build_pipeline, corpus_base_path, tmpdir):
assert meta == {
'description': 'something else',
'title': 'first corpus',
'metadata': {
'statistics': {
'docs': 1,
'tokens': 2,
'sentences': 1,
Expand Down
30 changes: 18 additions & 12 deletions src/eea.corpus/eea/corpus/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,24 @@ def test_valid_document_name(self, is_valid_document):
req.matchdict = {'doc': 'first'}
is_valid_document.return_value = True
assert document_name(req) == 'first'

def test_is_safe_to_save(self):
from eea.corpus.utils import is_safe_to_save
from pandas import read_csv
from pkg_resources import resource_filename
from textacy.doc import Doc

fpath = resource_filename('eea.corpus', 'tests/fixtures/broken.csv')
text_col = read_csv(fpath)['text']

assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
assert is_safe_to_save(Doc(text_col[0], lang='en')) is False
#
# def test_is_safe_to_save(self):
# from eea.corpus.utils import is_safe_to_save
# from pandas import read_csv
# from pkg_resources import resource_filename
# from textacy.doc import Doc
#
# fpath = resource_filename('eea.corpus', 'tests/fixtures/broken.csv')
# text_col = read_csv(fpath)['text']
#
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True


class TestConvertorDecorators:
Expand Down
86 changes: 40 additions & 46 deletions src/eea.corpus/eea/corpus/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,31 +100,31 @@ def available_corpus(file_name):


def get_corpus(request, doc=None, corpus_id=None):
    """ Load and return the corpus identified by (doc, corpus_id).

    When either ``doc`` or ``corpus_id`` is not provided, both are
    extracted from the request via ``extract_corpus_id``.

    Returns whatever ``load_corpus`` returns (presumably ``None`` when
    the corpus is not available — TODO confirm against ``load_corpus``).
    """
    # NOTE(review): a per-request corpus cache (request.corpus_cache)
    # was disabled here during refactoring; reintroduce it deliberately
    # if loading proves to be a hot path.
    if not (doc and corpus_id):
        doc, corpus_id = extract_corpus_id(request)

    return load_corpus(file_name=doc, corpus_id=corpus_id)


def corpus_metadata_path(file_name, corpus_id):
""" Returns the zzz_eea.json file path for a given doc/corpus
""" Returns the <corpusid>_eea.json file path for a given doc/corpus
"""
cpath = corpus_base_path(file_name) # corpus_id
meta_name = "{0}_eea.json".format(corpus_id)
Expand Down Expand Up @@ -165,7 +165,8 @@ def available_documents(request):

for corpus, cfs in files.items():
if len(cfs) != 4:
logger.warning("Not a valid corpus: %s (%s)", name, corpus)
# logger.warning("Not a valid corpus: %s (%s)", name,
# corpus)
continue
meta = load_corpus_metadata(name, corpus)
corpuses.append((corpus, meta))
Expand All @@ -179,15 +180,6 @@ def available_documents(request):
return res


def metadata(corpus):
    """Summarize *corpus* as a dict of counts plus its language code."""
    return dict(
        docs=corpus.n_docs,
        sentences=corpus.n_sents,
        tokens=corpus.n_tokens,
        lang=corpus.spacy_lang.lang,
    )


def extract_corpus_id(request):
""" Extract document name (aka file_name) from request
"""
Expand Down Expand Up @@ -272,23 +264,25 @@ def tokenize(phrase, delimiter='_'):
return delimiter.join(res)


def is_safe_to_save(doc):
""" Is this doc safe to save?
For some reason there's a bug in saving/loading spacy Docs. Here we test
that the doc can be loaded back from its serialized representation.
For further reference, see:
* https://github.com/explosion/spaCy/issues/1045
* https://github.com/explosion/spaCy/issues/985
"""
text = doc.text[:100]
bs = doc.spacy_doc.to_bytes()
try:
doc.spacy_doc.from_bytes(bs)
return True
except Exception:
logger.warning("Will not save %s, it will not be loadable", text)
return False
# from spacy.tokens.doc import Doc as SpacyDoc
# def is_safe_to_save(doc):
# """ Is this doc safe to save?
#
# For some reason there's a bug in saving/loading spacy Docs. Here we test
# that the doc can be loaded back from its serialized representation.
#
# For further reference, see:
#
# * https://github.com/explosion/spaCy/issues/1045
# * https://github.com/explosion/spaCy/issues/985
#
# """
# text = doc.text[:100]
# vocab = doc.spacy_vocab
# bs = doc.spacy_doc.to_bytes()
# try:
# SpacyDoc(vocab).from_bytes(bs)
# return True
# except Exception:
# logger.warning("Will not save %s, it will not be loadable", text)
# return False

0 comments on commit ed2fde0

Please sign in to comment.