Skip to content

Commit

Permalink
Corpus is now saved to json file
Browse files Browse the repository at this point in the history
  • Loading branch information
tiberiuichim committed Aug 21, 2017
1 parent ed2fde0 commit dc7f2d6
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 62 deletions.
63 changes: 57 additions & 6 deletions src/eea.corpus/eea/corpus/corpus.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from eea.corpus.async import queue
from eea.corpus.processing import build_pipeline
from eea.corpus.utils import corpus_base_path
from eea.corpus.utils import extract_corpus_id
from rq.decorators import job
from textacy import fileio
from textacy import fileio, Corpus
from textacy.doc import Doc
import json
import logging
import os.path
Expand Down Expand Up @@ -31,6 +33,9 @@ def save_corpus_metadata(stats, file_name, corpus_id, text_column, **kw):


class DocStream:
""" A pass-through stream that gathers stats on streamed docs
"""

def __init__(self, docs):
self.docs = docs
self.n_tokens = 0
Expand Down Expand Up @@ -64,13 +69,59 @@ def build_corpus(pipeline, corpus_id, file_name, text_column, **kw):

docs = build_pipeline(file_name, text_column, pipeline, preview_mode=False)

docs = DocStream(docs)
docs = ({'text': doc.text, 'metadata': doc.metadata} for doc in docs)
sdocs = DocStream(docs)
docs = ({'text': doc.text,
'metadata': doc.metadata} for doc in sdocs)

fileio.write_json_lines(
docs, fname, mode='wb',
ensure_ascii=False, separators=(',', ':')
docs, fname, mode='wt',
ensure_ascii=True, separators=(',', ':')
)
save_corpus_metadata(
docs.get_statistics(), file_name, corpus_id, text_column, **kw
sdocs.get_statistics(), file_name, corpus_id, text_column, **kw
)


def load_corpus(file_name, corpus_id, lang='en'):
    """ Load a textacy corpus from its JSON-lines file on disk.

    :param file_name: name of the originally uploaded document; used to
        locate the corpus storage folder via ``corpus_base_path``
    :param corpus_id: id of the corpus; docs are read from the
        ``<corpus_id>_docs.json`` file inside that folder
    :param lang: language code for the ``Corpus`` and each reconstructed
        ``Doc``. Defaults to ``'en'``, which was previously hardcoded.
    :return: a ``textacy`` ``Corpus`` with one ``Doc`` per JSON line
    """

    cpath = corpus_base_path(file_name)
    fname = os.path.join(cpath, '%s_docs.json' % corpus_id)

    corpus = Corpus(lang)

    # Each line is one JSON object of the shape
    # {"text": ..., "metadata": ...}, as written by build_corpus.
    with open(fname, 'rt') as f:
        for line in f:
            j = json.loads(line)
            doc = Doc(j['text'], lang=lang, metadata=j['metadata'])
            corpus.add_doc(doc)

    return corpus


def get_corpus(request, doc=None, corpus_id=None):
    """ Load the corpus identified by the request (or the explicit args).

    :param request: Pyramid request; used to derive ``doc`` and
        ``corpus_id`` when they are not both passed explicitly
    :param doc: uploaded document name; together with ``corpus_id`` it
        bypasses extraction from the request
    :param corpus_id: id of the corpus to load
    :return: the corpus loaded from disk by ``load_corpus``
    """
    # TODO(review): the previous revision sketched a per-request cache
    # keyed on request.corpus_cache; reinstate caching if loading from
    # disk on every call proves too slow.
    if not (doc and corpus_id):
        doc, corpus_id = extract_corpus_id(request)

    return load_corpus(file_name=doc, corpus_id=corpus_id)
2 changes: 2 additions & 0 deletions src/eea.corpus/eea/corpus/tests/fixtures/corpusA_docs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"text":"Hello world","metadata":{"1":2}}
{"text":"Second time","metadata":{"3":4}}
48 changes: 39 additions & 9 deletions src/eea.corpus/eea/corpus/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ def test_build_corpus(self, build_pipeline, corpus_base_path, tmpdir):
path = tmpdir.join('.', 'test.csv')
path.mkdir()

build_pipeline.return_value = [Doc('Hello world', metadata={'1': 2})]
build_pipeline.return_value = [
Doc('Hello world', metadata={'1': 2}),
Doc('Second time', metadata={'3': 4}),
]
corpus_base_path.return_value = str(path)

pipeline = []
Expand All @@ -26,24 +29,51 @@ def test_build_corpus(self, build_pipeline, corpus_base_path, tmpdir):

assert path.join('test_eea.json').exists()
assert path.join('test_docs.json').exists()
assert path.join('test_metadatas.json').exists()
assert path.join('test_info.json').exists()

with path.join('test_metadatas.json').open() as f:
meta = json.load(f)
assert meta == {'1': 2}
docs = []
with path.join('test_docs.json').open() as f:
for line in f:
doc = json.loads(line)
docs.append(doc)
assert docs[0] == {'text': 'Hello world', 'metadata': {'1': 2}}
assert docs[1] == {'text': 'Second time', 'metadata': {'3': 4}}
assert len(docs) == 2

with path.join('test_eea.json').open() as f:
meta = json.load(f)
assert meta == {
'description': 'something else',
'title': 'first corpus',
'statistics': {
'docs': 1,
'tokens': 2,
'sentences': 1,
'docs': 2,
'tokens': 4,
'sentences': 2,
'lang': 'en'
},
'kw': {},
'text_column': 'text'
}

@patch('eea.corpus.corpus.corpus_base_path')
def test_load_corpus(self, corpus_base_path):
from pkg_resources import resource_filename
from eea.corpus.corpus import load_corpus
from textacy.doc import Doc

base_path = resource_filename('eea.corpus', 'tests/fixtures/')
corpus_base_path.return_value = base_path

corpus = load_corpus('test.csv', 'corpusA')
assert corpus.n_docs == 2
assert corpus.n_sents == 2
assert corpus.n_tokens == 4

doc = corpus[0]
assert isinstance(doc, Doc)
assert doc.text == 'Hello world'
assert doc.metadata == {'1': 2}

doc = corpus[1]
assert isinstance(doc, Doc)
assert doc.text == 'Second time'
assert doc.metadata == {'3': 4}
44 changes: 0 additions & 44 deletions src/eea.corpus/eea/corpus/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import os
import random
import string
import textacy


logger = logging.getLogger('eea.corpus')
Expand All @@ -20,25 +19,6 @@ def rand(n):
return ''.join(random.sample(string.ascii_uppercase + string.digits, k=n))


def load_corpus(file_name, corpus_id, **kw):
    """ Loads a textacy corpus from disk.
    Requires the document name and the corpus id

    Returns None when the corpus folder is empty (corpus not yet built).
    Extra keyword arguments are accepted but ignored.
    """

    cpath = corpus_base_path(file_name)

    if os.listdir(cpath):
        # NOTE(review): asserts are stripped under -O; the metadata file
        # check would silently disappear in optimized runs
        assert os.path.exists(corpus_metadata_path(file_name, corpus_id))
        # if there are any files, assume the corpus is created
        # TODO: check that the corpus is really saved
        print("Saved corpus exists, loading", cpath, corpus_id)
        # import pdb; pdb.set_trace()
        return textacy.Corpus.load(cpath, name=corpus_id)

    return None


def corpus_base_path(file_name):
""" Returns the /corpus/var/<filename> folder for an uploaded file
"""
Expand Down Expand Up @@ -99,30 +79,6 @@ def available_corpus(file_name):
return res


def get_corpus(request, doc=None, corpus_id=None):
    """ Load the corpus identified by the request (or the explicit args).

    When ``doc`` and ``corpus_id`` are not both given, they are derived
    from the Pyramid request via ``extract_corpus_id``.
    """
    if not (doc and corpus_id):
        doc, corpus_id = extract_corpus_id(request)

    corpus = load_corpus(file_name=doc, corpus_id=corpus_id)
    return corpus

    # NOTE(review): dead code below — a sketched per-request cache keyed
    # on request.corpus_cache; it is unreachable after the return above.
    # cache = request.corpus_cache
    # if corpus_id not in cache.get(doc, []):
    #     corpus = load_corpus(file_name=doc, corpus_id=corpus_id)
    #
    #     if corpus is None:
    #         return None
    #
    #     cache[doc] = {
    #         corpus_id: corpus
    #     }
    #
    # try:
    #     return cache[doc][corpus_id]
    # except:
    #     import pdb; pdb.set_trace()


def corpus_metadata_path(file_name, corpus_id):
""" Returns the <corpusid>_eea.json file path for a given doc/corpus
"""
Expand Down
11 changes: 8 additions & 3 deletions src/eea.corpus/eea/corpus/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from deform import ZPTRendererFactory
from eea.corpus.async import queue
from eea.corpus.corpus import build_corpus
from eea.corpus.corpus import get_corpus
from eea.corpus.processing import build_pipeline
from eea.corpus.processing import pipeline_registry
from eea.corpus.schema import CreateCorpusSchema
Expand All @@ -18,9 +19,7 @@
from eea.corpus.utils import delete_corpus
from eea.corpus.utils import document_name
from eea.corpus.utils import extract_corpus_id
from eea.corpus.utils import get_corpus
from eea.corpus.utils import hashed_id
from eea.corpus.utils import metadata
from eea.corpus.utils import rand
from eea.corpus.utils import schema_defaults
from eea.corpus.utils import upload_location
Expand Down Expand Up @@ -104,7 +103,13 @@ def metadata(self):
""" Show metadata about context document
"""
# TODO: show info about processing and column
return metadata(self.corpus())
corpus = self.corpus()
return {
'docs': corpus.n_docs,
'sentences': corpus.n_sents,
'tokens': corpus.n_tokens,
'lang': corpus.lang,
}

def visualise(self, appstruct, method):
max_df = appstruct['max_df']
Expand Down

0 comments on commit dc7f2d6

Please sign in to comment.