Skip to content

Commit

Permalink
Start of refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
tiberiuichim committed Aug 21, 2017
1 parent faf72a6 commit ed2fde0
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 69 deletions.
47 changes: 38 additions & 9 deletions src/eea.corpus/eea/corpus/corpus.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
from eea.corpus.async import queue
from eea.corpus.processing import build_pipeline
from eea.corpus.utils import corpus_base_path
from eea.corpus.utils import is_safe_to_save
from eea.corpus.utils import metadata
from rq.decorators import job
from textacy import fileio
import json
import logging
import os.path
import textacy


logger = logging.getLogger('eea.corpus')


def save_corpus_metadata(corpus, file_name, corpus_id, text_column, **kw):
def save_corpus_metadata(stats, file_name, corpus_id, text_column, **kw):
cpath = corpus_base_path(file_name) # corpus_id
meta_name = "{0}_eea.json".format(corpus_id)
meta_path = os.path.join(cpath, meta_name)
Expand All @@ -24,24 +22,55 @@ def save_corpus_metadata(corpus, file_name, corpus_id, text_column, **kw):
info = {
'title': title,
'description': description,
'metadata': metadata(corpus),
'statistics': stats,
'text_column': text_column,
'kw': kw,
}
with open(meta_path, 'w') as f:
json.dump(info, f)


class DocStream:
    """Pass-through iterable over a document stream that tallies
    document, sentence and token counts as the stream is consumed.

    The counters are only meaningful after the stream has been
    (at least partially) iterated.
    """

    def __init__(self, docs):
        self.docs = docs
        self.n_tokens = 0
        self.n_sents = 0
        self.n_docs = 0

    def __iter__(self):
        for item in self.docs:
            self.n_docs += 1
            self.n_sents += item.n_sents
            self.n_tokens += item.n_tokens
            yield item

    def get_statistics(self):
        """Return the accumulated counts as a plain, JSON-serializable dict."""
        stats = {
            'docs': self.n_docs,
            'sentences': self.n_sents,
            'tokens': self.n_tokens,
            'lang': 'en',
        }
        return stats


@job(queue=queue)
def build_corpus(pipeline, corpus_id, file_name, text_column, **kw):
    """ Async job to build a corpus using the provided pipeline

    Runs the processing pipeline over the uploaded file's text column,
    streams the resulting documents to ``<corpus_id>_docs.json`` inside
    the corpus base path and finally writes a ``<corpus_id>_eea.json``
    metadata file containing the corpus statistics.

    :param pipeline: pipeline definition passed to ``build_pipeline``
    :param corpus_id: identifier used to name the output files
    :param file_name: uploaded source file; determines the output folder
    :param text_column: column of the source file holding the text
    :param kw: extra metadata (e.g. title/description) forwarded to
        ``save_corpus_metadata``
    """
    cpath = corpus_base_path(file_name)
    fname = os.path.join(cpath, '%s_docs.json' % corpus_id)
    logger.info('Creating corpus for %s at %s', file_name, cpath)

    docs = build_pipeline(file_name, text_column, pipeline, preview_mode=False)

    # Wrap the stream so doc/sentence/token counts are accumulated while
    # the docs are being serialized below. Keep the DocStream in its own
    # variable: rebinding ``docs`` to the generator would make the later
    # ``get_statistics()`` call fail with AttributeError.
    stream = DocStream(docs)
    serializable = ({'text': doc.text, 'metadata': doc.metadata}
                    for doc in stream)

    fileio.write_json_lines(
        serializable, fname, mode='wb',
        ensure_ascii=False, separators=(',', ':')
    )
    # Statistics are only complete once write_json_lines has consumed
    # the whole stream.
    save_corpus_metadata(
        stream.get_statistics(), file_name, corpus_id, text_column, **kw
    )
4 changes: 2 additions & 2 deletions src/eea.corpus/eea/corpus/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_build_corpus(self, build_pipeline, corpus_base_path, tmpdir):
build_corpus(pipeline, corpus_id, file_name, text_column, **kw)

assert path.join('test_eea.json').exists()
assert path.join('test_spacy_docs.bin').exists()
assert path.join('test_docs.json').exists()
assert path.join('test_metadatas.json').exists()
assert path.join('test_info.json').exists()

Expand All @@ -38,7 +38,7 @@ def test_build_corpus(self, build_pipeline, corpus_base_path, tmpdir):
assert meta == {
'description': 'something else',
'title': 'first corpus',
'metadata': {
'statistics': {
'docs': 1,
'tokens': 2,
'sentences': 1,
Expand Down
30 changes: 18 additions & 12 deletions src/eea.corpus/eea/corpus/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,24 @@ def test_valid_document_name(self, is_valid_document):
req.matchdict = {'doc': 'first'}
is_valid_document.return_value = True
assert document_name(req) == 'first'

def test_is_safe_to_save(self):
from eea.corpus.utils import is_safe_to_save
from pandas import read_csv
from pkg_resources import resource_filename
from textacy.doc import Doc

fpath = resource_filename('eea.corpus', 'tests/fixtures/broken.csv')
text_col = read_csv(fpath)['text']

assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
assert is_safe_to_save(Doc(text_col[0], lang='en')) is False
#
# def test_is_safe_to_save(self):
# from eea.corpus.utils import is_safe_to_save
# from pandas import read_csv
# from pkg_resources import resource_filename
# from textacy.doc import Doc
#
# fpath = resource_filename('eea.corpus', 'tests/fixtures/broken.csv')
# text_col = read_csv(fpath)['text']
#
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True


class TestConvertorDecorators:
Expand Down
86 changes: 40 additions & 46 deletions src/eea.corpus/eea/corpus/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,31 +100,31 @@ def available_corpus(file_name):


def get_corpus(request, doc=None, corpus_id=None):
    """ Load and return the corpus identified by (doc, corpus_id).

    When either ``doc`` or ``corpus_id`` is not provided, both are
    extracted from the request via ``extract_corpus_id``.

    Returns whatever ``load_corpus`` returns (presumably ``None`` when
    the corpus is not available — TODO confirm against ``load_corpus``).
    """
    # NOTE(review): a per-request corpus cache (request.corpus_cache)
    # was disabled here during refactoring; reintroduce it deliberately
    # if loading proves to be a hot path.
    if not (doc and corpus_id):
        doc, corpus_id = extract_corpus_id(request)

    return load_corpus(file_name=doc, corpus_id=corpus_id)


def corpus_metadata_path(file_name, corpus_id):
""" Returns the zzz_eea.json file path for a given doc/corpus
""" Returns the <corpusid>_eea.json file path for a given doc/corpus
"""
cpath = corpus_base_path(file_name) # corpus_id
meta_name = "{0}_eea.json".format(corpus_id)
Expand Down Expand Up @@ -165,7 +165,8 @@ def available_documents(request):

for corpus, cfs in files.items():
if len(cfs) != 4:
logger.warning("Not a valid corpus: %s (%s)", name, corpus)
# logger.warning("Not a valid corpus: %s (%s)", name,
# corpus)
continue
meta = load_corpus_metadata(name, corpus)
corpuses.append((corpus, meta))
Expand All @@ -179,15 +180,6 @@ def available_documents(request):
return res


def metadata(corpus):
    """Summarize *corpus* as a dict of counts plus its language code."""
    return dict(
        docs=corpus.n_docs,
        sentences=corpus.n_sents,
        tokens=corpus.n_tokens,
        lang=corpus.spacy_lang.lang,
    )


def extract_corpus_id(request):
""" Extract document name (aka file_name) from request
"""
Expand Down Expand Up @@ -272,23 +264,25 @@ def tokenize(phrase, delimiter='_'):
return delimiter.join(res)


def is_safe_to_save(doc):
""" Is this doc safe to save?
For some reason there's a bug in saving/loading spacy Docs. Here we test
that the doc can be loaded back from its serialized representation.
For further reference, see:
* https://github.com/explosion/spaCy/issues/1045
* https://github.com/explosion/spaCy/issues/985
"""
text = doc.text[:100]
bs = doc.spacy_doc.to_bytes()
try:
doc.spacy_doc.from_bytes(bs)
return True
except Exception:
logger.warning("Will not save %s, it will not be loadable", text)
return False
# from spacy.tokens.doc import Doc as SpacyDoc
# def is_safe_to_save(doc):
# """ Is this doc safe to save?
#
# For some reason there's a bug in saving/loading spacy Docs. Here we test
# that the doc can be loaded back from its serialized representation.
#
# For further reference, see:
#
# * https://github.com/explosion/spaCy/issues/1045
# * https://github.com/explosion/spaCy/issues/985
#
# """
# text = doc.text[:100]
# vocab = doc.spacy_vocab
# bs = doc.spacy_doc.to_bytes()
# try:
# SpacyDoc(vocab).from_bytes(bs)
# return True
# except Exception:
# logger.warning("Will not save %s, it will not be loadable", text)
# return False

0 comments on commit ed2fde0

Please sign in to comment.