Skip to content

Commit

Permalink
Merge branch 'classify_view'
Browse files Browse the repository at this point in the history
  • Loading branch information
tiberiuichim committed Sep 9, 2017
2 parents 5fcd326 + 35d1033 commit 2788c47
Show file tree
Hide file tree
Showing 12 changed files with 344 additions and 31 deletions.
1 change: 1 addition & 0 deletions src/eea.corpus/eea/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def main(global_config, **settings):
config.add_route('corpus_topics', '/topics/{doc}/{corpus}')
config.add_route('delete_corpus', '/delete/{doc}/{corpus}')
config.add_route('process_csv', '/process/{doc}/')
config.add_route('corpus_classify', '/classify/{doc}/{corpus}')
config.add_route('view_job', '/job-view/{doc}/{corpus}/job/{job}')
config.add_route('demo', '/demo')

Expand Down
83 changes: 83 additions & 0 deletions src/eea.corpus/eea/corpus/classify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
class ClassVocab:
    """Map category paths to zero-based integer class ids.

    Only the first non-empty ``/``-separated segment of a path is used as
    the class name, so ``'/a/b'`` and ``'a/c'`` share one id.  Ids are
    assigned in order of first lookup.
    """

    def __init__(self):
        # class name -> integer id
        self.vocab = {}

    def __getitem__(self, k):
        """Return the integer id for category path ``k``.

        A new id is assigned the first time a class name is seen.  Float
        keys (presumably NaN standing in for a missing value -- TODO
        confirm against the caller) and paths without any non-empty segment
        (``''``, ``'/'``) are mapped to the ``'empty'`` class.
        """
        if isinstance(k, float):
            k = 'empty'
        segments = [x for x in k.split('/') if x]
        # An empty or slash-only path used to raise IndexError here.
        name = segments[0] if segments else 'empty'
        # len() is evaluated before insertion, so a new name gets the next id.
        return self.vocab.setdefault(name, len(self.vocab))


def train_model(corpus):
    """Train and evaluate a bag-of-words category classifier on ``corpus``.

    Documents whose ``'Category Path'`` metadata is a float are skipped
    (presumably NaN for a missing value -- TODO confirm).  The remaining
    texts are converted to TF-IDF weighted token counts (at most 5000
    features), labelled with the first segment of their category path,
    split 90/10 into train/test sets and fitted with logistic regression.

    :param corpus: iterable of documents exposing ``.text`` and
        ``.metadata['Category Path']``.
    :returns: accuracy of the fitted model on the held-out 10% of the
        documents (the value is also printed).
    """
    # conventions: X are features, y are labels
    # X_train is array of training feature values,
    # X_test is array with test values
    # y_train are labels for X_train, y_test are labels for X_test

    # Local imports, as in the rest of this function.  ``tokenizer`` was
    # referenced without ever being imported in this module.
    from eea.corpus.utils import tokenizer
    from sklearn import metrics
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    print("Transforming docs")
    # A single pass collects texts and label paths together, so they stay
    # aligned and the corpus is only iterated once.
    texts = []
    paths = []
    for doc in corpus:
        path = doc.metadata['Category Path']
        if isinstance(path, float):
            continue
        texts.append(doc.text)
        paths.append(path)

    vect = CountVectorizer(input='content', strip_accents='unicode',
                           tokenizer=tokenizer,     # stop_words='english',
                           max_features=5000)
    X = vect.fit_transform(texts)

    transf = TfidfTransformer()
    X = transf.fit_transform(X)
    # X = X.toarray()   # only needed for GradientBoostingClassifier

    # Alternatives that were tried, with the accuracy recorded for each:
    #   TfidfVectorizer(max_features=5000, ngram_range=(1, 3),
    #                   sublinear_tf=True)              (no figure recorded)
    #   RandomForestClassifier(n_estimators=100)                  acc: 0.73
    #   svm.SVC(kernel='poly', degree=3, C=1.0)                   acc: 0.66
    #   MultinomialNB(alpha=0.1)                                  acc: 0.73
    #   GradientBoostingClassifier(n_estimators=10,
    #                              learning_rate=0.1)             acc: 0.65
    #     (takes a long time; can go higher with more estimators / l_rate)
    # LogisticRegression: 0.763 with tfidf from countvect 5000,
    # 0.7 without tfidf
    model = LogisticRegression()

    vocab = ClassVocab()
    y = [vocab[path] for path in paths]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=3311)

    print("Training on %s docs" % str(X_train.shape))

    # The model used to be fitted twice in a row; once is enough.
    print("Fitting model")
    model.fit(X_train, y_train)
    print("done")

    pred = model.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    print(score)
    return score
Empty file.
105 changes: 105 additions & 0 deletions src/eea.corpus/eea/corpus/classify/views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from eea.corpus.schema import ClassifficationModelSchema
from eea.corpus.utils import tokenizer
from eea.corpus.corpus import get_corpus
from pyramid.view import view_config
from pyramid_deform import FormView
import pyramid.httpexceptions as exc


@view_config(route_name="corpus_classify",
             renderer='eea.corpus:templates/classify.pt')
class CreateClassificationModelView(FormView):
    """Form view that trains a classification model on a corpus.

    The form has two buttons, ``classify`` and ``fasttext``; each one is
    handled by the matching ``<button>_success`` method.
    """

    schema = ClassifficationModelSchema()
    buttons = ('classify', 'fasttext')

    # Result of the last evaluation, see ``fasttext_success``.
    score = None

    def corpus(self):
        """ Return the corpus for the current request.

        :raises pyramid.httpexceptions.HTTPNotFound: if ``get_corpus``
            returns ``None`` for this request.
        """

        corpus = get_corpus(self.request)
        if corpus is None:
            raise exc.HTTPNotFound()
        return corpus

    def metadata(self):
        """ Show metadata about context document

        :returns: dict with the corpus' number of docs, sentences and
            tokens, plus its language.
        """
        # TODO: show info about processing and column
        corpus = self.corpus()
        return {
            'docs': corpus.n_docs,
            'sentences': corpus.n_sents,
            'tokens': corpus.n_tokens,
            'lang': corpus.lang,
        }

    def classify_success(self, appstruct):
        """ Handler for the ``classify`` button; not implemented yet.

        It only resolves the corpus, which raises ``HTTPNotFound`` when the
        corpus is missing.
        """
        self.corpus()

    @staticmethod
    def _labels(doc):
        """ Return the non-empty segments of the doc's category path """
        return [x for x in doc.metadata['Category Path'].split('/') if x]

    def fasttext_success(self, appstruct):
        """ Train a fastText supervised model and evaluate it.

        Docs whose ``'Category Path'`` metadata is a float are ignored.
        The first 90% of the remaining docs are written to
        ``/tmp/corpus-train.txt`` in fastText's ``__label__`` format and
        used for training; the rest are written to ``/tmp/corpus-test.txt``
        and used for evaluation.

        ``self.score`` is set to the percentage of test docs whose
        predicted label set differs from the expected one, i.e. an error
        rate where lower is better.  It is ``None`` if there are no test
        docs.
        """
        import unicodedata
        # from pyfasttext import FastText

        corpus = self.corpus()
        docs = [doc for doc in corpus
                if not isinstance(doc.metadata['Category Path'], float)]

        # Split on the number of docs that survived the filter above.
        # Using corpus.n_docs shifted the boundary and could leave the test
        # set empty.
        split = int(len(docs) * 0.9)
        train_docs = docs[:split]
        test_docs = docs[split:]

        print('Writing corpus to disk')
        lines = []
        for doc in train_docs:
            # Same path parsing as for the test labels, so a path without a
            # leading slash keeps its first label and '//' does not produce
            # an empty label.
            labels = " ".join('__label__' + x for x in self._labels(doc))
            text = doc.text.replace('\n', ' ')
            lines.append(" ".join([labels, text]))

        with open('/tmp/corpus-train.txt', 'wb') as f:
            s = '\n'.join(lines)
            # Strip accents and drop anything that is not plain ASCII.
            s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
            f.write(s)

        # NOTE(review): unlike the training text, the test text is not
        # ASCII-normalized -- confirm whether that is intended.
        y_test = [self._labels(doc) for doc in test_docs]
        test_lines = [doc.text.replace('\n', ' ') for doc in test_docs]
        with open('/tmp/corpus-test.txt', 'w') as f:
            f.write('\n'.join(test_lines))

        print("Training model")
        # model = fasttext.supervised()
        import fasttext as ft
        model = ft.supervised(input_file='/tmp/corpus-train.txt',
                              output='/tmp/ftmodel', epoch=100)
        print("Model trained")

        # from sklearn import metrics
        # self.score = metrics.accuracy_score(y_test, pred)

        if not test_lines:
            # Nothing to evaluate on; avoid dividing by zero below.
            self.score = None
            print("Score %s" % self.score)
            return

        pred = model.predict(test_lines, k=2)
        pairs = list(zip(pred, y_test))
        mismatches = [x for x in pairs if set(x[0]) != set(x[1])]
        self.score = len(mismatches) * 100 / len(pairs)
        print("Score %s" % self.score)

        # xx = model.predict_proba(test_lines, k=2)

# xx = model.predict_proba(test_lines, k=2)
# import pdb; pdb.set_trace()
28 changes: 28 additions & 0 deletions src/eea.corpus/eea/corpus/processing/simpletokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from colander import Schema
from eea.corpus.processing import pipeline_component
from eea.corpus.utils import set_text
import logging

logger = logging.getLogger('eea.corpus')

class Tokenizer(Schema):
    """ Colander schema for the simple tokenizer pipeline step.

    It declares no fields of its own and only carries the description
    string below.
    """

    description = 'Simple, dumb tokenizer. Strips non-alpha and small words'


@pipeline_component(schema=Tokenizer,
                    title="Simple text tokenization")
def process(content, env, **settings):
    """ Tokenization

    Replaces the text of each doc in ``content`` with its tokens joined by
    single spaces, and yields the result of ``set_text`` for each doc.
    Docs for which ``set_text`` raises are logged and skipped.

    :param content: iterable of docs exposing a ``.text`` attribute.
    :param env: not used by this component.
    :param settings: not used by this component.
    """
    # ``tokenizer`` was used here without being imported by this module.
    from eea.corpus.utils import tokenizer

    for doc in content:
        text = " ".join(tokenizer(doc.text))

        # Only the conversion is guarded; the yield stays outside the try
        # so the handler cannot catch anything unrelated to set_text.
        try:
            converted = set_text(doc, text)
        except Exception:
            logger.exception("Error in converting to Doc %r", text)
            continue

        yield converted
9 changes: 6 additions & 3 deletions src/eea.corpus/eea/corpus/processing/stopwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@

logger = logging.getLogger('eea.corpus')

dl = nltk.downloader.Downloader()
if not dl.is_installed('stopwords'):
nltk.download('stopwords') # TODO: do this some other way
try:
dl = nltk.downloader.Downloader()
if not dl.is_installed('stopwords'):
nltk.download('stopwords') # TODO: do this some other way
except Exception:
logger.exception("Error when checking for nltk's stopwords data")


class StopWords(Schema):
Expand Down
46 changes: 37 additions & 9 deletions src/eea.corpus/eea/corpus/schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from colander import Int, Schema, SchemaNode, String, Float # , Bool
from colander import Int, Schema, SchemaNode, String, Float, Set
from eea.corpus.processing import pipeline_registry
from eea.corpus.utils import upload_location
import colander
Expand All @@ -14,27 +14,44 @@ def preview_url(self, name):
tmpstore = Store()


def csv_file_columns(request):
    """ Return ``(value, label)`` choices for the columns of the CSV file

    The file is the one named by the ``doc`` entry of the request's
    matchdict, located through ``upload_location``.

    :param request: request object exposing ``matchdict`` (may be ``None``).
    :returns: list of ``(column, column)`` tuples, one per CSV column.  It
        is an empty list when the request names no document, so callers
        can always concatenate or iterate the result.
    """
    md = request.matchdict or {}
    name = md.get('doc')
    if not name:
        return []

    path = upload_location(name)        # TODO: move this to utils
    f = pd.read_csv(path)
    return [(k, k) for k in f.keys()]


@colander.deferred
def columns_widget(node, kw):
""" A select widget that reads the csv file to show available columns
"""

choices = []
req = kw['request']

md = req.matchdict or {}
name = md.get('doc')
if name:
path = upload_location(name) # TODO: move this to utils
f = pd.read_csv(path)
choices = [('', '')] + [(k, k) for k in f.keys()]
choices = [('', '')] + csv_file_columns(req)

return deform.widget.SelectWidget(
values=choices,
default=''
)


@colander.deferred
def multi_columns_widget(node, kw):
    """ Deferred widget: multiple-selection list of the CSV file's columns

    The choices come from ``csv_file_columns`` for the request found in the
    ``kw`` binding dict.
    """

    column_choices = csv_file_columns(kw['request'])
    widget_options = {'values': column_choices, 'multiple': True}
    return deform.widget.SelectWidget(**widget_options)


class UploadSchema(Schema):
# title = SchemaNode(String())
upload = SchemaNode(
Expand Down Expand Up @@ -147,3 +164,14 @@ class CreateCorpusSchema(colander.MappingSchema):
widget=pipeline_components_widget,
title="Add a new pipeline component"
)


class ClassifficationModelSchema(colander.MappingSchema):
    """ Form schema for building a text classification model.

    It has a single ``columns`` field: a set of values chosen through the
    multi-select columns widget.
    """

    columns = SchemaNode(
        Set(),
        title='Columns with class labels',
        widget=multi_columns_widget,
    )
37 changes: 37 additions & 0 deletions src/eea.corpus/eea/corpus/templates/classify.pt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<div metal:use-macro="load: layout.pt">

<div metal:fill-slot="sidebar-top"></div>
<div metal:fill-slot="sidebar"></div>

<div metal:fill-slot="full-width">
<div class="row">
<div class="col-md-12">
<h1>Create a new classification model</h1>
<p>Set required parameters for the classification model</p>
</div>
</div>

<div class="row">

<div class="col-md-5" id="corpus-form">
<form tal:replace="structure form" ></form>
</div>

<div class="col-md-7">
<div class="content">
<h4>Preview result</h4>
<p>Choose row and processing settings to preview results</p>

<div class="panel panel-default">
<div class="panel-body">
Score: ${view.score}
</div>
</div>

</div>
</div>

</div>
</div>
</div>

2 changes: 1 addition & 1 deletion src/eea.corpus/eea/corpus/templates/home.pt
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
</h4>
<a href="/topics/${doc.name}/${corpus[0]}" >Topic Modeling</a>
/
<a href="/categorize/${doc.name}/${corpus[0]}">Categorize</a>
<a href="/classify/${doc.name}/${corpus[0]}">Classification</a>
/
<a href="/view/${doc.name}/${corpus[0]}/0">View</a>

Expand Down
18 changes: 0 additions & 18 deletions src/eea.corpus/eea/corpus/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,6 @@ def test_valid_document_name(self, is_valid_document):
req.matchdict = {'doc': 'first'}
is_valid_document.return_value = True
assert document_name(req) == 'first'
#
# def test_is_safe_to_save(self):
# from eea.corpus.utils import is_safe_to_save
# from pandas import read_csv
# from pkg_resources import resource_filename
# from textacy.doc import Doc
#
# fpath = resource_filename('eea.corpus', 'tests/fixtures/broken.csv')
# text_col = read_csv(fpath)['text']
#
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[0], lang='en')) is True
# assert is_safe_to_save(Doc(text_col[1], lang='en')) is True


class TestConvertorDecorators:
Expand Down
5 changes: 5 additions & 0 deletions src/eea.corpus/eea/corpus/tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,8 @@ def test_apply_schema_edits(self):
]

# TODO: finish test


class TestClassificiationView:
    """ Placeholder tests for the classification view """

    def test_schema(self):
        """ Not implemented yet; passes trivially """
        return None
Loading

0 comments on commit 2788c47

Please sign in to comment.