-
Notifications
You must be signed in to change notification settings - Fork 1
/
classification.py
165 lines (117 loc) · 6.21 KB
/
classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import h5py
from flask_jsonpify import json
from keras.layers import Dense, Dropout
from keras.models import Sequential, model_from_json
from keras.wrappers.scikit_learn import KerasClassifier
from numpy import array
from rq import Queue
from sklearn.model_selection import GridSearchCV, cross_val_score

import storage
def load_classifier(dataset_id):
    """Rebuild the stored Keras model architecture for a data set.

    :param dataset_id: identifier of the data set whose network was saved
    :return: an uncompiled Keras model reconstructed from its JSON definition
    """
    # Fetch the architecture JSON previously persisted for this data set
    stored_architecture = storage.get_nn(dataset_id=dataset_id)
    return model_from_json(stored_architecture)
def load_data(dataset_id):
    """Load the pre-processed training/validation split for a data set.

    :param dataset_id: identifier of the data set to load
    :return: tuple of (X_train, y_train, X_validate, y_validate, scaler)
    """
    # Pull the processed payload out of storage and decode it
    payload = json.loads(storage.get_processed(dataset_id=dataset_id))
    # Convert each split into a numpy array for the model APIs
    X_train, y_train, X_validate, y_validate = (
        array(payload.get(key))
        for key in ('X_train', 'y_train', 'X_validate', 'y_validate')
    )
    # The scaler records how the features were scaled for this data set
    scaler = payload.get('scaler')
    return X_train, y_train, X_validate, y_validate, scaler
def build(dataset_id, additional_hidden_layers=1, include_dropouts=True):
    """Build and persist a binary-classification network for a data set.

    :param dataset_id: identifier of the data set to build a network for
    :param additional_hidden_layers: number of hidden layers beyond the first
    :param include_dropouts: if True, add a Dropout(0.1) after each hidden layer
    :return: the network architecture serialized as a JSON string
    """
    # Load the data we'll use for the neural network
    X_train, y_train, X_validate, y_validate, scaler = load_data(dataset_id=dataset_id)
    # Get the dimensions of the training data so we can correctly set the input nodes
    input_dimensions = X_train.shape[1]
    # We only support one output dimension at the moment
    output_dimensions = 1
    # Rule of thumb: hidden units = mean of input and output dimensions.
    # BUG FIX: use integer division — `/` yields a float in Python 3 and
    # Keras Dense expects an integer unit count.
    units = (input_dimensions + output_dimensions) // 2
    classifier = Sequential()
    classifier.add(Dense(units=units, kernel_initializer='uniform', activation='relu', input_dim=input_dimensions))
    if include_dropouts:
        classifier.add(Dropout(rate=0.1))
    # Create additional hidden layers as requested (a for-loop replaces the
    # original manual while-counter; zero or negative counts add nothing)
    for _ in range(additional_hidden_layers):
        classifier.add(Dense(units=units, kernel_initializer='uniform', activation='relu'))
        if include_dropouts:
            classifier.add(Dropout(rate=0.1))
    # Sigmoid output for the single binary label
    classifier.add(Dense(units=output_dimensions, kernel_initializer='uniform', activation='sigmoid'))
    # Get the JSON for the configured classifier
    classifier_json = classifier.to_json()
    # Before returning the classifier, we need to save it
    storage.save_nn(dataset_id=dataset_id, json=classifier_json)
    return classifier_json
def fit(dataset_id, batch_size=10, epochs=100):
    """Queue an asynchronous training run for the given data set.

    :param dataset_id: identifier of the data set to train on
    :param batch_size: mini-batch size passed through to Keras
    :param epochs: number of training epochs passed through to Keras
    :return: a JSON string describing the queued job's status
    """
    # Fetch the training split up front so the worker doesn't have to
    X_train, y_train, X_validate, y_validate, scaler = load_data(dataset_id=dataset_id)
    # Hand the heavy lifting off to a background worker via the queue
    status = queue_work(function_to_queue=fit_async,
                        args=(dataset_id, X_train, y_train, batch_size, epochs),
                        dataset_id=dataset_id)
    return json.dumps(status)
def fit_async(dataset_id, X_train, y_train, batch_size, epochs):
    """Worker-side training: compile, fit, and persist the trained model.

    :param dataset_id: identifier of the data set being trained
    :param X_train: training features (numpy array)
    :param y_train: training labels (numpy array)
    :param batch_size: mini-batch size for the fit
    :param epochs: number of epochs for the fit
    :return: True once the trained model has been saved
    """
    # Rebuild the saved architecture for this data set
    model = load_classifier(dataset_id=dataset_id)
    # Standard settings here — setting variation is left to the optimizer
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)
    # Persist the trained weights back to the file system
    model.save('uploads/' + dataset_id + '_nn_trained.h5')
    return True
def queue_work(function_to_queue, args, dataset_id):
    """Queue a background job on RQ (at most one per data set) and report status.

    :param function_to_queue: the callable to run in the worker
    :param args: positional arguments tuple for the callable
    :param dataset_id: identifier used as the RQ job id (one job per data set)
    :return: dict with 'dataset_id', 'job_status' and optional 'logs'
    """
    # Set a default response
    response = {'dataset_id': dataset_id, 'job_status': None, 'logs': None}
    # Tell RQ what Redis connection to use
    queue = Queue(connection=storage.get_database())
    # Check to see if this data set is already running
    job = queue.fetch_job(dataset_id)
    if job is not None:
        if job.result is None:
            # Still running: report progress plus any captured training logs
            response['job_status'] = 'The training for the provided data set is currently running'
            logs_json = storage.get_training_logs(dataset_id=dataset_id)
            if logs_json is not None:
                response['logs'] = json.loads(logs_json)
        elif job.result is True:
            response['job_status'] = 'The training has been completed'
        else:
            response['job_status'] = 'Something has gone wrong with the training'
    else:
        # BUG FIX: use >= so the cap matches the advertised 10-job limit —
        # the original `> 10` let an 11th job through before refusing.
        if queue.job_ids is not None and len(queue.job_ids) >= 10:
            response['job_status'] = 'The server currently has 10 jobs running. Please try again later'
        else:
            # Queue the training function with a 1 hour timeout.
            # As the result TTL is also 1 hour, training for a data set can only
            # be performed once per hour — the cached result blocks re-queueing.
            queue.enqueue_call(func=function_to_queue, args=args, timeout='1h', job_id=dataset_id, result_ttl=3600)
            # Tell the caller that the training has started
            response['job_status'] = 'The training has been started successfully'
    return response
def evaluate(X, y, classifier_build_fn, batch_size=10, epochs=100):
    """Estimate classifier accuracy with 10-fold cross-validation.

    :param X: feature matrix
    :param y: label vector
    :param classifier_build_fn: callable that builds a compiled Keras model
    :param batch_size: mini-batch size for each fold's fit
    :param epochs: epochs for each fold's fit
    :return: dict with the mean accuracy and its variance (std) across folds
    """
    classifier = KerasClassifier(build_fn=classifier_build_fn, batch_size=batch_size, epochs=epochs)
    # BUG FIX: cross_val_score was never imported anywhere in this module,
    # so calling evaluate() raised NameError — the import block now provides it.
    accuracies = cross_val_score(estimator=classifier, X=X, y=y, cv=10, n_jobs=-1)
    return {'mean': accuracies.mean(), 'variance': accuracies.std()}
def optimize(dataset_id, X, y, classifier_build_fn, batch_size_options=None, epochs_options=None):
    """Grid-search batch size, epochs and optimizer for a data set's classifier.

    :param dataset_id: identifier passed through the grid to the build function
    :param X: feature matrix
    :param y: label vector
    :param classifier_build_fn: callable that builds a compiled Keras model
    :param batch_size_options: batch sizes to try (defaults to [25, 32])
    :param epochs_options: epoch counts to try (defaults to [100, 500])
    :return: dict with the best parameter set and the best mean accuracy
    """
    # BUG FIX: mutable list defaults replaced with the None-sentinel idiom;
    # the effective defaults are unchanged for existing callers.
    if batch_size_options is None:
        batch_size_options = [25, 32]
    if epochs_options is None:
        epochs_options = [100, 500]
    classifier = KerasClassifier(build_fn=classifier_build_fn)
    # dataset_id rides along in the grid so the build function can load its data
    parameters = {'batch_size': batch_size_options,
                  'epochs': epochs_options,
                  'optimizer': ['adam', 'rmsprop'],
                  'dataset_id': [dataset_id]}
    grid_search = GridSearchCV(estimator=classifier,
                               param_grid=parameters,
                               scoring='accuracy',
                               cv=10)
    grid_search = grid_search.fit(X, y)
    return {'best_parameters': grid_search.best_params_, 'best_accuracy': grid_search.best_score_}