Commit 666a68e
Merge pull request #135 from aigamedev/cost
Dataset Masking: Per-Sample Training Weight
alexjc committed Nov 22, 2015 (2 parents: e1c4f4b + 8593537)
Showing 3 changed files with 128 additions and 35 deletions.
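
In short, fit() gains an optional per-sample weight argument that scales each example's contribution to the training cost (a weight of 0.0 masks a sample out entirely). A minimal usage sketch based on the signatures and tests changed below; the data and layer choice are illustrative:

    import numpy as np
    from sknn.mlp import Regressor, Layer

    X = np.random.uniform(-1.0, +1.0, (8, 16))
    y = np.random.uniform(-1.0, +1.0, (8, 1))
    w = np.array([1.0] * 4 + [0.0] * 4)   # per-sample weights: the last four samples are masked out

    nn = Regressor(layers=[Layer("Linear")], n_iter=50)
    nn.fit(X, y, w)                       # the optional third argument is what this change adds
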
sknn/backend/lasagne/mlp.py (30 additions, 22 deletions)
@@ -40,7 +40,7 @@ def __init__(self, spec):
self.f = None
self.trainer = None
self.validator = None
self.cost = None
self.regularizer = None

def _create_mlp_trainer(self, params):
# Aggregate all regularization parameters into common dictionaries.
@@ -56,17 +56,19 @@ def _create_mlp_trainer(self, params):
self.regularize = 'L2'
penalty = getattr(lasagne.regularization, self.regularize.lower())
regularize = lasagne.regularization.apply_penalty
self.cost = sum(layer_decay[s.name] * regularize(l.get_params(tags={'regularizable': True}), penalty)
for s, l in zip(self.layers, self.mlp))
self.regularizer = sum(layer_decay[s.name] * regularize(l.get_params(tags={'regularizable': True}), penalty)
for s, l in zip(self.layers, self.mlp))

cost_functions = {'mse': 'squared_error', 'mcc': 'categorical_crossentropy'}
loss_type = self.loss_type or ('mcc' if self.is_classifier else 'mse')
assert loss_type in cost_functions,\
"Loss type `%s` not supported by Lasagne backend." % loss_type
self.cost_function = getattr(lasagne.objectives, cost_functions[loss_type])
cost_symbol = self.cost_function(self.network_output, self.data_output).mean()
if self.cost is not None:
cost_symbol = cost_symbol + self.cost
cost_symbol = self.cost_function(self.network_output, self.data_output)
cost_symbol = lasagne.objectives.aggregate(cost_symbol.T, self.data_mask, mode='mean')

if self.regularizer is not None:
cost_symbol = cost_symbol + self.regularizer
return self._create_trainer(params, cost_symbol)
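
For reference, lasagne.objectives.aggregate(loss, weights, mode='mean') scales the per-element loss by the weights before averaging, so the masked cost built above behaves roughly like the following numpy sketch (the function name is illustrative):

    import numpy as np

    def masked_mse(outputs, targets, sample_weights):
        # elementwise squared error, shape (n_samples, n_outputs)
        err = (outputs - targets) ** 2
        # transposing lets the per-sample weight vector broadcast across outputs,
        # mirroring aggregate(cost_symbol.T, self.data_mask, mode='mean') above
        return (err.T * sample_weights).mean()
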

def _create_trainer(self, params, cost):
@@ -81,8 +83,9 @@ def _create_trainer(self, params, cost):
raise NotImplementedError(
"Learning rule type `%s` is not supported." % self.learning_rule)

trainer = theano.function([self.data_input, self.data_output], cost,
trainer = theano.function([self.data_input, self.data_output, self.data_mask], cost,
updates=self._learning_rule,
on_unused_input='ignore',
allow_input_downcast=True)

compare = self.cost_function(self.network_output, self.data_correct).mean()
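
The extra on_unused_input='ignore' flag keeps Theano from rejecting the compiled function if the mask placeholder does not end up referenced in the cost graph. A minimal Theano sketch of what the flag permits (variable names are illustrative):

    import theano
    import theano.tensor as T

    X, y, m = T.matrix('X'), T.matrix('y'), T.vector('m')
    cost = ((X.sum(axis=1, keepdims=True) - y) ** 2).mean()   # m is not used here

    # with the default on_unused_input='raise', Theano would raise UnusedInputError for m
    f = theano.function([X, y, m], cost,
                        on_unused_input='ignore',
                        allow_input_downcast=True)
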
@@ -135,9 +138,10 @@ def _create_layer(self, name, layer, network):
num_units=layer.units,
nonlinearity=self._get_activation(layer))

def _create_mlp(self, X):
def _create_mlp(self, X, w=None):
self.data_input = T.tensor4('X') if self.is_convolution else T.matrix('X')
self.data_output = T.matrix('y')
self.data_mask = T.vector('m') if w is not None else T.scalar('m')
self.data_correct = T.matrix('yp')

lasagne.random.get_rng().seed(self.random_state)
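
The symbolic mask is a vector with one entry per sample when weights are supplied, and a scalar placeholder otherwise, so the compiled function takes the same list of inputs in both cases. The corresponding numpy shapes would be (sizes illustrative):

    import numpy as np

    X = np.zeros((128, 16), dtype=np.float32)   # (n_samples, n_features)
    y = np.zeros((128, 1), dtype=np.float32)    # (n_samples, n_outputs)
    w = np.ones((128,), dtype=np.float32)       # one weight per sample, matching T.vector('m')
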
@@ -183,12 +187,12 @@ def _create_mlp(self, X):
self.network_output = lasagne.layers.get_output(network, deterministic=True)
self.f = theano.function([self.data_input], self.network_output, allow_input_downcast=True)

def _initialize_impl(self, X, y=None):
def _initialize_impl(self, X, y=None, w=None):
if self.is_convolution:
X = numpy.transpose(X, (0, 3, 1, 2))

if self.mlp is None:
self._create_mlp(X)
self._create_mlp(X, w)

# Can do partial initialization when predicting, no trainer needed.
if y is None:
@@ -220,7 +224,7 @@ def _predict_impl(self, X):
X = numpy.transpose(X, (0, 3, 1, 2))
return self.f(X)

def _iterate_data(self, X, y, batch_size, shuffle=False):
def _iterate_data(self, batch_size, X, y, w, shuffle=False):
def cast(array):
if type(array) != numpy.ndarray:
array = array.todense()
@@ -233,22 +237,26 @@ def cast(array):

for start_idx in range(0, total_size - batch_size + 1, batch_size):
excerpt = indices[start_idx:start_idx + batch_size]
Xb, yb = cast(X[excerpt]), cast(y[excerpt])

yield Xb, yb
Xb, yb, wb = cast(X[excerpt]), cast(y[excerpt]), None
if w is not None:
wb = cast(w[excerpt])
yield Xb, yb, wb

def _print(self, text):
if self.verbose:
sys.stdout.write(text)
sys.stdout.flush()

def _batch_impl(self, X, y, processor, mode, output, shuffle):
def _batch_impl(self, X, y, w, processor, mode, output, shuffle):
progress, batches = 0, X.shape[0] / self.batch_size
loss, count = 0.0, 0
for Xb, yb in self._iterate_data(X, y, self.batch_size, shuffle):
for Xb, yb, wb in self._iterate_data(self.batch_size, X, y, w, shuffle):
self._do_callback('on_batch_start', locals())

loss += processor(Xb, yb)

if mode == 'train':
loss += processor(Xb, yb, wb if wb is not None else 1.0)
else:
loss += processor(Xb, yb)
count += 1

while count / batches > progress / 60:
@@ -260,11 +268,11 @@ def _batch_impl(self, X, y, processor, mode, output, shuffle):
self._print('\r')
return loss / count
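
Only the training pass forwards the weights; when none were given, the trainer still receives a constant 1.0, and under mean-mode aggregation a constant weight of 1.0 reproduces the plain unweighted cost:

    import numpy as np

    err = np.random.rand(8, 4)                  # per-element loss for one batch
    assert np.isclose((err * 1.0).mean(), err.mean())
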

def _train_impl(self, X, y):
return self._batch_impl(X, y, self.trainer, mode='train', output='.', shuffle=True)
def _train_impl(self, X, y, w=None):
return self._batch_impl(X, y, w, self.trainer, mode='train', output='.', shuffle=True)

def _valid_impl(self, X, y):
return self._batch_impl(X, y, self.validator, mode='valid', output=' ', shuffle=False)
def _valid_impl(self, X, y, w=None):
return self._batch_impl(X, y, w, self.validator, mode='valid', output=' ', shuffle=False)

@property
def is_initialized(self):
sknn/mlp.py (11 additions, 11 deletions)
@@ -31,14 +31,14 @@ class MultiLayerPerceptron(NeuralNetwork, sklearn.base.BaseEstimator):
def _setup(self):
pass

def _initialize(self, X, y=None):
def _initialize(self, X, y=None, w=None):
assert not self.is_initialized,\
"This neural network has already been initialized."
self._create_specs(X, y)

backend.setup()
self._backend = backend.MultiLayerPerceptronBackend(self)
return self._backend._initialize_impl(X, y)
return self._backend._initialize_impl(X, y, w)

def _check_layer(self, layer, required, optional=[]):
required.extend(['name', 'type'])
@@ -129,7 +129,7 @@ def _do_callback(self, event, variables):
else:
return self.callback(event, **variables)

def _train(self, X, y):
def _train(self, X, y, w=None):
assert self.n_iter or self.n_stable,\
"Neither n_iter nor n_stable were specified; training would loop forever."

@@ -143,7 +143,7 @@ def _train(self, X, y):
self._do_callback('on_epoch_start', locals())

is_best_train = False
avg_train_error = self._backend._train_impl(X, y)
avg_train_error = self._backend._train_impl(X, y, w)
if avg_train_error is not None:
if math.isnan(avg_train_error):
raise RuntimeError("Training diverged and returned NaN.")
@@ -196,14 +196,14 @@ def _train(self, X, y):
self._do_callback('on_train_finish', locals())
self._backend._array_to_mlp(best_params, self._backend.mlp)

def _fit(self, X, y):
def _fit(self, X, y, w=None):
assert X.shape[0] == y.shape[0],\
"Expecting same number of input and output samples."
data_shape, data_size = X.shape, X.size+y.size
X, y = self._reshape(X, y)

if not self.is_initialized:
X, y = self._initialize(X, y)
X, y = self._initialize(X, y, w)

log.info("Training on dataset of {:,} samples with {:,} total size.".format(data_shape[0], data_size))
if data_shape[1:] != X.shape[1:]:
@@ -224,7 +224,7 @@ def _fit(self, X, y):
"\n------------------------------------------------------------")

try:
self._train(X, y)
self._train(X, y, w)
except RuntimeError as e:
log.error("\n{}{}{}\n\n{}\n".format(
ansi.RED,
@@ -262,7 +262,7 @@ class Regressor(MultiLayerPerceptron, sklearn.base.RegressorMixin):
# Regressor compatible with sklearn that wraps various NN implementations.
# The constructor and bulk of documentation is inherited from MultiLayerPerceptron.

def fit(self, X, y):
def fit(self, X, y, w=None):
"""Fit the neural network to the given continuous data as a regression problem.
Parameters
@@ -283,7 +283,7 @@ def fit(self, X, y):
if self.valid_set is not None:
self.valid_set = self._reshape(*self.valid_set)

return super(Regressor, self)._fit(X, y)
return super(Regressor, self)._fit(X, y, w)

def predict(self, X):
"""Calculate predictions for specified inputs.
@@ -322,7 +322,7 @@ def _setup(self):
import sklearn.preprocessing.label as spl
spl.type_of_target = lambda _: "multiclass"

def fit(self, X, y):
def fit(self, X, y, w=None):
"""Fit the neural network to symbolic labels as a classification problem.
Parameters
@@ -369,7 +369,7 @@ def fit(self, X, y):
self.valid_set = (X_v, y_vp)

# Now train based on a problem transformed into regression.
return super(Classifier, self)._fit(X, yp)
return super(Classifier, self)._fit(X, yp, w)
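
A sketch of the same mechanism on the classification side, for example to up-weight a rare label; the data, layer choice, and 9:1 weighting are illustrative:

    import numpy as np
    from sknn.mlp import Classifier, Layer

    X = np.random.uniform(-1.0, +1.0, (200, 8))
    y = (np.random.rand(200, 1) < 0.1).astype(np.int32)   # rare positive class
    w = np.where(y[:, 0] == 1, 9.0, 1.0)                   # per-sample weights favour the rare label

    clf = Classifier(layers=[Layer("Softmax")], n_iter=25)
    clf.fit(X, y, w)
    proba = clf.predict_proba(X)
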

def partial_fit(self, X, y, classes=None):
if y.ndim == 1:
sknn/tests/test_data.py (87 additions, 2 deletions)
@@ -1,10 +1,11 @@
import random
import unittest
from nose.tools import (assert_in, assert_raises, assert_equals, assert_true)
from nose.tools import (assert_greater, assert_less, assert_raises, assert_equals, assert_true)

import logging

import numpy
from sknn.mlp import Regressor as MLPR
from sknn.mlp import Regressor as MLPR, Classifier as MLPC
from sknn.mlp import Layer as L, Convolution as C


@@ -88,3 +89,87 @@ def test_SetLayerParamsDict(self):
p = nn.get_parameters()
assert_true((p[1].weights.astype('float32') == weights.astype('float32')).all())
assert_true((p[1].biases.astype('float32') == biases.astype('float32')).all())


class TestMaskedDataRegression(unittest.TestCase):

def check(self, a_in, a_out, a_mask):
nn = MLPR(layers=[L("Linear")], learning_rule='adam', n_iter=50)
nn.fit(a_in, a_out, a_mask)
v_out = nn.predict(a_in)

# Make sure the examples weighted 1.0 have low error, 0.0 high error.
print(abs(a_out - v_out).T * a_mask)
assert_true((abs(a_out - v_out).T * a_mask < 5E-2).all())
assert_true((abs(a_out - v_out).T * (1.0 - a_mask) > 5E-1).any())

def test_SingleOutputOne(self):
a_in = numpy.random.uniform(-1.0, +1.0, (8,16))
a_out = numpy.random.randint(2, size=(8,1)).astype(numpy.float32)
a_mask = (0.0 + a_out).flatten()

self.check(a_in, a_out, a_mask)

def test_SingleOutputZero(self):
a_in = numpy.random.uniform(-1.0, +1.0, (8,16))
a_out = numpy.random.randint(2, size=(8,1)).astype(numpy.float32)
a_mask = (1.0 - a_out).flatten()

self.check(a_in, a_out, a_mask)

def test_SingleOutputNegative(self):
a_in = numpy.random.uniform(-1.0, +1.0, (8,16))
a_out = numpy.random.randint(2, size=(8,1)).astype(numpy.float32)
a_mask = (0.0 + a_out).flatten()
a_out = -1.0 * 2.0 + a_out

self.check(a_in, a_out, a_mask)

def test_MultipleOutputRandom(self):
a_in = numpy.random.uniform(-1.0, +1.0, (8,16))
a_out = numpy.random.randint(2, size=(8,4)).astype(numpy.float32)
a_mask = numpy.random.randint(2, size=(8,)).astype(numpy.float32)

self.check(a_in, a_out, a_mask)
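
The intuition these regression tests encode: samples with weight 0.0 drop out of the weighted objective, so only the weight-1.0 rows constrain the fit. A small closed-form analogue using ordinary weighted least squares (numpy only, purely illustrative):

    import numpy as np

    X = np.random.uniform(-1.0, +1.0, (8, 3))
    beta_true = np.array([1.0, -2.0, 0.5])
    y = X.dot(beta_true)
    w = np.array([1.0] * 4 + [0.0] * 4)

    # weighted normal equations: zero-weight rows contribute nothing
    Xw = X * w[:, None]
    beta = np.linalg.solve(Xw.T.dot(X), Xw.T.dot(y))
    assert np.allclose(beta, beta_true)
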


class TestMaskedDataClassification(unittest.TestCase):

def check(self, a_in, a_out, a_mask, act='Softmax'):
nn = MLPC(layers=[L(act)], learning_rule='rmsprop', n_iter=100)
nn.fit(a_in, a_out, a_mask)
print(nn.classes_)
return nn.predict_proba(a_in)

def test_TwoLabelsOne(self):
# Only one sample has the value 1 with weight 1.0, but all 0s are weighted 0.0.
a_in = numpy.random.uniform(-1.0, +1.0, (16,4))
a_out = numpy.zeros((16,1), dtype=numpy.int32)
a_out[0] = 1
a_mask = (0.0 + a_out).flatten()

a_test = self.check(a_in, a_out, a_mask).mean(axis=0)
assert_greater(a_test[1], a_test[0] * 1.5)

def test_TwoLabelsZero(self):
# Only one sample has the value 0 with weight 1.0, but all 1s are weighted 0.0.
a_in = numpy.random.uniform(-1.0, +1.0, (16,4))
a_out = numpy.ones((16,1), dtype=numpy.int32)
a_out[-1] = 0
a_mask = (1.0 - a_out).flatten()

a_test = self.check(a_in, a_out, a_mask).mean(axis=0)
assert_greater(a_test[0], a_test[1] * 1.5)

def test_FourLabels(self):
# Only one multi-label sample has weight 1.0; the others have weight 0.0. Check probabilities!
chosen = random.randint(0, 15)  # randint is inclusive, so keep the index within the 16 samples
a_in = numpy.random.uniform(-1.0, +1.0, (16,4))
a_out = numpy.random.randint(2, size=(16,4))
a_mask = numpy.zeros((16,), dtype=numpy.int32)
a_mask[chosen] = 1.0

a_test = self.check(a_in, a_out, a_mask, act="Sigmoid").mean(axis=0)
for i in range(a_out.shape[1]):
compare = assert_greater if a_out[chosen][i]==0 else assert_less
compare(a_test[i*2], a_test[i*2+1])
