Commit defa5614493b6f2be2564623fc28f707d46ee1e8
1 parent: 3ef6ddf1
Exists in master and in 1 other branch
mnist re-testing...
Showing 3 changed files with 221 additions and 31 deletions
mmodel/theano/THEANO.py
@@ -37,38 +37,223 @@ class ModelTHEANO(ModelBase):
         self.sparker = sc
         self.model = None
 
-    def _shared_dataset(self, data_xy, borrow=True):
-        """ Function that loads the dataset into shared variables
-
-        The reason we store our dataset in shared variables is to allow
-        Theano to copy it into the GPU memory (when code is run on GPU).
-        Since copying data into the GPU is slow, copying a minibatch every time
-        one is needed (the default behaviour if the data is not in a shared
-        variable) would lead to a large decrease in performance.
-        """
-        data_x, data_y = data_xy
-        shared_x = theano.shared(np.asarray(data_x,
-                                            dtype=theano.config.floatX),
-                                 borrow=borrow)
-        shared_y = theano.shared(np.asarray(data_y,
-                                            dtype=theano.config.floatX),
-                                 borrow=borrow)
-        # When storing data on the GPU it has to be stored as floats,
-        # therefore we store the labels as ``floatX`` as well
-        # (``shared_y`` does exactly that). But during our computations
-        # we need them as ints (we use labels as indices, and if they are
-        # floats that doesn't make sense), therefore instead of returning
-        # ``shared_y`` we have to cast it to int. This little hack
-        # lets us get around the issue.
-        return shared_x, T.cast(shared_y, 'int32')
-
-    def _train_cnn(self, X=None, Y=None, dataset=os.path.join(package_dir, '../../res/', 'ils_crop.pkl'),
+    def _train_cnn(self, X=None, Y=None, dataset=os.path.join(package_dir, '../../res/', 'mnist.pkl.gz'),
                    learning_rate=0.1, n_epochs=200,
                    nkerns=[20, 50, 50],
                    batch_size=400):
 
-        return train_cnn_example(X, Y, dataset=dataset, learning_rate=learning_rate, n_epochs=n_epochs, nkerns=nkerns,
-                                 batch_size=batch_size)
+        # return train_cnn_example(X, Y, dataset=dataset, learning_rate=learning_rate, n_epochs=n_epochs, nkerns=nkerns,
+        #                          batch_size=batch_size)
+
+        with gzip.open(dataset, 'rb') as f:
+            train_set, valid_set, test_set = cPickle.load(f)
+
+        train_set_x, train_set_y = shared_dataset(train_set)
+        valid_set_x, valid_set_y = shared_dataset(valid_set)
+        test_set_x, test_set_y = shared_dataset(test_set)
+
+        # compute number of minibatches for training, validation and testing
+        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
+        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
+        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
+        n_train_batches /= batch_size
+        n_valid_batches /= batch_size
+        n_test_batches /= batch_size
+
+        print train_set_x.get_value(borrow=True).shape, train_set_y.get_value(borrow=True).shape
+
+        rng = np.random.RandomState(12306)
+        index = T.lscalar()  # index to a [mini]batch
+        # start-snippet-1
+        x = T.matrix('x')    # the data is presented as rasterized images
+        y = T.ivector('y')   # the labels are presented as a 1D vector of
+                             # [int] labels
+
+        ######################
+        # BUILD ACTUAL MODEL #
+        ######################
+        print '... building the model'
+
+        layer0_input = x.reshape((batch_size, 1, 28, 28))
+
+        # Construct the first convolutional pooling layer:
+        # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24)
+        # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
+        # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
+        layer0 = ConvPoolLayer(
+            rng,
+            input=layer0_input,
+            image_shape=(batch_size, 1, 28, 28),
+            filter_shape=(nkerns[0], 1, 5, 5),
+            poolsize=(2, 2)
+        )
+
+        # Construct the second convolutional pooling layer:
+        # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
+        # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
+        # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
+        layer1 = ConvPoolLayer(
+            rng,
+            input=layer0.output,
+            image_shape=(batch_size, nkerns[0], 12, 12),
+            filter_shape=(nkerns[1], nkerns[0], 5, 5),
+            poolsize=(2, 2)
+        )
+
+        # The HiddenLayer is fully connected, so it operates on 2D matrices of
+        # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
+        # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
+        # or (500, 50 * 4 * 4) = (500, 800) with the default values.
+        layer2_input = layer1.output.flatten(2)
+
+        # construct a fully-connected sigmoidal layer
+        layer2 = HiddenLayer(
+            rng,
+            input=layer2_input,
+            n_in=nkerns[1] * 4 * 4,
+            n_out=500,
+            activation=T.tanh
+        )
+
+        # classify the values of the fully-connected sigmoidal layer
+        layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)
+
+        # the cost we minimize during training is the NLL of the model
+        cost = layer3.negative_log_likelihood(y)
+
+        # create a function to compute the mistakes made by the model
+        test_model = theano.function(
+            [index],
+            layer3.errors(y),
+            givens={
+                x: test_set_x[index * batch_size: (index + 1) * batch_size],
+                y: test_set_y[index * batch_size: (index + 1) * batch_size]
+            }
+        )
+
+        validate_model = theano.function(
+            [index],
+            layer3.errors(y),
+            givens={
+                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
+                y: valid_set_y[index * batch_size: (index + 1) * batch_size]
+            }
+        )
+
+        # create a list of all model parameters to be fit by gradient descent
+        params = layer3.params + layer2.params + layer1.params + layer0.params
+
+        # create a list of gradients for all model parameters
+        grads = T.grad(cost, params)
+
+        # train_model is a function that updates the model parameters by
+        # SGD. Since this model has many parameters, it would be tedious to
+        # manually create an update rule for each model parameter. We thus
+        # create the updates list by automatically looping over all
+        # (params[i], grads[i]) pairs.
+        updates = [
+            (param_i, param_i - learning_rate * grad_i)
+            for param_i, grad_i in zip(params, grads)
+        ]
+
+        train_model = theano.function(
+            [index],
+            cost,
+            updates=updates,
+            givens={
+                x: train_set_x[index * batch_size: (index + 1) * batch_size],
+                y: train_set_y[index * batch_size: (index + 1) * batch_size]
+            }
+        )
+        # end-snippet-1
+
+        ###############
+        # TRAIN MODEL #
+        ###############
+        print '... training'
+        # early-stopping parameters
+        patience = 10000  # look at this many examples regardless
+        patience_increase = 2  # wait this much longer when a new best is
+                               # found
+        improvement_threshold = 0.995  # a relative improvement of this much is
+                                       # considered significant
+        validation_frequency = min(n_train_batches, patience / 2)
+                                      # go through this many
+                                      # minibatches before checking the network
+                                      # on the validation set; in this case we
+                                      # check every epoch
+
+        best_validation_loss = np.inf
+        best_iter = 0
+        test_score = 0.
+        start_time = time.clock()
+
+        epoch = 0
+        done_looping = False
+
+        while (epoch < n_epochs) and (not done_looping):
+            epoch = epoch + 1
+            for minibatch_index in xrange(n_train_batches):
+
+                iter = (epoch - 1) * n_train_batches + minibatch_index
+
+                if iter % 100 == 0:
+                    print 'training @ iter = ', iter
+                cost_ij = train_model(minibatch_index)
+
+                if (iter + 1) % validation_frequency == 0:
+
+                    # compute zero-one loss on validation set
+                    validation_losses = [validate_model(i) for i
+                                         in xrange(n_valid_batches)]
+                    this_validation_loss = np.mean(validation_losses)
+                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
+                          (epoch, minibatch_index + 1, n_train_batches,
+                           this_validation_loss * 100.))
+
+                    # if we got the best validation score until now
+                    if this_validation_loss < best_validation_loss:
+
+                        # improve patience if loss improvement is good enough
+                        if this_validation_loss < best_validation_loss * \
+                           improvement_threshold:
+                            patience = max(patience, iter * patience_increase)
+
+                        # save best validation score and iteration number
+                        best_validation_loss = this_validation_loss
+                        best_iter = iter
+
+                        # test it on the test set
+                        test_losses = [
+                            test_model(i)
+                            for i in xrange(n_test_batches)
+                        ]
+                        test_score = np.mean(test_losses)
+                        print((' epoch %i, minibatch %i/%i, test error of '
+                               'best model %f %%') %
+                              (epoch, minibatch_index + 1, n_train_batches,
+                               test_score * 100.))
+
+                if patience <= iter:
+                    done_looping = True
+                    break
+
+        end_time = time.clock()
+        print('Optimization complete.')
+        print('Best validation score of %f %% obtained at iteration %i, '
+              'with test performance %f %%' %
+              (best_validation_loss * 100., best_iter + 1, test_score * 100.))
+        print >> sys.stderr, ('The code for file ' +
+                              os.path.split(__file__)[1] +
+                              ' ran for %.2fm' % ((end_time - start_time) / 60.))
+
+
+
+
+
+
+
+
 
 
     def train(self, X, Y):
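
Note: the feature-map sizes quoted in the comments of the new _train_cnn follow the usual valid-convolution-plus-max-pooling arithmetic. A minimal check of those numbers (hypothetical helper, not part of this commit):

    # Hypothetical helper, not part of this commit: reproduces the feature-map
    # sizes quoted in the _train_cnn comments (28 -> 24 -> 12, then 12 -> 8 -> 4).
    def conv_pool_output_size(input_size, filter_size=5, pool_size=2):
        conv_size = input_size - filter_size + 1  # 'valid' convolution
        return conv_size // pool_size             # non-overlapping max-pooling

    assert conv_pool_output_size(28) == 12  # layer0 output: (batch_size, nkerns[0], 12, 12)
    assert conv_pool_output_size(12) == 4   # layer1 output: (batch_size, nkerns[1], 4, 4)
    # flattened input to the hidden layer: nkerns[1] * 4 * 4 = 50 * 16 = 800 units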
mmodel/theano/theanoutil.py
@@ -168,7 +168,7 @@ class ConvPoolLayer(object):
         self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
         self.params = [self.W, self.b]
 
-def _shared_dataset(data_xy, borrow=True):
+def shared_dataset(data_xy, borrow=True):
     """ Function that loads the dataset into shared variables
 
     The reason we store our dataset in shared variables is to allow
@@ -208,8 +208,8 @@ def train_cnn_example(X=None, Y=None, dataset=os.path.join('', '../../res/', 'il
     else:
         X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0)
 
-    X_train, Y_train = _shared_dataset((X_train, Y_train))
-    X_test, Y_test = _shared_dataset((X_test, Y_test))
+    X_train, Y_train = shared_dataset((X_train, Y_train))
+    X_test, Y_test = shared_dataset((X_test, Y_test))
 
     # X_train = theano.shared(np.asarray(X_train, dtype=theano.config.floatX), borrow=True)
     # Y_train = theano.shared(np.asarray(Y_train, dtype=theano.config.floatX), borrow=True)
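
Note: the rename makes shared_dataset the shared helper used by both THEANO.py and theanoutil.py. A minimal usage sketch on toy data (not part of this commit; per the helper's docstring it returns the features as a floatX shared variable and the labels cast to int32):

    # Toy data only; the import path is assumed from the file layout in this diff.
    import numpy as np
    from theanoutil import shared_dataset  # hypothetical import path

    toy_x = np.random.rand(10, 784)            # ten fake rasterized 28x28 images
    toy_y = np.random.randint(0, 10, size=10)  # ten fake digit labels

    shared_x, shared_y = shared_dataset((toy_x, toy_y))
    print(shared_x.get_value(borrow=True).shape)  # (10, 784), stored as floatX
    print(shared_y.dtype)                         # 'int32' after the cast in the helper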
test/test_model.py
@@ -149,6 +149,11 @@ def test_SVM_ILSVRC_S():
     # test_SVM_ILSVRC_SPARK()
 
 
+def test_THEANO_mnist():
+    mtheano = THEANO.ModelTHEANO(toolset='cnn')
+    mtheano._train_cnn(learning_rate=0.1, n_epochs=200, dataset=os.path.join(package_dir, '../res/', 'mnist.pkl.gz'), nkerns=[20, 50], batch_size=500)
+
+
 def test_THEANO_crop():
     timer.mark()
     dilc = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_crop_pil')
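
Note: for a quick local smoke run of the new MNIST path, a direct call mirroring test_THEANO_mnist could look like the sketch below; the import path, dataset location, and single epoch are illustrative assumptions, and mnist.pkl.gz (the standard train/valid/test pickle) must already be present.

    # Hypothetical smoke run mirroring test_THEANO_mnist; not part of this commit.
    import os
    from mmodel.theano import THEANO  # import path assumed from the file layout

    mtheano = THEANO.ModelTHEANO(toolset='cnn')
    mtheano._train_cnn(learning_rate=0.1,
                       n_epochs=1,  # one epoch for a quick check
                       dataset=os.path.join('res', 'mnist.pkl.gz'),
                       nkerns=[20, 50],
                       batch_size=500)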