Reputation: 331
I'm trying to get my feet wet with Theano and deep nets by starting with a very simple implementation of a three-layer feed-forward neural network and testing it on the MNIST data set.
I am using a rudimentary implementation of stochastic gradient descent to start out with, but the network is not training properly: its parameters are not being updated.
I was wondering if anyone could point out what I'm doing wrong.
The following code is my lstm module. I've called it that because I plan on implementing LSTM networks in the future.
import theano, theano.tensor as T
import numpy as np
from collections import OrderedDict

np_rng = np.random.RandomState(1234)


class FeedForwardLayer(object):
    def __init__(self, input_size, hidden_size, activation):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.activation = activation
        self.create_layer()

    def create_layer(self):
        self.W = create_shared(self.hidden_size, self.input_size, "weight")
        self.b = create_shared(self.hidden_size, name="bias")

    def activate(self, x):
        if x.ndim > 1:
            return self.activation(T.dot(self.W, x.T) + self.b[:, None]).T
        else:
            return self.activation(T.dot(self.W, x) + self.b)

    @property
    def params(self):
        return [self.W, self.b]

    @params.setter
    def params(self, param_list):
        self.W.set_value(param_list[0])
        self.b.set_value(param_list[1])


class Network(object):
    def __init__(self, input_size, celltype=FeedForwardLayer, layer_sizes=None):
        self.input_size = input_size
        self.celltype = celltype
        self.layer_sizes = layer_sizes
        self.create_layers()

    def create_layers(self):
        self.layers = []
        input_size = self.input_size
        for layer_size in self.layer_sizes:
            self.layers.append(self.celltype(input_size, layer_size, activation=T.nnet.sigmoid))
            input_size = layer_size

    def forward(self, x):
        out = []
        layer_input = x
        for layer in self.layers:
            layer_out = layer.activate(layer_input)
            out.append(layer_out)
            layer_input = layer_out
        return out

    @property
    def params(self):
        return [param for layer in self.layers for param in layer.params]

    @params.setter
    def params(self, param_list):
        start = 0
        for layer in self.layers:
            end = start + len(layer.params)
            layer.params = param_list[start:end]
            start = end


def create_shared(m, n=None, name=None):
    if n is None:
        return theano.shared(np_rng.standard_normal((m,)), name=name)
    else:
        return theano.shared(np_rng.standard_normal((m, n)), name=name)


def optimization_updates(cost, params, lr=.01):
    """
    Implements stochastic gradient descent.

    Inputs
    ---------------
    cost   -- theano variable to minimize
    params -- network weights to take the gradient with respect to
    lr     -- learning rate
    """
    lr = theano.shared(np.float64(lr).astype(theano.config.floatX))
    gparams = T.grad(cost, params)
    updates = OrderedDict()
    for gparam, param in zip(gparams, params):
        updates[param] = param - lr * gparam
    return updates
The following code is where I create, train, and test a simple three-layer feed-forward network on the MNIST data set.
from lstm import Network
import theano, theano.tensor as T
import numpy as np
import lstm as L
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer
# load and normalize dataset
digits = load_digits()
X = digits.data
y = digits.target
X -= X.min()
X /= X.max()
# create network
model = Network(64, layer_sizes=[100, 10])
# prepare training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y)
labels_train = LabelBinarizer().fit_transform(y_train)
labels_test = LabelBinarizer().fit_transform(y_test)
data = T.vector()
result = model.forward(data)[-1]
label = T.vector()
cost = (result - label).norm(L=2)
updates = L.optimization_updates(cost, model.params)
update = theano.function([data, label], cost, updates=updates, allow_input_downcast=True)
predict = theano.function([data], result, allow_input_downcast=True)
for X, y in zip(X_train, labels_train):
    c = update(X, y)

predictions = []
for X in X_test:
    prediction = predict(X)
    predictions.append(np.argmax(prediction))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
The problem I'm facing is that the parameters are not being updated properly. I'm not sure whether that's because I'm not computing the gradient correctly or because I'm not using the Theano function correctly.
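For reference, a minimal check of whether the shared variables move at all after a single call to update (just a debugging sketch, reusing the model, update, X_train, and labels_train defined above) could look like this:

# debugging sketch: snapshot the parameters, run one update, and compare
before = [p.get_value().copy() for p in model.params]
c = update(X_train[0], labels_train[0])
after = [p.get_value() for p in model.params]
for p, b, a in zip(model.params, before, after):
    print("%s changed: %s" % (p.name, not np.allclose(b, a)))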
Upvotes: 1
Views: 258
Reputation: 356
You have to make more than one pass over the dataset when using stochastic gradient descent. It is not unusual for the classification error and the confusion matrix to change very little during the first epoch, especially if the dataset is small.
I made the following change to your code to train for 100 epochs:
for i in xrange(100):
    for X, y in zip(X_train, labels_train):
        c = update(X, y)
The confusion matrix seems to have started improving:
[[ 0 0 18 0 13 4 5 0 5 0]
[ 0 42 0 2 0 0 0 0 2 0]
[ 0 0 51 0 0 0 0 1 0 0]
[ 0 0 0 45 0 1 0 1 2 0]
[ 0 0 0 0 33 0 0 0 0 0]
[ 0 0 0 0 0 47 0 0 0 0]
[ 0 0 0 0 0 0 45 0 0 0]
[ 0 0 0 0 1 0 0 48 0 0]
[ 0 2 1 0 0 0 0 0 34 0]
[ 0 1 0 25 0 3 0 2 16 0]]
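You could also shuffle the training examples each epoch and keep an eye on the average cost so you can see whether learning is progressing. A sketch, reusing your update, X_train, and labels_train:

for epoch in xrange(100):
    # visit the examples in a fresh random order each epoch
    order = np.random.permutation(len(X_train))
    costs = [update(X_train[i], labels_train[i]) for i in order]
    print("epoch %d, mean cost %.4f" % (epoch, np.mean(costs)))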
Upvotes: 2