Mad Wombat

Reputation: 15105

Training MLP in Theano

I am a bit stuck trying to train a pretty standard MLP model using Theano. My model code looks like this:

import numpy as np
import theano
import theano.tensor as T

class Layer(object):
    def __init__(self, inputs, n_in, n_out, activation=T.nnet.softmax):
        def weights(shape):
            return np.array(np.random.uniform(size=shape), dtype='float64')
        def biases(size):
            return np.zeros((size), dtype='float64')
        
        self.W = theano.shared(value=weights((n_in, n_out)), name='weights', borrow=True)
        self.b = theano.shared(value=biases(n_out), name='biases', borrow=True)
        self.output = activation(T.dot(inputs, self.W) + self.b)
        self.pred = T.argmax(self.output, axis=1)
        self.params = [self.W, self.b]

class MLP(object):
    def __init__(self, inputs, n_in, n_hidden, n_out):
        """ for now lets go with one hidden layer"""
        self._hidden = Layer(inputs, n_in, n_hidden, activation=T.tanh)
        self._output = Layer(self._hidden.output, n_hidden, n_out) # softmax by default        
    def loss(self, one_hot):
        return T.mean(T.sqr(one_hot - self._output.output))
    def accuracy(self, y):
        return T.mean(T.eq(self._output.pred, y))    
    def updates(self, loss, rate=0.01):
        updates = []
        updates.append((self._hidden.W, self._hidden.W - rate * T.grad(cost=loss, wrt=self._hidden.W)))
        updates.append((self._hidden.b, self._hidden.b - rate * T.grad(cost=loss, wrt=self._hidden.b)))
        updates.append((self._output.W, self._output.W - rate * T.grad(cost=loss, wrt=self._output.W)))
        updates.append((self._output.b, self._output.b - rate * T.grad(cost=loss, wrt=self._output.b)))
        return updates

Then I attempt to train it like this:

x = T.matrix('x', dtype='float64')
y = T.vector('y', dtype='int32')

# basic logistic model
# model = Layer(x, 784, 10, activation=T.nnet.softmax)
# basic multi-layer perceptron
model = MLP(x, 784, 128, 10)

labels = T.extra_ops.to_one_hot(y, 10)
# loss function
#loss = T.mean(T.sqr(labels - model.output))
loss = model.loss(labels)
# average number of correct predictions over a batch
#accuracy = T.mean(T.eq(model.pred, y))
accuracy = model.accuracy(y)

# updates
#rate = 0.05
#g_W = T.grad(cost=loss, wrt=model.W)
#g_b = T.grad(cost=loss, wrt=model.b)
#updates = [(model.W, model.W - rate * g_W),
#           (model.b, model.b - rate * g_b)]
updates = model.updates(loss, rate=0.3)

# batch index
index = T.scalar('batch index', dtype='int32')
size = T.scalar('batch size', dtype='int32')

train = theano.function([index, size], 
                        [loss, accuracy],
                        updates=updates,
                        givens={x: train_set[0][index * size: (index + 1) * size],
                                y: train_set[1][index * size: (index + 1) * size]})

valid = theano.function([index, size], 
                        [loss, accuracy],
                        givens={x: valid_set[0][index * size: (index + 1) * size],
                                y: valid_set[1][index * size: (index + 1) * size]})

test = theano.function([index, size], 
                       [accuracy],
                       givens={x: test_set[0][index * size: (index + 1) * size],
                               y: test_set[1][index * size: (index + 1) * size]})

n_epochs = 10
batch_size = 500
# number of items in training dataset / batch size
batches_in_epoch = datasets[0][0].shape[0] // batch_size

losses = np.empty(0)
errors = np.empty(0)

for epoch in range(1, n_epochs + 1):
    epoch_losses = np.empty(0)
    epoch_errors = np.empty(0)
    for batch_n in range(batches_in_epoch):
        l, e = train(batch_n, batch_size)
        epoch_losses = np.append(epoch_losses, l)
        epoch_errors = np.append(epoch_errors, e)
        print('[%s]' % time.ctime(), 
              'epoch: ', epoch, 
              'batch: ', batch_n, 
              'loss: ', np.round(l, 4), 
              'accuracy: ', np.round(e, 4))
    # shuffle train set every epoch
    shuffle = np.arange(datasets[0][1].shape[0])
    np.random.shuffle(shuffle)
    train_set[0] = train_set[0][shuffle]
    train_set[1] = train_set[1][shuffle]
    
    losses = np.concatenate([losses, epoch_losses])
    errors = np.concatenate([errors, epoch_errors])
    valid_l, valid_e = valid(0, datasets[1][0].shape[0])
    print('[%s]' % time.ctime(), 'epoch: ', epoch, 'validation loss: ', valid_l, 'validation accuracy: ', valid_e)
        
acc = test(0, datasets[2][0].shape[0])
print()
print('Final accuracy: ', np.round(acc, 4)[0])

Now, if you look at the comments, I tried this with a basic logistic regression model first and it worked: I got about 80% accuracy. But it doesn't work when I replace it with my MLP model; it doesn't converge to anything and I get about 10% accuracy, which is just random guessing. What am I doing wrong? The data I am using is the MNIST dataset, loaded into shared variables the way the Theano tutorials do it.
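Roughly, the loading looks like this (a simplified sketch of the tutorial loader; the mnist.pkl.gz path and pickle layout are the tutorial's, my actual paths differ):

import gzip
import pickle
import numpy as np
import theano

def shared_dataset(data_xy):
    # keep a whole split in Theano shared variables so slices of it can be used in givens
    data_x, data_y = data_xy
    shared_x = theano.shared(np.asarray(data_x, dtype='float64'), borrow=True)
    shared_y = theano.shared(np.asarray(data_y, dtype='int32'), borrow=True)
    return [shared_x, shared_y]

with gzip.open('mnist.pkl.gz', 'rb') as f:
    # (train, valid, test) tuples of (images, labels) as numpy arrays;
    # encoding is needed when unpickling the Python 2 pickle under Python 3
    datasets = pickle.load(f, encoding='latin1')

train_set = shared_dataset(datasets[0])
valid_set = shared_dataset(datasets[1])
test_set = shared_dataset(datasets[2])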

Upvotes: 0

Views: 419

Answers (1)

vkoe

Reputation: 381

The problem seems to lie in the weight initialization. How did you do this in your TensorFlow implementation?

I'm not too sure about the underlying math right now, so correct me if I'm wrong, but my reading is that if all the weights start out positive, the model has trouble learning features that need negative weights.

You can try passing low=-1, high=1 to the initialization (np.random.uniform defaults to the range [0, 1)). In my tests this took pretty long to converge (~100 epochs), but at least it did.
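For reference, a minimal version of that change, keeping the float64 cast from your Layer class, would be something like:

def weights(shape):
    # sample uniformly from [-1, 1) instead of the default [0, 1)
    return np.array(np.random.uniform(low=-1, high=1, size=shape), dtype='float64')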

Using the somewhat smarter Glorot initialization, like this:

def weights(shape):
    # Glorot/Xavier uniform: limit = sqrt(6 / (fan_in + fan_out))
    return np.random.uniform(low=-np.sqrt(6. / sum(shape)),
                             high=np.sqrt(6. / sum(shape)),
                             size=shape)

makes the training a lot faster. Adding this to your code, I got about 90% validation accuracy after 5 epochs.

This is also the way the weights are initialized in the Theano MLP tutorial.

Upvotes: 0
