Neural Network MNIST

Question

I have been studying neural networks now for a while and made an implementation with python and numpy. I made a very simple example with XOR and it worked well. So I thought I go further and try the MNIST database.

There is my problem. I am using a NN with 784 inputs, 30 hidden and 10 output neuron. The activation function of the hidden layer spits out only ones, so that the network basically stops learning. The math that I am doing is correct and the same implementation works well with the XOR example and I am reading the MNIST set properly. So I don't see, where the issue comes from.

import pickle
import gzip

import numpy as np

def load_data():
    f = gzip.open('mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
    f.close()
    return (training_data, validation_data, test_data)

def transform_output(num):
    arr = np.zeros(10)
    arr[num] = 1.0
    return arr

def out2(arr):
    return arr.argmax()


data = load_data()
training_data = data[0]
training_input = np.array(training_data[0])
training_output = [transform_output(y) for y in training_data[1]]

batch_size = 10

batch_count = int(np.ceil(len(training_input) / batch_size))

input_batches = np.array_split(training_input, batch_count)
output_batches = np.array_split(training_output, batch_count)

#Sigmoid Function
def sigmoid (x):
    return 1.0/(1.0 + np.exp(-x))

#Derivative of Sigmoid Function
def derivatives_sigmoid(x):
    return x * (1.0 - x)

#Variable initialization
epoch=1 #Setting training iterations
lr=2.0 #Setting learning rate
inputlayer_neurons = len(training_input[0]) #number of features in data set
hiddenlayer_neurons = 30 #number of hidden layers neurons

output_neurons = len(training_output[0]) #number of neurons at output layer

#weight and bias initialization
wh=np.random.uniform(size=(inputlayer_neurons,hiddenlayer_neurons))
bh=np.random.uniform(size=(1,hiddenlayer_neurons))
wout=np.random.uniform(size=(hiddenlayer_neurons,output_neurons))
bout=np.random.uniform(size=(1,output_neurons))

for i in range(epoch):
    for batch in range(batch_count):

        X = input_batches[batch]
        y = output_batches[batch]

        zh1 = np.dot(X, wh)
        zh = zh1 + bh

        # data -> hidden neurons -> activations
        ah = sigmoid(zh)

        zo1 = np.dot(ah, wout)
        zo = zo1 + bout

        output = sigmoid(zo)

        # data -> output neurons -> error
        E = y - output

        print("debugging")
        print("X")
        print(X)
        print("WH")
        print(wh)
        print("zh1")
        print(zh1)
        print("bh")
        print(bh)
        print("zh")
        print(zh)
        print("ah")
        print(ah)
        print("wout")
        print(wout)
        print("zo1")
        print(zo1)
        print("bout")
        print(bout)
        print("zo")
        print(zo)
        print("out")
        print(output)
        print("y")
        print(y)
        print("error")
        print(E)
        # data -> output neurons -> slope
        slope_out = derivatives_sigmoid(output)

        # data -> output neurons -> change of error
        d_out = E * slope_out

        # data -> hidden neurons -> error = data -> output neurons -> change of error DOT output neurons -> output inputs (equal to hidden neurons) -> weights
        error_hidden = d_out.dot(wout.T)

        # data -> hidden neurons -> slope
        slope_h = derivatives_sigmoid(ah)

        # data -> hidden neurons -> change of error
        d_hidden = error_hidden * slope_h

        # hidden neurons -> output neurons -> weights = "" + hidden neurons -> data -> activations DOT data -> output neurons -> change of error
        wout = wout + ah.T.dot(d_out) * lr
        bout = bout + np.sum(d_out, axis=0, keepdims=True) * lr

        wh = wh + X.T.dot(d_hidden) * lr
        bh = bh + np.sum(d_hidden, axis=0, keepdims=True) * lr
    # testing results
    X = np.array(data[1][0][0:10])
    zh1 = np.dot(X, wh)
    zh = zh1 + bh

    # data -> hidden neurons -> activations
    ah = sigmoid(zh)

    zo1 = np.dot(ah, wout)
    zo = zo1 + bout

    output = sigmoid(zo)
    print([out2(y) for y in output])
    print(data[1][1][0:10])

so overall the output of the neural network is for every input the same and training it with different batch sizes, learning rates and 100 epochs did not help.

Neural Network MNIST

Answers (1)

Related Questions