I created a multilayer perceptron from scratch to demonstrate backpropagation. It is meant to identify handwritten digits provided by the MNIST database. However, the network runs into two issues whose source I can't trace:
Firstly, the network converges to a state where it reads the same digit every time, regardless of the input array. The cost (defined as the sum of squared differences between the output nodes and the ground truth) converges to 1.
Secondly, the target output is a one-hot vector, but the most strongly activated node in the output layer gradually becomes less activated as the network processes more training examples.
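For concreteness, here is a rough illustration of what that limit looks like (not part of my actual code; the 0.01 activation level is an arbitrary small value): if all ten sigmoid outputs collapse towards zero, the squared error against a one-hot target is dominated by the single 1 in the target, so the cost sits just under 1, which is consistent with the shrinking activations described above.

import numpy as np

# Illustration only: one-hot target against an output vector whose
# activations have all collapsed towards zero.
truth = np.zeros(10)
truth[3] = 1                  # pretend the true digit is 3
output = np.full(10, 0.01)    # all ten output activations near zero
diff = truth - output
print(np.dot(diff, diff))     # ~0.98, i.e. the cost approaches 1

The full code is below: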
import mnist
import numpy as np
import random as rd
W2 = np.random.rand(256, 784)/784
W3 = np.random.rand(256, 256)/256
W4 = np.random.rand(10, 256)/256
Z1 = np.zeros(784)
Z2 = B2 = np.zeros(256)
Z3 = B3 = np.zeros(256)
Z4 = B4 = np.zeros(10)
test_images = mnist.test_images()
test_labels = mnist.test_labels()
train_images = mnist.train_images()
train_labels = mnist.train_labels()
def sig(x):
    return 1/(1+np.exp(-x))

def sigp(x):
    return sig(x)*(1-sig(x))
class Network:
    # Initialising weight matrices and bias and activation vectors
    def __init__(self, Z1, W2, B2, Z2, W3, B3, Z3, W4, B4, Z4):
        self.Z1 = Z1
        self.W2 = W2
        self.B2 = B2
        self.Z2 = Z2
        self.W3 = W3
        self.B3 = B3
        self.Z3 = Z3
        self.W4 = W4
        self.B4 = B4
        self.Z4 = Z4

    # Quick access to weights, biases and activations
    def W(self, n):
        if n==4: return self.W4
        if n==3: return self.W3
        if n==2: return self.W2

    def Z(self, n):
        if n==4: return self.Z4
        if n==3: return self.Z3
        if n==2: return self.Z2
        if n==1: return self.Z1

    def B(self, n):
        if n==4: return self.B4
        if n==3: return self.B3
        if n==2: return self.B2

    # Weight and bias modification
    def Wmod(self, n, weight_change, learning_rate):
        if n==4: self.W4 -= learning_rate*weight_change
        elif n==3: self.W3 -= learning_rate*weight_change
        elif n==2: self.W2 -= learning_rate*weight_change
        return

    def Bmod(self, n, delta, learning_rate):
        if n==4: self.B4 -= learning_rate*delta
        elif n==3: self.B3 -= learning_rate*delta
        elif n==2: self.B2 -= learning_rate*delta
        return

    # Reads the input image
    def read(self, input):
        self.Z1 = np.reshape(input, 784)
        self.Z2 = np.matmul(self.W2, sig(self.Z1)) + B2
        self.Z3 = np.matmul(self.W3, sig(self.Z2)) + B3
        self.Z4 = np.matmul(self.W4, sig(self.Z3)) + B4
        result = np.argmax(sig(self.Z4))
        return result

    # Calculates cost
    def cost(self, truth):
        truth -= sig(self.Z4)
        return np.dot(truth, truth)
net = Network(Z1, W2, B2, Z2, W3, B3, Z3, W4, B4, Z4)
for k in range(1000):
    i = rd.randint(1, len(train_images))
    # Creating truth vector
    truth = np.zeros(10)
    truth[train_labels[i]] = 1
    # Reading training input
    r = net.read(train_images[i])
    print("Training set {}, digit read = {}, actual digit = {} ".format(k, r, train_labels[i]))
    # Backpropagation
    for j in range(4, 1, -1):
        if j==4: delta = np.multiply(sig(net.Z4) - truth, sigp(net.Z4))
        else: delta = np.multiply(sigp(net.Z(j)), np.matmul(net.W(j+1).T, delta))
        weight_change = np.outer(delta, sig(net.Z(j-1)))
        net.Wmod(j, weight_change, 0.001)
        net.Bmod(j, delta, 0.001)

print("\n")
for i in range(20):
    print("Read = {}, Actual = {}".format(net.read(test_images[i]), test_labels[i]))
I tried setting a smaller learning rate and running up to 10,000 training examples, but it still converges incorrectly. I've checked through the math and it seems to work correctly; the only inaccuracies I can see are floating-point errors and the discrepancies that accumulate across the weight layers.
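In case it is relevant, this is the kind of finite-difference check I plan to use to verify the backprop step numerically (a sketch only: the choice of W4[0, 0] and eps is arbitrary, and forward_cost is a throwaway helper rather than part of the code above):

# Finite-difference check on a single entry of W4.
x = train_images[0]
truth = np.zeros(10)
truth[train_labels[0]] = 1
eps = 1e-5

def forward_cost(x, truth):
    net.read(x)
    diff = truth - sig(net.Z4)
    return np.dot(diff, diff)

w_original = net.W4[0, 0]
net.W4[0, 0] = w_original + eps
c_plus = forward_cost(x, truth)
net.W4[0, 0] = w_original - eps
c_minus = forward_cost(x, truth)
net.W4[0, 0] = w_original

numeric_grad = (c_plus - c_minus) / (2*eps)
# numeric_grad can then be compared with weight_change[0, 0] from the j == 4
# backprop step; with the cost written as a plain sum of squares they should
# differ only by the constant factor 2 from differentiating the square.
print(numeric_grad)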