Damian Matkowski
Damian Matkowski

Reputation: 79

What's wrong with my backpropagation?

I'm trying to code a neural network from scratch in python. To check whether everything works I wanted to overfit the network but the loss seems to explode at first and then comes back to the initial value and stops there (Doesn't converge). I've checked my code and could find the reason. I assume my understanding or implementation of backpropagation is incorrect but there might be some other reason. Can anyone help me out or at least point me in the right direction?

# Initialize weights and biases given dimesnsions (For this example the dimensions are set to [12288, 64, 1])
def initialize_parameters(dims):
    # Initiate parameters
    parameters = {} 
    L = len(dims) # Number of layers in the network

    # Loop over the given dimensions. Initialize random weights and set biases to zero.
    for i in range(1, L):
        parameters["W" + str(i)] = np.random.randn(dims[i], dims[i-1]) * 0.01
        parameters["b" + str(i)] = np.zeros([dims[i], 1])

    return parameters

# Activation Functions
def relu(x, deriv=False):
    if deriv:
        return 1. * (x > 0)
    else:
        return np.maximum(0,x)

def sigmoid(x, deriv=False):
    if deriv:
        return x * (1-x)
    else:
        return 1/(1 + np.exp(-x))


# Forward and backward pass for 2 layer neural network. (1st relu, 2nd sigmoid)
def forward_backward(X, Y, parameters):
    # Array for storing gradients
    grads = {}

    # Get the length of examples
    m = Y.shape[1]

    # First layer
    Z1 = np.dot(parameters["W1"], X) + parameters["b1"]
    A1 = relu(Z1)

    # Second layer
    Z2 = np.dot(parameters["W2"], A1) + parameters["b2"]
    AL = sigmoid(Z2)

    # Compute cost
    cost = (-1 / m) * np.sum(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)))

    # Backpropagation
    # Second Layer
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    dZ2 = dAL * sigmoid(AL, deriv=True)
    grads["dW2"] = np.dot(dZ2, A1.T) / m
    grads["db2"] = np.sum(dZ2, axis=1, keepdims=True) / m

    # First layer
    dA1 = np.dot(parameters["W2"].T, dZ2)
    dZ1 = dA1 * relu(A1, deriv=True)
    grads["dW1"] = np.dot(dZ1, X.T)
    grads["db1"] = np.sum(dZ1, axis=1, keepdims=True) / m

    return AL, grads, cost

# Hyperparameters
dims = [12288, 64, 1]
epoches = 2000
learning_rate = 0.1

# Initialize parameters
parameters = initialize_parameters(dims)
log_list = []

# Train the network
for i in range(epoches):
    # Get X and Y
    x = np.array(train[0:10],ndmin=2).T
    y = np.array(labels[0:10], ndmin=2).T

    # Perform forward and backward pass
    AL, grads, cost = forward_backward(x, y, parameters)

    # Compute cost and append to the log_list
    log_list.append(cost)

    # Update parameters with computed gradients 
    parameters = update_parameters(grads, parameters, learning_rate)

plt.plot(log_list)
plt.title("Loss of the network")
plt.show()

Loss of the network

Upvotes: 0

Views: 548

Answers (1)

Aleksei Maide
Aleksei Maide

Reputation: 1855

I am struggling to find the place where you calculate the error gradients and the input training data sample would also help...

I don't know if this will help you, but I'll share my solution for Python neural network to learn XOR problem.

import numpy as np


def sigmoid_function(x, derivative=False):
    """
    Sigmoid function
    “x” is the input and “y” the output, the nonlinear properties of this function means that
    the rate of change is slower at the extremes and faster in the centre. Put plainly,
    we want the neuron to “make its mind up” instead of indecisively staying in the middle.
    :param x: Float
    :param Derivative: Boolean
    :return: Float
    """
    if (derivative):
        return x * (1 - x)  # Derivative using the chain rule.
    else:
        return 1 / (1 + np.exp(-x))


# create dataset for XOR problem
input_data = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
ideal_output = np.array([[0.0], [1.0], [1.0], [0.0]])


#initialize variables
learning_rate = 0.1
epoch = 50000 #number or iterations basically - One round of forward and back propagation is called an epoch

# get the second element from the numpy array shape field to detect the count of features for input layer
input_layer_neurons = input_data.shape[1]
hidden_layer_neurons = 3 #number of hidden layer neurons
output_layer_neurons = 1 #number of output layer neurons

#init weight & bias
weights_hidden = np.random.uniform(size=(input_layer_neurons, hidden_layer_neurons))
bias_hidden = np.random.uniform(1, hidden_layer_neurons)
weights_output = np.random.uniform(size=(hidden_layer_neurons, output_layer_neurons))
bias_output = np.random.uniform(1, output_layer_neurons)

for i in range(epoch):

    #forward propagation
    hidden_layer_input_temp = np.dot(input_data, weights_hidden) #matrix dot product to adjust for weights in the layer
    hidden_layer_input = hidden_layer_input_temp + bias_hidden #adjust for bias
    hidden_layer_activations = sigmoid_function(hidden_layer_input) #use the activation function
    output_layer_input_temp = np.dot(hidden_layer_activations, weights_output)
    output_layer_input = output_layer_input_temp + bias_output
    output = sigmoid_function(output_layer_input) #final output

    #backpropagation (where adjusting of the weights happens)
    error = ideal_output - output #error gradient
    if (i % 1000 == 0):
        print("Error: {}".format(np.mean(abs(error))))

    #use derivatives to compute slope of output and hidden layers
    slope_output_layer = sigmoid_function(output, derivative=True)
    slope_hidden_layer = sigmoid_function(hidden_layer_activations, derivative=True)

    #calculate deltas
    delta_output = error * slope_output_layer
    error_hidden_layer = delta_output.dot(weights_output.T) #calculates the error at hidden layer
    delta_hidden = error_hidden_layer * slope_hidden_layer

    #change the weights
    weights_output += hidden_layer_activations.T.dot(delta_output) * learning_rate
    bias_output += np.sum(delta_output, axis=0, keepdims=True) * learning_rate
    weights_hidden += input_data.T.dot(delta_hidden) * learning_rate
    bias_hidden += np.sum(delta_hidden, axis=0, keepdims=True) * learning_rate

Upvotes: 1

Related Questions