My first neural network used the sigmoid activation function and worked fine. Now I want to switch to a more advanced activation function (ReLU). But with ReLU my NN doesn't work at all: 90% of predictions are wrong, while with sigmoid the error rate was only 4%. I can't find the bug in the code. Help me.
import numpy as np

class NeuralNetwork:
    def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate=0.1):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate
        # Weights drawn from a normal distribution scaled by 1/sqrt(fan-in)
        self.weights_ih = np.random.normal(0.0, pow(input_nodes, -0.5), (hidden_nodes, input_nodes))
        self.weights_ho = np.random.normal(0.0, pow(hidden_nodes, -0.5), (output_nodes, hidden_nodes))
        self.bias_h = np.random.normal(0.0, pow(1, -0.5), (hidden_nodes, 1))
        self.bias_o = np.random.normal(0.0, pow(1, -0.5), (output_nodes, 1))

    def activation_function(self, x):
        # ReLU: max(0, x)
        return x * (x > 0)

    def activation_function_d(self, x):
        # ReLU derivative: 1 where x >= 0, 0 elsewhere
        return 1 * (x >= 0)

    def train(self, inputs_list, targets_list):
        inputs = np.array(inputs_list, ndmin=2).T
        targets = np.array(targets_list, ndmin=2).T
        # Feedforward
        hidden_inputs = np.dot(self.weights_ih, inputs) + self.bias_h
        hidden = self.activation_function(hidden_inputs)
        output_inputs = np.dot(self.weights_ho, hidden) + self.bias_o
        outputs = self.activation_function(output_inputs)
        # Calculate errors
        output_errors = targets - outputs
        hidden_errors = np.dot(self.weights_ho.T, output_errors)
        # Calculate gradients
        output_gradient = output_errors * self.activation_function_d(output_inputs) * self.learning_rate
        hidden_gradient = hidden_errors * self.activation_function_d(hidden_inputs) * self.learning_rate
        # Calculate deltas
        output_deltas = np.dot(output_gradient, hidden.T)
        hidden_deltas = np.dot(hidden_gradient, inputs.T)
        # Adjust weights and biases by deltas and gradients
        self.weights_ho += output_deltas
        self.weights_ih += hidden_deltas
        self.bias_o += output_gradient
        self.bias_h += hidden_gradient

    def predict(self, inputs_list):
        inputs = np.array(inputs_list, ndmin=2).T
        hidden = self.activation_function(np.dot(self.weights_ih, inputs) + self.bias_h)
        outputs = self.activation_function(np.dot(self.weights_ho, hidden) + self.bias_o)
        return outputs.flatten().tolist()
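The training loop below assumes an instance named nn; as a rough example (the hidden layer size is only illustrative, 784 inputs match the 28x28 MNIST pixels and 10 outputs match the digit classes):

nn = NeuralNetwork(input_nodes=784, hidden_nodes=200, output_nodes=10, learning_rate=0.1)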
And training code:
with open('mnist_train.csv') as train_file:
    for line in train_file:
        data = [int(value) for value in line.split(',')]
        inputs = data[1:]                                         # 784 pixel values
        targets = [1 if i == data[0] else 0 for i in range(10)]   # one-hot encoded label
        nn.train(inputs, targets)
Upvotes: 0
Views: 2002
Reputation: 66
The last layer should always use sigmoid (in the binary/per-class case) regardless of which activation you use in the hidden layers.
The sigmoid squashes each output into (0, 1), so it can be read as an estimate of the probability that the example belongs to that class; the predicted class is then the one with the highest probability.
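For example, given the 10 values returned by predict, the predicted digit can be taken as the index of the largest output (a minimal sketch, assuming a trained instance nn and a list of 784 pixel values inputs):

probs = nn.predict(inputs)               # 10 outputs, one score per digit class
predicted_digit = int(np.argmax(probs))  # index of the highest output = predicted class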
To conclude, change this:
def predict(self, inputs_list):
    inputs = np.array(inputs_list, ndmin=2).T
    hidden = self.activation_function(np.dot(self.weights_ih, inputs) + self.bias_h)
    outputs = self.activation_function(np.dot(self.weights_ho, hidden) + self.bias_o)
    return outputs.flatten().tolist()
to this:
def predict(self, inputs_list):
    inputs = np.array(inputs_list, ndmin=2).T
    hidden = self.activation_function(np.dot(self.weights_ih, inputs) + self.bias_h)
    outputs = sigmoid(np.dot(self.weights_ho, hidden) + self.bias_o)  # create a sigmoid function
    return outputs.flatten().tolist()
and in the training:
# Feedforward
hidden_inputs = np.dot(self.weights_ih, inputs) + self.bias_h
hidden = self.activation_function(hidden_inputs)
output_inputs = np.dot(self.weights_ho, hidden) + self.bias_o
outputs = self.activation_function(output_inputs)
to:
# Feedforward
hidden_inputs = np.dot(self.weights_ih, inputs) + self.bias_h
hidden = self.activation_function(hidden_inputs)
output_inputs = np.dot(self.weights_ho, hidden) + self.bias_o
outputs = sigmoid(output_inputs)
and:
# Calculate gradients
output_gradient = output_errors * self.activation_function_d(output_inputs) * self.learning_rate
hidden_gradient = hidden_errors * self.activation_function_d(hidden_inputs) * self.learning_rate
to:
# Calculate gradients
output_gradient = output_errors * sigmoid_d(output_inputs) * self.learning_rate
hidden_gradient = hidden_errors * self.activation_function_d(hidden_inputs) * self.learning_rate
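The snippets above call sigmoid and sigmoid_d, which are not defined in the original class; a minimal sketch of those helpers (the names simply follow the usage above):

def sigmoid(x):
    # Squash x into (0, 1)
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_d(x):
    # Derivative of the sigmoid with respect to its input
    s = sigmoid(x)
    return s * (1.0 - s)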
Upvotes: 1