Reputation: 33
My error value increases when I subtract the gradient x learning rate (of 0.5) from each parameter.
Shouldn't the error value be decreasing when I do so?
Code:
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
# Network parameters: W1 is 5x20 and W2 is 20x10, both drawn uniformly from
# [0, 1); the bias vectors b1 (20) and b2 (10) start at zero.
params = {
    "W1": np.random.rand(5, 20),
    "W2": np.random.rand(20, 10),
    "b1": np.zeros(20),
    "b2": np.zeros(10),
}

# Training inputs: ten samples with five binary features each.
S = np.array([
    [1, 0, 0, 1, 0],
    [1, 0, 0, 0, 1],
    [1, 0, 0, 1, 1],
    [0, 0, 1, 1, 0],
    [0, 1, 1, 0, 0],
    [0, 1, 1, 0, 0],
    [0, 0, 0, 0, 1],
    [0, 0, 1, 0, 0],
    [1, 0, 0, 0, 1],
    [1, 1, 1, 0, 1],
])

# Class labels 0..9 (one per sample) and their one-hot encoding: row k of
# t_train has a single 1 in column simple_array[k].
simple_array = np.arange(10)
t_train = np.eye(simple_array.max() + 1, dtype=int)[simple_array]
# Forward pass of the two-layer network. Reads the weights and biases from
# the module-level `params` dictionary and returns an array with one softmax
# row per row of S.
# NOTE(review): indentation was lost in this listing; the comments below
# assume the nesting implied by the order of the statements.
def predict(S):
#find a1
# a1 = S @ W1 + b1 (hidden-layer pre-activations)
a1 = np.dot(S, params["W1"]) + params["b1"]
# calculate z1 = relu(a1)
z1 = []
for row in a1:
# row0 collects max(0, x) for every element x of the current row
row0 = []
for element in row:
row0.append(max(0.0, element))
# NOTE(review): `row` (the unclipped values taken from a1) is appended here,
# not `row0`; row0 is built but never used, so no ReLU is actually applied.
z1.append(row)
z1 = np.array(z1)
# calculate a2 = b2 + z1*W2
a2 = np.dot(z1,params["W2"]) + params["b2"]
# calculate z2 = softmax(a2)
z2 = []
for i in range(len(a2)):
# shift the row by its maximum before exponentiating, so every exp()
# argument is <= 0; the softmax value is unchanged by this shift
row = a2[i] - np.max(a2[i])
summation = np.sum(np.exp(row))
row = np.exp(row)/summation
z2.append(row)
z2 = np.array(z2)
return z2
# Cross-entropy of predict(S) against the one-hot targets in the module-level
# `t_train`, summed over every sample and class.
def loss(S):
predictions = predict(S) #10*10 array
# 1e-7 is added inside the log so a zero probability does not yield log(0)
error = -np.sum(t_train*np.log(predictions + 1e-7))
return error
# Training script: records the initial loss, then runs 100 passes in which
# every entry of W1, W2, b1 and b2 is perturbed by +/-h, a finite-difference
# quotient is formed from the two loss values, and the entry is updated in
# place with a step size of 0.5. The recorded losses are plotted at the end.
# NOTE(review): indentation was lost in this listing; the nesting is inferred
# from the order of the statements.
loss_list = []
loss_list.append(loss(S))
#finding numerical derivative and updating parameters
h = 0.0000001
for m in range(100):
# --- W1 ---
for i in range(int(params["W1"].shape[0])):
for j in range(int(params["W1"].shape[1])):
params["W1"][i][j] += h
# bef is the loss at w + h
bef = loss(S)
params["W1"][i][j] -= h*2
# aft is the loss at w - h
aft = loss(S)
# undo the perturbation
params["W1"][i][j] += h
# NOTE(review): (aft - bef) is L(w - h) - L(w + h), i.e. the negative of
# the central difference (L(w + h) - L(w - h)) / (2h)
deriv = (aft - bef)/(2*h)
# the entry is updated immediately, so every later quotient in this pass is
# evaluated with this entry already changed
params["W1"][i][j] -= 0.5*deriv
# --- W2: same procedure as for W1 ---
for i in range(int(params["W2"].shape[0])):
for j in range(int(params["W2"].shape[1])):
params["W2"][i][j] += h
bef = loss(S)
params["W2"][i][j] -= h*2
aft = loss(S)
params["W2"][i][j] += h
deriv = (aft - bef)/(2*h)
params["W2"][i][j] -= 0.5*deriv
# --- b1: same procedure, one index because the bias is a vector ---
for i in range(int(params["b1"].shape[0])):
params["b1"][i] += h
bef = loss(S)
params["b1"][i] -= h*2
aft = loss(S)
params["b1"][i] += h
deriv = (aft - bef)/(2*h)
params["b1"][i] -= 0.5*deriv
# --- b2: same procedure ---
for i in range(int(params["b2"].shape[0])):
params["b2"][i] += h
bef = loss(S)
params["b2"][i] -= h*2
aft = loss(S)
params["b2"][i] += h
deriv = (aft - bef)/(2*h)
params["b2"][i] -= 0.5*deriv
# record the loss after the updates (presumably once per pass of the m loop)
loss_list.append(loss(S))
# plot the recorded loss values against their index in loss_list
plt.plot(np.array(loss_list))
This is the graph I get:
[Figure: plot obtained by subtracting gradient × learning rate from the parameters]
Also, when I add gradient × learning rate to the parameters instead, the loss decreases. I am not sure why, as I would expect the opposite to happen.
[Figure: plot obtained by adding gradient × learning rate to the parameters]
Upvotes: 2
Views: 100
Reputation: 48
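The main reason your loss is increasing is that you compute the negative of the symmetric difference quotient: bef is evaluated at w + h and aft at w − h, so (aft - bef)/(2*h) has the opposite sign of the true derivative.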
In your code, you have:
params["W1"][i][j] += h  # the entry is now w + h
bef = loss(S)  # loss at w + h
params["W1"][i][j] -= h*2  # the entry is now w - h
aft = loss(S)  # loss at w - h
params["W1"][i][j] += h  # undo the perturbation
deriv = (aft - bef)/(2*h)  # (L(w - h) - L(w + h)) / (2h): negative of the central difference
params["W1"][i][j] -= 0.5*deriv  # subtracting the negated slope steps uphill
If you swap the second and fourth lines, so that aft is the loss at w + h and bef is the loss at w − h (and do the same in the loops for the other parameters W2, b1 and b2), the loss starts to decrease.
Some other problems: the ReLU activation is never actually applied (you're appending `row` instead of `row0` to the new array), and I'm not sure it is correct to update a parameter in place while the gradients of the remaining entries of the same array are still being computed, so my version writes the updates to a copy and swaps it in afterwards. Besides that, the code looks ready for some hyperparameter tuning :)
Here's my final version:
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
# Weights are drawn uniformly from [0, 1); biases start at zero.
# Shapes: W1 is 5x20, W2 is 20x10, b1 has 20 entries, b2 has 10.
params = dict(
    W1=np.random.rand(5, 20),
    W2=np.random.rand(20, 10),
    b1=np.zeros(20),
    b2=np.zeros(10),
)

# Training inputs, one sample per row (ten samples, five binary features).
_training_rows = [
    [1, 0, 0, 1, 0],
    [1, 0, 0, 0, 1],
    [1, 0, 0, 1, 1],
    [0, 0, 1, 1, 0],
    [0, 1, 1, 0, 0],
    [0, 1, 1, 0, 0],
    [0, 0, 0, 0, 1],
    [0, 0, 1, 0, 0],
    [1, 0, 0, 0, 1],
    [1, 1, 1, 0, 1],
]
S = np.array(_training_rows)

# Target class of each sample and the matching one-hot matrix: entry (k, c)
# of t_train is 1 exactly when simple_array[k] == c.
simple_array = np.array(list(range(10)))
_class_ids = np.arange(simple_array.max() + 1)
t_train = (simple_array[:, None] == _class_ids).astype(int)
def predict(S):
    """Run the two-layer network forward and return class probabilities.

    Computes ``softmax(relu(S @ W1 + b1) @ W2 + b2)`` using the weights and
    biases stored in the module-level ``params`` dictionary (keys ``"W1"``,
    ``"b1"``, ``"W2"`` and ``"b2"``).

    Args:
        S: 2-D array of inputs, one sample per row.

    Returns:
        2-D array with one row per sample; every row is a softmax
        distribution over the output columns and sums to 1.
    """
    # Hidden layer: affine transform followed by an element-wise ReLU.
    a1 = np.dot(S, params["W1"]) + params["b1"]
    z1 = np.maximum(a1, 0.0)

    # Output layer pre-activations.
    a2 = np.dot(z1, params["W2"]) + params["b2"]

    # Row-wise softmax. Subtracting each row's maximum leaves the result
    # unchanged but keeps every exp() argument <= 0, so exp() cannot overflow.
    shifted = a2 - np.max(a2, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    z2 = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)

    # Sanity check: fails if a row is not a valid distribution (e.g. NaNs).
    assert np.allclose(np.sum(z2, axis=1), 1.0)
    return z2
def loss(S):
    """Return the cross-entropy of predict(S) against t_train, summed over all entries."""
    # The 1e-7 offset keeps the argument of log away from zero.
    log_probabilities = np.log(predict(S) + 1e-7)
    return -np.sum(t_train * log_probabilities)
def _numerical_gradient(array, h):
    """Return the central-difference gradient of loss(S) with respect to *array*.

    Each entry w of *array* is set to w + h and then w - h, loss(S) is
    evaluated at both points, and (L(w + h) - L(w - h)) / (2 * h) is stored
    at the same index of the result. *array* must be one of the arrays that
    loss(S) reads (an entry of ``params``) so the perturbation is visible to
    the loss. Every entry is put back to its exact saved value, so *array*
    is unchanged when the function returns.

    Args:
        array: float NumPy array to differentiate with respect to.
        h: half-width of the finite-difference step.

    Returns:
        Array of the same shape as *array* holding the estimated gradient.
    """
    grad = np.zeros_like(array)
    # np.ndindex walks every index in row-major order, for 1-D and 2-D alike.
    for idx in np.ndindex(array.shape):
        original = array[idx]
        array[idx] = original + h
        loss_plus = loss(S)
        array[idx] = original - h
        loss_minus = loss(S)
        # Restore the saved value rather than adding h back: repeated
        # "+h, -2h, +h" arithmetic would leave round-off in the parameter.
        array[idx] = original
        grad[idx] = (loss_plus - loss_minus) / (2 * h)
    return grad


loss_list = [loss(S)]

# finding numerical derivative and updating parameters
h = 0.0000001
alpha = 0.5
for m in range(100):
    # One array at a time: its whole gradient is estimated first and only
    # then is the array replaced, so no entry changes while its siblings are
    # still being differentiated. Arrays later in the tuple see the already
    # updated earlier ones.
    for name in ("W1", "W2", "b1", "b2"):
        gradient = _numerical_gradient(params[name], h)
        params[name] = params[name] - alpha * gradient
    loss_epoch = loss(S)
    print(m, loss_epoch)
    loss_list.append(loss_epoch)
plt.plot(np.array(loss_list))
Upvotes: 1