I am implementing multiple linear regression in Python from scratch. The cost increases rapidly after every epoch and eventually overflows. What is going wrong? Is there a logical or methodological error? Here is the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

class LinearRegression:

    def __init__(self):
        #initialize parameters with none values
        self.cost_history = None
        self.std_dev = None
        self.mean = None
        self.weights = None

    def set_mean_and_std_dev(self, X_data, axis=0):
        #mean and std_deviation of training_data
        self.std_dev = np.array(np.std(X_data, axis=axis))
        self.mean = np.array(np.mean(X_data, axis=axis))
        print("Mean : ", self.mean)
        print("Standard deviation : ", self.std_dev)
        return

    def normalize(self, X_data):
        #normalizing the data
        X_data = (X_data - self.mean)/(self.std_dev+0.00001)
        return

    def cost(self, X, y):
        #squared error based cost
        return np.sum((np.dot(X, self.weights.T)-y)**2)

    def get_gradient(self, X, y, pred_y, learning_rate):
        grad = np.zeros(shape=(X.shape[1], ), dtype='float64')
        #print("X.shape : ", X.shape, " grad.shape : ", grad.shape, "pred_y.shape", pred_y.shape, "y.shape : ", y.shape)
        for ix in range(X.shape[0]):
            #for each example in X
            x_example = X[ix, :]
            error = (pred_y[ix] - y[ix])
            for jx in range(grad.shape[0]):
                #for each feature of X
                grad[jx] = grad[jx] + x_example[jx]*error
        for ix in range(grad.shape[0]):
            #divide by the number of examples
            grad[jx] = grad[jx]*learning_rate*(1.0/X.shape[0])
        return grad

    def gradient_descent(self, X, y, learning_rate=0.001, num_iterations=100):
        self.weights = np.zeros(shape=(X.shape[1],), dtype='float64')
        for ix in range(X.shape[1]):
            #initialize weights with random values
            self.weights[ix] = np.random.rand()*100
        #initialize cost history
        self.cost_history = []
        for ix in range(num_iterations):
            pred_y = np.dot(X, self.weights.T)
            gradient = self.get_gradient(X=X, y=y, pred_y=pred_y, learning_rate=learning_rate)
            self.weights = self.weights - gradient
            cost = self.cost(X, y)
            self.cost_history.append(cost)
            if ix%10 == 0:
                print("(learning_rate/(X.shape[0]))*gradient) : ", gradient)
                print("Cost at ", ix, " epoch is : ", cost)

    def predict(self, X):
        return np.dot(X, self.weights.T)

    def get_weights(self):
        return self.weights


#create a linear regression object
lr = LinearRegression()

# # Some random function generator
# y = 2.5 + 2*X1 - X2 - 3*X3 + 1.23*X4
def generate_data(X):
    y = np.zeros(shape=(X.shape[0], ), dtype='float64')
    for ix in range(X.shape[0]):
        y[ix] = 1.23 + 2.5*X[ix, 0] + 2*X[ix, 1] - X[ix, 2] - 3*X[ix, 3]
    return y

X = np.zeros(shape=(300, 5), dtype='float64')
data_gen = [[np.random.rand()*107 for jx in range(4)] for ix in range(300)]
X[:, 1:] = np.array(data_gen, dtype='float64')
y = generate_data(X[:, 1:])

lr.set_mean_and_std_dev(X)
lr.normalize(X)
X[:, 0] = 1

print(X.shape, y.shape, X.dtype, y.dtype)
print(X[0], y[0])

X_train, X_test, y_train, y_test = X[:200], X[200:], y[:200], y[200:]
print(y_test)

lr.gradient_descent(X_train, y_train, 0.01, 500)
pred_y = lr.predict(X_test)
There is a mistake in the function get_gradient: the index jx is used inside the loop over ix:
for ix in range(grad.shape[0]):
    #divide by the number of examples
    grad[jx] = grad[jx]*learning_rate*(1.0/X.shape[0])
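If you keep the loop structure, a minimal corrected version of that scaling step (indexing with ix instead of jx) would be:

for ix in range(grad.shape[0]):
    #divide every component by the number of examples
    grad[ix] = grad[ix]*learning_rate*(1.0/X.shape[0])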
Also, you are using too many for loops. Instead, you can vectorize get_gradient like this:
def get_gradient(self, X, y, pred_y, learning_rate):
    errors = (pred_y - y)
    grad = X.T.dot(errors)
    #divide by the number of examples
    grad = grad*learning_rate/X.shape[0]
    return grad
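As a quick sanity check (a small sketch, not part of the original post), the vectorized gradient matches the corrected loop version on random data:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(10, 3)
y = rng.rand(10)
weights = rng.rand(3)
pred_y = X.dot(weights)
learning_rate = 0.01

# vectorized: X.T @ (pred_y - y), scaled by learning_rate / number of examples
grad_vec = X.T.dot(pred_y - y) * learning_rate / X.shape[0]

# explicit loops, for comparison
grad_loop = np.zeros(3)
for ix in range(X.shape[0]):
    grad_loop += X[ix] * (pred_y[ix] - y[ix])
grad_loop = grad_loop * learning_rate / X.shape[0]

print(np.allclose(grad_vec, grad_loop))   # prints True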
Other problems:
The initial values of the weights are too large:
self.weights = np.zeros(shape=(X.shape[1],), dtype='float64')
for ix in range(X.shape[1]):
    #initialize weights with random values
    self.weights[ix] = np.random.rand()*100
You can use smaller initial values with a single line: self.weights = np.random.rand(X.shape[1])
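For example (just a sketch of the scale, not code from the post):

import numpy as np

n_features = 5                                    # matches X.shape[1] in the example
weights = np.random.rand(n_features)              # values in [0, 1)
small_weights = np.random.rand(n_features)*0.01   # an even smaller start, if desired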
The learning rate is too large; try: lr.gradient_descent(X_train, y_train, 0.00001, 3000)
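One rough way to pick a usable rate (a sketch, not from the original post, and assuming the fixed LinearRegression class shown below) is to try a few values and keep the one whose cost actually decreases:

# try a few learning rates and compare the first and last cost
for rate in [1e-3, 1e-4, 1e-5]:
    model = LinearRegression()
    model.gradient_descent(X_train, y_train, learning_rate=rate, num_iterations=100)
    print(rate, model.cost_history[0], model.cost_history[-1])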
It seems you want to do something with the two functions set_mean_and_std_dev and normalize, but they do nothing in this code: normalize computes the scaled data and then discards it.
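If you do want to use them, one possible fix (a sketch, not part of the final code below) is to have normalize return the scaled data and apply it to the feature columns before adding the bias column:

def normalize(self, X_data):
    #return the normalized data instead of discarding it
    return (X_data - self.mean) / (self.std_dev + 0.00001)

# usage (hypothetical): fit the statistics on the feature columns, then transform them
# lr.set_mean_and_std_dev(X[:, 1:])
# X[:, 1:] = lr.normalize(X[:, 1:])
# X[:, 0] = 1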
The final code:
import numpy as np

class LinearRegression:

    def __init__(self):
        #initialize parameters with none values
        self.cost_history = None
        self.std_dev = None
        self.mean = None
        self.weights = None

    def cost(self, X, y):
        #squared error based cost
        return np.sum((self.predict(X)-y)**2)

    def get_gradient(self, X, y, pred_y, learning_rate):
        errors = (pred_y - y)
        grad = X.T.dot(errors)
        #divide by the number of examples
        grad = grad*learning_rate/X.shape[0]
        return grad

    def gradient_descent(self, X, y,
                         learning_rate=0.001,
                         num_iterations=100):
        #initialize weights with random values
        self.weights = np.random.rand(X.shape[1])
        #initialize cost history
        self.cost_history = []
        for ix in range(num_iterations):
            pred_y = self.predict(X)
            gradient = self.get_gradient(X=X, y=y, pred_y=pred_y, learning_rate=learning_rate)
            self.weights = self.weights - gradient
            cost = self.cost(X, y)
            self.cost_history.append(cost)
            if ix%10 == 0:
                print("(learning_rate/(X.shape[0]))*gradient) : ", gradient)
                print("Cost at ", ix, " epoch is : ", cost)

    def predict(self, X):
        return np.dot(X, self.weights)


#create a linear regression object
lr = LinearRegression()

# # Some random function generator
# y = 2.5 + 2*X1 - X2 - 3*X3 + 1.23*X4
def generate_data(X):
    y = np.zeros(shape=(X.shape[0], ), dtype='float64')
    for ix in range(X.shape[0]):
        y[ix] = 1.23 + 2.5*X[ix, 0] + 2*X[ix, 1] - X[ix, 2] - 3*X[ix, 3]
    return y

X = np.zeros(shape=(300, 5), dtype='float64')
data_gen = [[np.random.rand()*107 for jx in range(4)] for ix in range(300)]
X[:, 1:] = np.array(data_gen, dtype='float64')
y = generate_data(X[:, 1:])
X[:, 0] = 1

print(X.shape, y.shape, X.dtype, y.dtype)
print(X[0], y[0])

X_train, X_test, y_train, y_test = X[:200], X[200:], y[:200], y[200:]
print(y_test)

lr.gradient_descent(X_train, y_train, 0.00001, 3000)
pred_y = lr.predict(X_test)
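To see how well it does (a small addition, not part of the original answer), you can compare the predictions with y_test and inspect the learned weights:

# mean squared error on the held-out 100 examples
test_mse = np.mean((pred_y - y_test)**2)
print("Test MSE : ", test_mse)
# learned weights; weights[0] corresponds to the bias column X[:, 0] = 1
print("Weights : ", lr.weights)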