Cost function is not decreasing in gradient descent implementation

I am implementing multiple linear regression in Python from scratch. The cost after every epoch increases very rapidly and eventually overflows. What is going wrong? Is there a logical or methodological error? Here is the code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

class LinearRegression:

    def __init__(self):
        #initialize parameters with none values
        self.cost_history = None
        self.std_dev = None
        self.mean = None
        self.weights = None

    def set_mean_and_std_dev(self, X_data, axis=0):
        #mean and std_deviation of training_data 
        self.std_dev = np.array(np.std(X_data, axis = axis))
        self.mean = np.array(np.mean(X_data, axis = axis))
        print("Mean : ", self.mean)
        print("Standard deviation : ", self.std_dev)
        return

    def normalize(self, X_data):
        #normalizing the data
        X_data = (X_data - self.mean)/(self.std_dev+0.00001)
        return

    def cost(self, X, y):
        #squared error based cost
        return np.sum((np.dot(X, self.weights.T)-y)**2)

    def get_gradient(self, X, y, pred_y, learning_rate):
        grad = np.zeros(shape=(X.shape[1], ), dtype='float64')
        #print("X.shape : ", X.shape, " grad.shape : ", grad.shape, "pred_y.shape", pred_y.shape, "y.shape : ", y.shape)

        for ix in range(X.shape[0]):
            #for each example in X
            x_example = X[ix, :]
            error = (pred_y[ix] - y[ix])

            for jx in range(grad.shape[0]):
                #for each feature of X
                grad[jx] = grad[jx] + x_example[jx]*error
        for ix in range(grad.shape[0]):
            #divide by the number of examples
            grad[jx] = grad[jx]*learning_rate*(1.0/X.shape[0])
        return grad


    def gradient_descent(self, X, y, learning_rate=0.001, num_iterations=100):

        self.weights = np.zeros(shape=(X.shape[1],),  dtype='float64')
        for ix in range(X.shape[1]):
            #initialize weights with random values
            self.weights[ix] = np.random.rand()*100

        #initialize cost history
        self.cost_history = []

        for ix in range(num_iterations):
            pred_y = np.dot(X, self.weights.T)
            gradient = self.get_gradient(X=X, y=y, pred_y=pred_y, learning_rate=learning_rate)
            self.weights = self.weights - gradient
            cost = self.cost(X, y)
            self.cost_history.append(cost)

            if ix%10 == 0:
                print("(learning_rate/(X.shape[0]))*gradient) : ", gradient)            
                print("Cost at ", ix, " epoch is : ", cost)

    def predict(self, X):
        return np.dot(X, self.weights.T)

    def get_weights(self):
        return self.weights

#create a linear regression object
lr = LinearRegression()


# Some random function generator:
# y = 1.23 + 2.5*X1 + 2*X2 - X3 - 3*X4
def generate_data(X):
    y = np.zeros(shape=(X.shape[0], ), dtype='float64')
    for ix in range(X.shape[0]):
        y[ix] = 1.23 + 2.5*X[ix, 0] + 2*X[ix, 1] - X[ix, 2] - 3*X[ix, 3] 
    return y


X = np.zeros(shape=(300, 5), dtype='float64')
data_gen = [[np.random.rand()*107 for jx in range(4)] for ix in range(300)]
X[:, 1:] = np.array(data_gen, dtype='float64')

y = generate_data(X[:, 1:])

lr.set_mean_and_std_dev(X)
lr.normalize(X)

X[:, 0] = 1
print(X.shape, y.shape, X.dtype, y.dtype)
print(X[0], y[0])



X_train, X_test, y_train, y_test = X[:200], X[200:], y[:200], y[200:]
print(y_test)


lr.gradient_descent(X_train, y_train, 0.01, 500)
pred_y = lr.predict(X_test)

Answers (1)

Huang

There is a mistake in the get_gradient function: the index jx is used inside the loop over ix, so only the last component of grad gets multiplied by the learning rate and divided by the number of examples, while the other components keep their raw summed values:

for ix in range(grad.shape[0]):
    #divide by the number of examples
    grad[jx] = grad[jx]*learning_rate*(1.0/X.shape[0])

You also use more for loops than necessary. Instead, you can vectorize get_gradient:

def get_gradient(self, X, y, pred_y, learning_rate):
    errors = (pred_y - y)
    grad = X.T.dot(errors)
    #divide by the number of examples
    grad = grad*learning_rate/X.shape[0]
    return grad
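
If you want to convince yourself that this vectorized form matches the original double loop, here is a quick standalone check on a tiny array (just an illustration, not part of the class; X_tiny and errors are made-up values):

import numpy as np

X_tiny = np.array([[1.0, 2.0], [3.0, 4.0]])
errors = np.array([0.5, -1.0])            # stands in for pred_y - y

# double-loop accumulation, as in the original code
grad_loop = np.zeros(X_tiny.shape[1])
for ix in range(X_tiny.shape[0]):
    for jx in range(X_tiny.shape[1]):
        grad_loop[jx] += X_tiny[ix, jx] * errors[ix]

# vectorized equivalent
grad_vec = X_tiny.T.dot(errors)

print(np.allclose(grad_loop, grad_vec))   # prints True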

Other problems:

  1. The initial weights are too large:

    self.weights = np.zeros(shape=(X.shape[1],),  dtype='float64')
    for ix in range(X.shape[1]):
        #initialize weights with random values
        self.weights[ix] = np.random.rand()*100
    

    You can use smaller initial values, generated in a single line: self.weights = np.random.rand(X.shape[1])

  2. The learning rate is too large; try: lr.gradient_descent(X_train, y_train, 0.00001, 3000)

  3. It looks like you intended set_mean_and_std_dev and normalize to standardize the data, but they have no effect here: normalize computes the scaled array and then discards it, so X is never actually changed (a possible fix is sketched below).
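
Here is a minimal sketch of how normalize could return the standardized data so the scaling actually takes effect. Fitting the statistics on X[:, 1:] and assigning the result back to those columns is an assumption about how you want to use it, not something taken from the original code:

def normalize(self, X_data):
    #return the standardized array instead of discarding it;
    #the small constant avoids division by zero for constant columns
    return (X_data - self.mean) / (self.std_dev + 0.00001)

# usage in the driver script: scale only the feature columns,
# so the bias column of ones stays untouched
lr.set_mean_and_std_dev(X[:, 1:])
X[:, 1:] = lr.normalize(X[:, 1:])

With the features standardized, the gradient components are on comparable scales, and a learning rate closer to the original 0.01 should also converge.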

The final code:

import numpy as np

class LinearRegression:

    def __init__(self):
        #initialize parameters with none values
        self.cost_history = None
        self.std_dev = None
        self.mean = None
        self.weights = None

    def cost(self, X, y):
        #squared error based cost
        return np.sum((self.predict(X)-y)**2)

    def get_gradient(self, X, y, pred_y, learning_rate):
        errors = (pred_y - y)
        grad = X.T.dot(errors)
        #divide by the number of examples
        grad = grad*learning_rate/X.shape[0]
        return grad


    def gradient_descent(self, X, y,
                         learning_rate=0.001,
                         num_iterations=100):
        #initialize weights with random values
        self.weights = np.random.rand(X.shape[1])
        #initialize cost history
        self.cost_history = []

        for ix in range(num_iterations):
            pred_y = self.predict(X)
            gradient = self.get_gradient(X=X, y=y, pred_y=pred_y, learning_rate=learning_rate)
            self.weights = self.weights - gradient
            cost = self.cost(X, y)
            self.cost_history.append(cost)

            if ix%10 == 0:
                print("(learning_rate/(X.shape[0]))*gradient) : ", gradient)
                print("Cost at ", ix, " epoch is : ", cost)

    def predict(self, X):
        return np.dot(X, self.weights)


#create a linear regression object
lr = LinearRegression()

# Some random function generator:
# y = 1.23 + 2.5*X1 + 2*X2 - X3 - 3*X4
def generate_data(X):
    y = np.zeros(shape=(X.shape[0], ), dtype='float64')
    for ix in range(X.shape[0]):
        y[ix] = 1.23 + 2.5*X[ix, 0] + 2*X[ix, 1] - X[ix, 2] - 3*X[ix, 3]
    return y


X = np.zeros(shape=(300, 5), dtype='float64')
data_gen = [[np.random.rand()*107 for jx in range(4)] for ix in range(300)]
X[:, 1:] = np.array(data_gen, dtype='float64')

y = generate_data(X[:, 1:])

X[:, 0] = 1
print(X.shape, y.shape, X.dtype, y.dtype)
print(X[0], y[0])

X_train, X_test, y_train, y_test = X[:200], X[200:], y[:200], y[200:]
print(y_test)

lr.gradient_descent(X_train, y_train, 0.00001, 3000)
pred_y = lr.predict(X_test)
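
As an optional follow-up check (not part of the fix), you can measure the test error and inspect the learned weights; the variable names reuse those from the script above:

# mean squared error on the 100 held-out examples
mse = np.mean((pred_y - y_test)**2)
print("Test MSE :", mse)

# compare the learned weights against the generating coefficients
# [1.23, 2.5, 2, -1, -3] used in generate_data
print("Weights  :", lr.weights)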
