Issues when minimizing cost function in a simple linear regression

Question

I'm quite new to ML and I'm trying to do a linear regression with quite a simple dataset: text

I did two different regression, one by hand and the other one using sci kit learn, where in the latter I achieved a good fit.

In my code though, I tried doing gradient descent using different learning rates to find the best fitted model. The issue I'm having is that, the final cost function is too high and fitted curve is really bad. From my understanding, I'm having a high bias but I don't see how can I fix it, since I don't have more features and the data structure is clearly linear.

I also applied L2 changing the regularization parameter but I don't find a way to get a better fit. Please I need help to understand whats going on here:

# for array computations and loading data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import copy, math

def compute_cost(X, y, w, b):
    
    m = X.shape[0]
    cost = 0.0
    for i in range(m):
        f_wb_i = np.dot(X[i], w) + b
        cost = cost + (f_wb_i - y[i])**2
    cost = cost / (2 * m)
    return cost

def compute_gradient(X, y, w, b, lambda=0.0):
    
    m = X.shape[0]
    dj_dw = 0
    dj_db = 0
    # compute the gradient with L2 regularization
    for i in range(m):
        f_wb_i = np.dot(X[i], w) + b
        error = (f_wb_i - y[i])
        dj_dw += error * X[i] + lambda*w/m
        dj_db += error
    
    dj_dw = dj_dw/m
    dj_db = dj_db/m
        
    return dj_dw, dj_db

def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda=0.0): 
    """
    Performs batch gradient descent to learn w and b. Updates w and b by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X (ndarray (m,n))   : Data, m examples with n features
      y (ndarray (m,))    : target values
      w_in (ndarray (n,)) : initial model parameters  
      b_in (scalar)       : initial model parameter
      cost_function       : function to compute cost
      gradient_function   : function to compute the gradient
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,)) : Updated values of parameters 
      b (scalar)       : Updated value of parameter 
      """
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    
    epsilon = 1e-6
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db,dj_dw = gradient_function(X, y, w, b, lambda)   ##None

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               ##None
        b = b - alpha * dj_db               ##None
        
        #if cost is not decreasing break
        if i>3 and abs(J_history[-1] - J_history[-2]) < epsilon:
            break
      
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            J_history.append( cost_function(X, y, w, b))

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i}: Cost {J_history[-1]}")
        
    return w, b, J_history #return final w,b and J history for graphing

def z_score_normalize(X):
    """
    Normalizes the features in X. 
    """
    mu = np.mean(X)
    sigma = np.std(X)
    X_norm = (X - mu) / sigma
        
    return X_norm, mu, sigma

#-----------Load the dataset from the csv file-----------
filename = r'MyProjects\datasets\linear_simple.csv'
df = pd.read_csv(filename, delimiter=',')

# display first part info
print(df.head())

# get header
header = list(df)
print(header)

#--------------Preprocessing data----------------------
# get class set and features set
X = df.iloc[:, 1]
Y = df.iloc[:, 0]
print("Shape of new dataframes - {} , {}".format(X.shape, Y.shape))

#--------------Exploratory data analysis----------------------
# get statistics of the data
print(X.describe())
print(Y.describe())

# ------------ split data into train and test-----------------------
print("-------------------- SPLIT DATA-------------------------")
X_train, x_, y_train, y_ = train_test_split(X,Y, test_size=0.4, random_state=1)

# get cross validation and test sets
X_cv, X_test, y_cv, y_test = train_test_split(x_,y_,test_size=0.5, random_state=1)

del x_, y_

X_train= np.array(X_train)
y_train= np.array(y_train)
X_cv= np.array(X_cv)
y_cv= np.array(y_cv)
X_test= np.array(X_test)
y_test= np.array(y_test)

#print shapes
print("Shape of X_train and y_train - {} , {}".format(X_train.shape, y_train.shape))
print("Shape of X_cv and y_cv - {} , {}".format(X_cv.shape, y_cv.shape))
print("Shape of X_test and y_test - {} , {}".format(X_test.shape, y_test.shape))

#-------------- Normalize the data----------------------
X_train, mu, sigma = z_score_normalize(X_train)
X_cv = (X_cv - mu) / sigma
X_test = (X_test - mu) / sigma


#--------------Visualize the data----------------------
plt.scatter(X_train, y_train, color='red')
plt.title('Salary vs Experience')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

#-------------- compute cost function----------------------
#w_init = np.zeros(1,dtype=float)
w_init = np.array([0],dtype=float)
b_init = 0
print("w_init and b_init - {} , {}".format(w_init, b_init))

# compute cost
J_test = compute_cost(X_train, y_train, w_init, b_init)
print(J_test)

#-------------- compute gradient -------------------
# compute cost
dj_dw, dj_db = compute_gradient(X_train, y_train, w_init, b_init)
print("dj_dw and dj_db - {} , {}".format(dj_dw, dj_db))

# -------------compute gradient descent---------------

alpha = [7e-3, 8e-4,8e-4, 1e-3 ,1e-2]
iter = 1000
lambda_ = 0.1

w_finals=[]
b_finals=[]
J_hists=[]

J_test = float('inf')
# run gradient descent for different alpha 
for a in alpha:
    print(f"Running gradient descent with alpha {a}")
    w_final, b_final, J_hist = gradient_descent(X_train, y_train, w_init, b_init,
                                                compute_cost, compute_gradient, 
                                                a, iter, lambda_)
    w_finals.append(w_final)
    b_finals.append(b_final)
    J_hists.append(J_hist)
    
# get best alpha
best_alpha = alpha[J_hists.index(min(J_hists))]
print(f"Best alpha is {best_alpha}")

# get best w and b
w_final = w_finals[J_hists.index(min(J_hists))]
b_final = b_finals[J_hists.index(min(J_hists))]
print(f"Best w is {w_final} and best b is {b_final}")

# plot cost versus iteration  
"""plt.plot(J_hist)
plt.title("Cost vs. iteration")
plt.ylabel('Cost')
plt.xlabel('iteration step')
plt.show()"""

#plot the data and the line
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, w_final*X_train + b_final, color='blue')
plt.title('Salary vs Experience')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

I tried:

normalization
I also applied L2 changing the regularization parameter but I don't find a way to get a better fit.
took the best b,w parameters from the sci kit file (28.97 , 52.35) and applied as initial parameters, it worked well of course.

I want to minimize J and get a good fit of the data, and understand what I'm doing wrong

Issues when minimizing cost function in a simple linear regression

Answers (0)

Related Questions