Reputation: 11
I'm quite new to ML and I'm trying to do a linear regression with quite a simple dataset: text
I did two different regression, one by hand and the other one using sci kit learn, where in the latter I achieved a good fit.
In my code though, I tried doing gradient descent using different learning rates to find the best fitted model. The issue I'm having is that, the final cost function is too high and fitted curve is really bad. From my understanding, I'm having a high bias but I don't see how can I fix it, since I don't have more features and the data structure is clearly linear.
I also applied L2 changing the regularization parameter but I don't find a way to get a better fit. Please I need help to understand whats going on here:
# for array computations and loading data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import copy, math
def compute_cost(X, y, w, b):
m = X.shape[0]
cost = 0.0
for i in range(m):
f_wb_i = np.dot(X[i], w) + b
cost = cost + (f_wb_i - y[i])**2
cost = cost / (2 * m)
return cost
def compute_gradient(X, y, w, b, lambda=0.0):
m = X.shape[0]
dj_dw = 0
dj_db = 0
# compute the gradient with L2 regularization
for i in range(m):
f_wb_i = np.dot(X[i], w) + b
error = (f_wb_i - y[i])
dj_dw += error * X[i] + lambda*w/m
dj_db += error
dj_dw = dj_dw/m
dj_db = dj_db/m
return dj_dw, dj_db
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda=0.0):
"""
Performs batch gradient descent to learn w and b. Updates w and b by taking
num_iters gradient steps with learning rate alpha
Args:
X (ndarray (m,n)) : Data, m examples with n features
y (ndarray (m,)) : target values
w_in (ndarray (n,)) : initial model parameters
b_in (scalar) : initial model parameter
cost_function : function to compute cost
gradient_function : function to compute the gradient
alpha (float) : Learning rate
num_iters (int) : number of iterations to run gradient descent
Returns:
w (ndarray (n,)) : Updated values of parameters
b (scalar) : Updated value of parameter
"""
# An array to store cost J and w's at each iteration primarily for graphing later
J_history = []
w = copy.deepcopy(w_in) #avoid modifying global w within function
b = b_in
epsilon = 1e-6
for i in range(num_iters):
# Calculate the gradient and update the parameters
dj_db,dj_dw = gradient_function(X, y, w, b, lambda) ##None
# Update Parameters using w, b, alpha and gradient
w = w - alpha * dj_dw ##None
b = b - alpha * dj_db ##None
#if cost is not decreasing break
if i>3 and abs(J_history[-1] - J_history[-2]) < epsilon:
break
# Save cost J at each iteration
if i<100000: # prevent resource exhaustion
J_history.append( cost_function(X, y, w, b))
# Print cost every at intervals 10 times or as many iterations if < 10
if i% math.ceil(num_iters / 10) == 0:
print(f"Iteration {i}: Cost {J_history[-1]}")
return w, b, J_history #return final w,b and J history for graphing
def z_score_normalize(X):
"""
Normalizes the features in X.
"""
mu = np.mean(X)
sigma = np.std(X)
X_norm = (X - mu) / sigma
return X_norm, mu, sigma
#-----------Load the dataset from the csv file-----------
filename = r'MyProjects\datasets\linear_simple.csv'
df = pd.read_csv(filename, delimiter=',')
# display first part info
print(df.head())
# get header
header = list(df)
print(header)
#--------------Preprocessing data----------------------
# get class set and features set
X = df.iloc[:, 1]
Y = df.iloc[:, 0]
print("Shape of new dataframes - {} , {}".format(X.shape, Y.shape))
#--------------Exploratory data analysis----------------------
# get statistics of the data
print(X.describe())
print(Y.describe())
# ------------ split data into train and test-----------------------
print("-------------------- SPLIT DATA-------------------------")
X_train, x_, y_train, y_ = train_test_split(X,Y, test_size=0.4, random_state=1)
# get cross validation and test sets
X_cv, X_test, y_cv, y_test = train_test_split(x_,y_,test_size=0.5, random_state=1)
del x_, y_
X_train= np.array(X_train)
y_train= np.array(y_train)
X_cv= np.array(X_cv)
y_cv= np.array(y_cv)
X_test= np.array(X_test)
y_test= np.array(y_test)
#print shapes
print("Shape of X_train and y_train - {} , {}".format(X_train.shape, y_train.shape))
print("Shape of X_cv and y_cv - {} , {}".format(X_cv.shape, y_cv.shape))
print("Shape of X_test and y_test - {} , {}".format(X_test.shape, y_test.shape))
#-------------- Normalize the data----------------------
X_train, mu, sigma = z_score_normalize(X_train)
X_cv = (X_cv - mu) / sigma
X_test = (X_test - mu) / sigma
#--------------Visualize the data----------------------
plt.scatter(X_train, y_train, color='red')
plt.title('Salary vs Experience')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
#-------------- compute cost function----------------------
#w_init = np.zeros(1,dtype=float)
w_init = np.array([0],dtype=float)
b_init = 0
print("w_init and b_init - {} , {}".format(w_init, b_init))
# compute cost
J_test = compute_cost(X_train, y_train, w_init, b_init)
print(J_test)
#-------------- compute gradient -------------------
# compute cost
dj_dw, dj_db = compute_gradient(X_train, y_train, w_init, b_init)
print("dj_dw and dj_db - {} , {}".format(dj_dw, dj_db))
# -------------compute gradient descent---------------
alpha = [7e-3, 8e-4,8e-4, 1e-3 ,1e-2]
iter = 1000
lambda_ = 0.1
w_finals=[]
b_finals=[]
J_hists=[]
J_test = float('inf')
# run gradient descent for different alpha
for a in alpha:
print(f"Running gradient descent with alpha {a}")
w_final, b_final, J_hist = gradient_descent(X_train, y_train, w_init, b_init,
compute_cost, compute_gradient,
a, iter, lambda_)
w_finals.append(w_final)
b_finals.append(b_final)
J_hists.append(J_hist)
# get best alpha
best_alpha = alpha[J_hists.index(min(J_hists))]
print(f"Best alpha is {best_alpha}")
# get best w and b
w_final = w_finals[J_hists.index(min(J_hists))]
b_final = b_finals[J_hists.index(min(J_hists))]
print(f"Best w is {w_final} and best b is {b_final}")
# plot cost versus iteration
"""plt.plot(J_hist)
plt.title("Cost vs. iteration")
plt.ylabel('Cost')
plt.xlabel('iteration step')
plt.show()"""
#plot the data and the line
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, w_final*X_train + b_final, color='blue')
plt.title('Salary vs Experience')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
I tried:
I want to minimize J and get a good fit of the data, and understand what I'm doing wrong
Upvotes: 0
Views: 21