Reputation: 23
I am trying to reflect almost the same results as sklearn would give but I am not getting good results. The values of intercepts from my custom implementation and sklearn's implementation have a difference of 5, so I am trying to reduce this value here as much as possible as I can.
My code with sklearn is below:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=50000, n_features=15, n_informative=10, n_redundant=5,
n_classes=2, weights=[0.7], class_sep=0.7, random_state=15)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)
clf = linear_model.SGDClassifier(eta0=0.0001, alpha=0.0001, loss='log', random_state=15, penalty='l2', tol=1e-3, verbose=2, learning_rate='constant')
clf.fit(X=X_train, y=y_train) # fitting our model
print(clf.coef_, clf.coef_.shape, clf.intercept_)
This results in
(array([[-0.42336692, 0.18547565, -0.14859036, 0.34144407, -0.2081867 ,
0.56016579, -0.45242483, -0.09408813, 0.2092732 , 0.18084126,
0.19705191, 0.00421916, -0.0796037 , 0.33852802, 0.02266721]]),
(1, 15),
array([-0.8531383]))
My custom implementation
def initialize_weights(dim):
''' In this function, we will initialize our weights and bias'''
#initialize the weights to zeros array of (dim,1) dimensions
#you use zeros_like function to initialize zero
#initialize bias to zero
w = np.zeros_like(dim)
b = 0
return w,b
def sigmoid(z):
''' In this function, we will return sigmoid of z'''
# compute sigmoid(z) and return
return 1/(1+np.exp(-z))
def logloss(y_true,y_pred):
'''In this function, we will compute log loss '''
loss = 0
A = list(zip(y_true, y_pred))
for y, y_score in A:
loss += (-1/len(A))*(y*np.log10(y_score) + (1-y) * np.log10(1-y_score))
return loss
def gradient_dw(x,y,w,b,alpha,N):
'''In this function, we will compute the gardient w.r.to w '''
z = np.dot(w, x) + b
dw = x*(y - sigmoid(z)) - ((1/alpha)*(1/N) * w)
return dw
def gradient_db(x,y,w,b):
z = np.dot(w, x) + b
db = y - sigmoid(z)
return DB
def train(X_train,y_train,X_test,y_test,epochs,alpha,eta0, tol=1e-3):
''' In this function, we will implement logistic regression'''
#Here eta0 is learning rate
#implement the code as follows
# initalize the weights (call the initialize_weights(X_train[0]) function)
w, b = initialize_weights(X_train[0])
# for every epoch
train_loss = []
test_loss = []
for epoch in range(epochs):
# for every data point(X_train,y_train)
for x, y in zip(X_train, y_train):
#compute gradient w.r.to w (call the gradient_dw() function)
dw = gradient_dw(x, y, w, b, alpha, len(X_train))
#compute gradient w.r.to b (call the gradient_db() function)
db = gradient_db(x, y, w, b)
#update w, b
w = w + eta0 * dw
b = b + eta0 * db
# predict the output of x_train[for all data points in X_train] using w,b
y_pred = [sigmoid(np.dot(w, x)) for x in X_train]
#compute the loss between predicted and actual values (call the loss function)
train_loss.append(logloss(y_train, y_pred))
# store all the train loss values in a list
# predict the output of x_test[for all data points in X_test] using w,b
y_pred_test = [sigmoid(np.dot(w, x)) for x in X_test]
print(f"EPOCH: {epoch} Train Loss: {logloss(y_train, y_pred)} Test Loss: {logloss(y_test, y_pred_test)}")
#compute the loss between predicted and actual values (call the loss function)
test_loss.append(logloss(y_test, y_pred_test))
# you can also compare previous loss and current loss if the loss is not updating then stop the process and return w,b
return w,b, train_loss, test_loss
alpha=0.0001
eta0=0.0001
N=len(X_train)
epochs=50
w,b, train_loss, test_loss=train(X_train,y_train,X_test,y_test,epochs,alpha,eta0)
Thew, b results in
(array([-0.22281323, 0.10570237, -0.02506523, 0.16630429, -0.07033019,
0.27985805, -0.27348925, -0.04622113, 0.13212066, 0.05330409,
0.09926212, -0.00791336, -0.02920803, 0.1828124 , 0.03442375]),
-0.8019981458384148)
Please Help.
Upvotes: 1
Views: 1005
Reputation: 1512
In your function gradient_dw ()
the alpha
(i.e. the regularization term) should be in the numerator.
def gradient_dw(x,y,w,b,alpha,N):
'''In this function, we will compute the gardient w.r.to w '''
z = np.dot(w, x) + b
dw = x*(y - sigmoid(z)) - ((alpha)*(1/N) * w)
return dw
As the cost function of the logistic regression with regularization, is
And the gradient descent algorithm becomes following, by taking derivatives of the cost function w.r.t. weights
Another small correction to your code - need to add the intercept b
for calculation of the predicted values array to the following lines
y_pred = [sigmoid(np.dot(w, x) + b) for x in X_train]
y_pred_test = [sigmoid(np.dot(w, x) + b) for x in X_test]
So in the final form of the full code is as below and it would give the difference with the Scikit-learn implementation in the order of 0.001 for all the weights.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
import matplotlib.pyplot as plt
X, y = make_classification(n_samples=50000, n_features=15, n_informative=10, n_redundant=5,
n_classes=2, weights=[0.7], class_sep=0.7, random_state=15)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)
clf = linear_model.SGDClassifier(eta0=0.0001, alpha=0.0001, loss='log', random_state=15, penalty='l2', tol=1e-3, verbose=2, learning_rate='constant')
clf.fit(X=X_train, y=y_train) # fitting our model
print(clf.coef_, clf.coef_.shape, clf.intercept_)
def initialize_weights(dim):
''' In this function, we will initialize our weights and bias'''
#initialize the weights to zeros array of (dim,1) dimensions
#you use zeros_like function to initialize zero
#initialize bias to zero
w = np.zeros_like(dim)
b = 0
return w,b
def sigmoid(z):
''' In this function, we will return sigmoid of z'''
# compute sigmoid(z) and return
return 1/(1+np.exp(-z))
def logloss(y_true,y_pred):
'''In this function, we will compute log loss '''
loss = 0
A = list(zip(y_true, y_pred))
for y, y_score in A:
loss += (-1/len(A))*(y*np.log10(y_score) + (1-y) * np.log10(1-y_score))
return loss
def gradient_dw(x,y,w,b,alpha,N):
'''In this function, we will compute the gardient w.r.to w '''
z = np.dot(w, x) + b
dw = x*(y - sigmoid(z)) - ((alpha)*(1/N) * w)
return dw
def gradient_db(x,y,w,b):
z = np.dot(w, x) + b
db = y - sigmoid(z)
return db
def train(X_train,y_train,X_test,y_test,epochs,alpha,eta0, tol=1e-3):
''' In this function, we will implement logistic regression'''
#Here eta0 is learning rate
#implement the code as follows
# initalize the weights (call the initialize_weights(X_train[0]) function)
w, b = initialize_weights(X_train[0])
# for every epoch
train_loss = []
test_loss = []
for epoch in range(epochs):
# for every data point(X_train,y_train)
for x, y in zip(X_train, y_train):
#compute gradient w.r.to w (call the gradient_dw() function)
dw = gradient_dw(x, y, w, b, alpha, len(X_train))
#compute gradient w.r.to b (call the gradient_db() function)
db = gradient_db(x, y, w, b)
#update w, b
w = w + eta0 * dw
b = b + eta0 * db
# predict the output of x_train[for all data points in X_train] using w,b
y_pred = [sigmoid(np.dot(w, x)) for x in X_train]
#compute the loss between predicted and actual values (call the loss function)
train_loss.append(logloss(y_train, y_pred))
# store all the train loss values in a list
# predict the output of x_test[for all data points in X_test] using w,b
y_pred_test = [sigmoid(np.dot(w, x)) for x in X_test]
print(f"EPOCH: {epoch} Train Loss: {logloss(y_train, y_pred)} Test Loss: {logloss(y_test, y_pred_test)}")
#compute the loss between predicted and actual values (call the loss function)
test_loss.append(logloss(y_test, y_pred_test))
# you can also compare previous loss and current loss if the loss is not updating then stop the process and return w,b
return w,b, train_loss, test_loss
alpha=0.0001
eta0=0.0001
N=len(X_train)
epochs=50
w,b, train_loss, test_loss=train(X_train,y_train,X_test,y_test,epochs,alpha,eta0)
print("Difference between custom w and Scikit-learn's clf.coef_ ", w - clf.coef_)
print("Difference between custom intercept b and Scikit-learn's clf.intercept_ ", b - clf.intercept_)
Difference between custom w and Scikit-learn's clf.coef_ [[-0.00642552 0.00755955 0.00012041 -0.00335043 -0.01309563 0.00978314
0.00724319 0.00418409 0.0125563 -0.00701162 0.00169655 -0.00480346
-0.00173041 0.00056208 0.00032075]]
Difference between custom intercept b and Scikit-learn's clf.intercept_ [-0.03911387]
Upvotes: 1