Reputation: 406
The goal is to figure out why the report produced by my sklearn "summary" implementation does not match the results of statsmodels OLS. The only thing that matches is the beta coefficients.
import pandas as pd
import numpy as np
from statsmodels.regression.linear_model import OLS
from sklearn import linear_model
from scipy.stats import t
class LinearRegression(linear_model.LinearRegression):
    """
    LinearRegression class after sklearn's, but calculates t-statistics
    and p-values for the model coefficients (betas).
    Additional attributes available after .fit() are `t_stats` and
    `p_values`, of shape (n_coefs,).
    This class sets the intercept to 0 by default, since usually we include it
    in X.
    """
    def __init__(self, *args, **kwargs):
        if "fit_intercept" not in kwargs:
            kwargs["fit_intercept"] = False
        super(LinearRegression, self).__init__(*args, **kwargs)

    def fit(self, X, y):
        self = super(LinearRegression, self).fit(X, y)
        # std errors
        uhat = y - (X @ self.coef_).ravel()
        k = np.shape(X)[1]
        s2 = (uhat.T @ uhat) / (y.shape[0])
        var = s2 * np.linalg.inv(X.T @ X)
        self.se = np.sqrt(np.diag(var))
        # t-stats
        self.t_stats = self.coef_ / self.se
        # p-values
        self.df = y.shape[0] - k  # degrees of freedom: N minus number of parameters
        self.p_values = 2 * t.sf(abs(self.t_stats), self.df)
        # R-squared
        tss = (y - np.mean(y)).T @ (y - np.mean(y))
        rss = uhat.T @ uhat
        self.rsq = 1 - rss / tss
        self.summary = pd.DataFrame({
            "beta": self.coef_.reshape(1, -1).tolist()[0],
            "se": self.se.reshape(1, -1).tolist()[0],
            "t_stats": self.t_stats.reshape(1, -1).tolist()[0],
            "p_values": self.p_values.reshape(1, -1).tolist()[0],
        })
        return self
Running the class on a toy dataset, we can test the results:
import statsmodels.api as sm
data = sm.datasets.longley.load_pandas()
# Estimate statsmodels OLS
model = OLS(endog=data.endog,exog=data.exog).fit()
# Estimate Sklearn with report like statsmodels OLS
model2 = LinearRegression(fit_intercept=False).fit(data.exog,np.array(data.endog))
model2.summary
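A quick way to localize the discrepancy is to print the two reports side by side; the betas agree but the standard errors (and therefore the t-statistics and p-values) do not:
# compare statsmodels and the custom class column by column
print(np.c_[model.params, model2.coef_])  # betas: identical
print(np.c_[model.bse, model2.se])        # standard errors: different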
I suspect one of my formulas does not match the correct one.
Upvotes: 3
Views: 382
Reputation: 196
Here is a class you can use to obtain a LinearRegression model summary with scikit-learn. The key difference from your implementation is the degrees of freedom: the residual variance has to be divided by n - p (the statsmodels convention), not by n, which is why your betas match while your standard errors, t-statistics and p-values do not.
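For reference, these are the standard OLS formulas the class below implements:

\hat{u} = y - X\hat{\beta}, \qquad
\hat{\sigma}^2 = \frac{\hat{u}^\top \hat{u}}{n - p}, \qquad
\widehat{\mathrm{Var}}(\hat{\beta}) = \hat{\sigma}^2 (X^\top X)^{-1}, \qquad
t_j = \frac{\hat{\beta}_j}{\mathrm{se}(\hat{\beta}_j)}, \qquad
p_j = 2\,\Pr\!\left(T_{n-p} > |t_j|\right)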
import numpy as np
import pandas as pd
from scipy.stats import t
from sklearn import linear_model
class LinearRegression(linear_model.LinearRegression):
    def __init__(self, *args, **kwargs):
        # fit an intercept by default; pass fit_intercept=False to disable it
        kwargs.setdefault("fit_intercept", True)
        self.with_intercept = kwargs["fit_intercept"]
        super(LinearRegression, self).__init__(*args, **kwargs)

    def fit(self, X, y):
        self = super(LinearRegression, self).fit(X, y)
        y_pred = self.predict(X)
        residuals = y - y_pred
        residual_sum_of_squares = residuals.T @ residuals
        if self.with_intercept:
            coefficients = [coef for coef in self.coef_]
            self.coefficients = [self.intercept_] + coefficients
            p = len(X.columns) + 1  # plus one because an intercept is added
            new_X = np.empty(shape=(len(X), p), dtype=float)
            new_X[:, 0] = 1
            new_X[:, 1:p] = X.values
        else:
            self.coefficients = self.coef_
            p = len(X.columns)
            new_X = X.values
        # standard errors: residual variance uses n - p degrees of freedom
        sigma_squared_hat = residual_sum_of_squares / (len(X) - p)
        var_beta_hat = np.linalg.inv(new_X.T @ new_X) * sigma_squared_hat
        self.std_errors = [var_beta_hat[p_, p_] ** 0.5 for p_ in range(p)]
        # t values
        self.t_values = np.array(self.coefficients) / self.std_errors
        # p values: use the same n - p degrees of freedom as statsmodels
        freedom_degree = len(X) - p
        self.p_values = 2 * t.sf(abs(self.t_values), freedom_degree)
        # summary
        self.summary = pd.DataFrame()
        self.summary['Coefficients'] = self.coefficients
        self.summary['Standard errors'] = self.std_errors
        self.summary['t values'] = self.t_values
        self.summary['p values'] = self.p_values
        return self
This is how you can try the class and fit a model:
import statsmodels.api as sm
data = sm.datasets.longley.load_pandas()
X = pd.DataFrame(data.exog)
y = np.array(data.endog)
# pass fit_intercept=False as a parameter if you want to exclude the intercept
model_linear_regression = LinearRegression().fit(X, y)
model_linear_regression.summary
Result:
   Coefficients  Standard errors  t values  p values
0 -3.482259e+06       890420.379    -3.911     0.004
1  1.506190e+01           84.915     0.177     0.863
2 -3.580000e-02            0.033    -1.070     0.313
3 -2.020200e+00            0.488    -4.136     0.003
4 -1.033200e+00            0.214    -4.822     0.001
5 -5.110000e-02            0.226    -0.226     0.826
6  1.829151e+03          455.478     4.016     0.003
You can train the equivalent statsmodels OLS model, making the necessary changes if you want to include the intercept/constant, as follows:
with_intercept = True
if with_intercept:
    p = len(X.columns) + 1  # plus one because an intercept is added
    new_X = np.empty(shape=(len(X), p), dtype=float)
    new_X[:, 0] = 1
    new_X[:, 1:p] = X.values
    model_statsmodel = sm.OLS(endog=y, exog=new_X).fit()
else:
    model_statsmodel = sm.OLS(endog=y, exog=X).fit()
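Equivalently, you can let statsmodels build the constant column with sm.add_constant, which prepends a column of ones just like the manual new_X above:
# sm.add_constant(X) adds a leading column of ones to the design matrix
model_statsmodel = sm.OLS(endog=y, exog=sm.add_constant(X)).fit()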
To finish, you can verify the results are the same this way:
assert np.allclose(np.round(model_statsmodel.params, 3), np.round(model_linear_regression.coefficients, 3))
assert np.allclose(np.round(model_statsmodel.bse, 3), np.round(model_linear_regression.std_errors, 3))
assert np.allclose(np.round(model_statsmodel.tvalues, 3), np.round(model_linear_regression.t_values, 3))
# with the n - p degrees of freedom used above, the p-values should agree as well,
# up to possible rounding in the last decimal
assert np.allclose(np.round(model_statsmodel.pvalues, 3), np.round(model_linear_regression.p_values, 3))
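As a final visual check, you can print both reports and compare them line by line:
print(model_statsmodel.summary())        # full statsmodels report
print(model_linear_regression.summary)   # the DataFrame built by the class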
Hope it is helpful :)
Upvotes: 2