Python sort out columns in DataFrame for OLS regression

Question

I have a csv file with the following columns:

I am trying to run a multiple OLS regression in Python, for instance regressing 'aig-RF' (the dependent variable) on 'Mkt-RF', 'SMB' and 'HML' (the regressors).

It seems like I first need to arrange the columns into a DataFrame suitable for the regression, but I cannot work out how:

# Regression

# Regressors: the three factor columns, selected together as one DataFrame.
x = df[['Mkt-RF','SMB','HML']]
# Dependent variable: the 'aig-RF' column, selected as a Series.
y = df['aig-RF']
# Rebuild df from x and y under the keys 'x' and 'y'.
# NOTE(review): x is a multi-column DataFrame, not a single column; this is
# the construction the question asks about.
df = pd.DataFrame({'x':x, 'y':y})
# Column of ones, intended as the regression intercept.
df['constant'] = 1
df.head()
# Fit y on the 'constant' and 'x' columns of the rebuilt frame.
sm.OLS(y,df[['constant','x']]).fit().summary()

The full code is:

import numpy as np
import pandas as pd
import statsmodels.api as sm
from pandas import DataFrame
from sklearn import linear_model

def ReadFF(sIn):
    """
    Purpose:
        Read the FF data

    Inputs:
        sIn     string, name of input file

    Return value:
        df      dataframe, data: the columns Date, Mkt-RF, SMB, HML, RF plus
                a constant column C, indexed by the parsed dates
    """
    # The column names are supplied explicitly; header=3 tells pandas which
    # line of the file holds the (replaced) header row.
    df= pd.read_csv(sIn, header=3, names=["Date","Mkt-RF","SMB","HML","RF"])
    # Drop every row that has a missing value in any column
    df= df.dropna(how='any')

    # Reformat the dates, as date-time, and place them as index
    vDate= pd.to_datetime(df["Date"].values, format='%Y%m%d')
    df.index= vDate

    # Add in a constant
    df["C"]= np.ones(len(vDate))

    print(df)
    return df

def JoinStock(df, sStock, sPer):
    """
    Purpose:
        Join the stock into the dataframe, as excess returns

    Inputs:
        df      dataframe, data including RF
        sStock  string, name of stock to read
        sPer    string, extension indicating period

    Return value:
        df      dataframe, enlarged with the price column sStock and the
                excess-return column sStock + "-RF"; the frame passed in is
                not modified
    """
    # When df is indexed by datetimes, parse the dates of the stock file as
    # well, so the join below matches datetime against datetime instead of
    # depending on an implicit string-to-datetime conversion.
    bDates= isinstance(df.index, pd.DatetimeIndex)
    df1= pd.read_csv(sStock+"_"+sPer+".csv", index_col="Date",
                     usecols=["Date", "Adj Close"], parse_dates=bDates)
    df1.columns= [sStock]

    # Add prices to original dataframe, to get correct dates
    df= df.join(df1, how="left")

    # Extract returns as 100 * log-difference; the first observation is
    # missing (NaN), as one observation is lost differencing
    vR= 100*np.log(df[sStock]).diff()

    # Add excess return to dataframe
    df[sStock + "-RF"]= vR - df["RF"]

    print(df)
    return df

def SaveFF(df, asStock, sOut):
    """
    Purpose:
        Save data for FF regressions

    Inputs:
        df      dataframe, all data
        asStock list of strings, stocks
        sOut    string, output file name

    Output:
        file written to disk

    Return value:
        df      dataframe, the input with every row containing a missing
                value removed
    """
    # Only complete rows are written and returned
    df= df.dropna(how='any')

    # Factor columns first, then one excess-return column per stock
    asOut= ['Mkt-RF', 'SMB', 'HML', 'RF', 'C'] + [sStock+"-RF" for sStock in asStock]

    print ("Writing columns ", asOut, "to file ", sOut)
    df.to_csv(sOut, columns=asOut, index_label="Date", float_format="%.8g")

    print(df)
    return df

def main():
    """
    Purpose:
        Read the factor data, join the excess returns of each stock, save
        the combined data to file, and then attempt the OLS regression
    """
    # Period extension, input file, output file stem and list of stocks
    sPer= "0018"
    sIn= "Research_Data_Factors_weekly.csv"
    sOut= "ffstocks"
    asStock= ["aig", "ford", "ibm", "xom"]

    # Initialisation
    df= ReadFF(sIn)
    for sStock in asStock:
        df= JoinStock(df, sStock, sPer)

    # Output
    # NOTE(review): the dataframe returned by SaveFF (rows with missing
    # values dropped) is not assigned, so df below is the frame as it was
    # before that dropna.
    SaveFF(df, asStock, sOut+"_"+sPer+".csv")
    print ("Done")

    # Regression
    # Regressors: the three factor columns, selected together as one DataFrame.
    x = df[['Mkt-RF','SMB','HML']]
    # Dependent variable: the excess-return column for aig.
    y = df['aig-RF']
    # NOTE(review): x is a multi-column DataFrame stored under the single key
    # 'x'; this is the construction the question asks about.
    df = pd.DataFrame({'x':x, 'y':y})
    df['constant'] = 1
    df.head()
    sm.OLS(y,df[['constant','x']]).fit().summary()

What exactly do I need to modify in pd.DataFrame in order to get the multiple OLS regression table?

gyoza · Accepted Answer

I propose changing the first chunk of your code to the following (mostly just swapping the order of the lines):

# add constant column to the original dataframe (this is the intercept term)
df['constant'] = 1

# define x as a subset of original dataframe: the regressors plus the constant
x = df[['Mkt-RF', 'SMB', 'HML', 'constant']]

# define y as a series
y = df['aig-RF']

# pass x as a dataframe, while pass y as a series.
# missing='drop' leaves out rows that contain NaN: the return column has a
# missing first observation from differencing, and with the default
# missing='none' no NaN checking is done before fitting.
results = sm.OLS(y, x, missing='drop').fit()

# print the table explicitly, so it is also shown when this runs inside a
# function or a script rather than as the last expression of a notebook cell
print(results.summary())

Hope this helps.

Python sort out columns in DataFrame for OLS regression

Answers (1)

Related Questions