Reputation: 1803
I am attempting to build a function which will pull data for any stock and then plot a regression. However, I am running into issues with the source data. My question is - how do I take a time series in a pandas data frame and plot the linear trend over time? My code below:
This code will produce the regression:
# Example on random data: fit a straight line to noisy points and draw it.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression

sns.set()

# 50 seeded random points scattered around the line y = 2x - 5.
rng = np.random.RandomState(1)
x = 10 * rng.rand(50)
y = 2 * x - 5 + rng.randn(50)

# First figure: the raw points only.
plt.scatter(x, y)
plt.show()

# The model is fitted on a 2-D feature array, so x is passed as one column.
x_column = x[:, np.newaxis]
model = LinearRegression(fit_intercept=True)
model.fit(x_column, y)

# Evaluate the fitted line on an evenly spaced grid from 0 to 10.
xfit = np.linspace(0, 10, 1000)
xfit_column = xfit[:, np.newaxis]
yfit = model.predict(xfit_column)

# Second figure: the points with the fitted line drawn over them.
plt.scatter(x, y)
plt.plot(xfit, yfit)
plt.show()
This is my attempt to pass the data via a dataframe:
# Attempt to repeat the regression on downloaded JPM prices.
# NOTE: this snippet does not run as written -- see the inline notes.
# It also relies on `plt` and `np`, which are not imported in this snippet.
from datetime import datetime
import pandas_datareader.data as web
start = datetime(2017, 8, 1)
end = datetime(2018, 7, 30)
data_SP = web.DataReader('JPM', 'iex', start, end)
# `dates` is never defined in this snippet, so this line raises NameError.
y = dates # not sure how to get here?
# `data` is not defined either; the frame loaded above is named `data_SP`.
# As written, the close price is the x argument and the dates are the y argument.
plt.scatter(data['close'], y);
plt.show()
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model.fit(data['close'][:, np.newaxis], y)
# Same 0-to-10 grid as the random-data example.
# NOTE(review): presumably not the range of the close prices -- confirm.
xfit = np.linspace(0, 10, 1000)
yfit = model.predict(xfit[:, np.newaxis])
plt.scatter(data['close'], y)
plt.plot(xfit, yfit);
plt.show()
Upvotes: 1
Views: 755
Reputation: 1803
I've adjusted the code to the following. It produces visuals showing returns in excess of a benchmark. There are a lot of directions to go with the code. For example, we could have it cycle through all 500 stocks in the S&P and find the best-returning stocks against the index, or we could have it cycle through all 500 stocks over 1-month periods and work out, based on history, which stock is best to hold when. The visuals are a nice touch to the analysis.
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import pandas
from sklearn.linear_model import LinearRegression
import pandas_datareader.data as pdr
def close_price_trending(analysis):
    """Scatter one column of ``data_sample_processed`` with its linear fit.

    Reads the module-level ``days_since``, ``dates`` and
    ``data_sample_processed``. ``analysis`` is the label of the column to
    plot. The regression uses ``days_since`` as its single feature; both the
    observed values and the fitted values are drawn against ``dates`` and
    the figure is shown with ``plt.show()``.
    """
    # One feature column: the entries of days_since.
    day_column = np.array(days_since)[:, np.newaxis]
    observed = data_sample_processed[analysis]

    trend = LinearRegression(fit_intercept=True)
    trend.fit(day_column, observed)
    fitted = trend.predict(day_column)

    plt.scatter(dates, observed)
    plt.scatter(dates, fitted)
    plt.xlabel('date')
    plt.ylabel('close')
    plt.show()
def return_excess_benchmark1(analysis, benchmark):
    """Plot the cumulative return of ``analysis`` in excess of ``benchmark``.

    Reads the module-level ``days_since``, ``dates`` and
    ``data_sample_processed``. For each of the two columns the running
    product of ``1 + value`` is taken; the plotted series is the analysis
    product minus the benchmark product. A linear fit of that series against
    ``days_since`` is drawn on the same axes.

    Args:
        analysis: Column label of the series being evaluated.
        benchmark: Column label of the series it is compared against.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    fig.subplots_adjust(top=0.85)
    ax.set_title(str(analysis) + ' O/U ' + str(benchmark))

    # Compute the excess series once and reuse it for the scatter and the fit.
    growth_analysis = (1 + data_sample_processed[analysis]).cumprod()
    growth_benchmark = (1 + data_sample_processed[benchmark]).cumprod()
    excess = growth_analysis - growth_benchmark

    day_column = np.array(days_since)[:, np.newaxis]
    model = LinearRegression(fit_intercept=True)
    model.fit(day_column, excess)
    yfit = model.predict(day_column)

    # Draw on the Axes created above rather than on pyplot's implicit
    # "current axes", so the function cannot end up drawing on another figure.
    ax.scatter(dates, excess)
    ax.scatter(dates, yfit)
    ax.set_xlabel('date')
    # The plotted quantity is a difference of cumulative returns, not a close
    # price, so the axis is labelled accordingly.
    ax.set_ylabel('excess cumulative return')
    # NOTE(review): fig.show() is kept from the original; in a plain script
    # a trailing plt.show() may be needed to keep the window open -- confirm.
    fig.show()
# get and process data
start = datetime(2015, 8, 1)
end = datetime(2018, 7, 30)
Symbol_List = ['GSLC', 'AGG', 'JPM', 'CAR', 'IVV', 'DSI', 'VTI']

# One frame per ticker, with its 'close' column renamed to the ticker symbol,
# concatenated side by side.
per_symbol_frames = [
    pdr.DataReader(symbol, 'iex', start, end).rename(columns={'close': symbol})
    for symbol in Symbol_List
]
data = pandas.concat(per_symbol_frames, axis=1)

# Keep only the renamed columns, convert them to row-over-row percentage
# changes, and replace missing values with 0.
data_sample = data[Symbol_List]
data_sample_processed = data_sample.pct_change().fillna(0)

# Index labels are parsed as 'YYYY-MM-DD' strings into datetimes, then
# expressed as whole-day offsets from `start` for use as a numeric feature.
dates = [
    datetime.strptime(label, "%Y-%m-%d")
    for label in data_sample_processed.index
]
days_since = [(day - start).days for day in dates]

# start analysis
analysis_symbol_1 = 'DSI'
analysis_symbol_2 = 'GSLC'
benchmark_1 = 'VTI'
for symbol in (analysis_symbol_1, analysis_symbol_2):
    return_excess_benchmark1(symbol, benchmark_1)
Upvotes: 0
Reputation: 146
Regression can't take datetime objects; they must be converted to a numeric type:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from datetime import datetime
import pandas as pd
pd.core.common.is_list_like = pd.api.types.is_list_like
from sklearn.linear_model import LinearRegression
import pandas_datareader.data as web
start = datetime(2017, 8, 1)
end = datetime(2018, 7, 30)
data_SP = web.DataReader('JPM', 'iex', start, end)

# Index labels are parsed as 'YYYY-MM-DD' strings into datetimes, then
# converted to whole-day offsets from `start` so the model gets numbers.
dates = [datetime.strptime(label, "%Y-%m-%d") for label in data_SP.index]
days_since = [(day - start).days for day in dates]

# Single feature column, reused for fitting and for prediction.
day_column = np.array(days_since)[:, np.newaxis]
close_prices = data_SP['close']

model = LinearRegression(fit_intercept=True)
model.fit(day_column, close_prices)
yfit = model.predict(day_column)

# Fitted values first, then the observed closes, both against the dates.
plt.figure()
plt.scatter(dates, yfit)
plt.scatter(dates, close_prices)
plt.xlabel('date')
plt.ylabel('close')
plt.show()
If percentage change is used, then a nagging NaN needs to be accounted for.
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from datetime import datetime
import pandas as pd
pd.core.common.is_list_like = pd.api.types.is_list_like
from sklearn.linear_model import LinearRegression
import pandas_datareader.data as web
start = datetime(2017, 8, 1)
end = datetime(2018, 7, 30)
data_SP = web.DataReader('JPM', 'iex', start, end)

# Index labels are parsed as 'YYYY-MM-DD' strings into datetimes, then
# converted to whole-day offsets from `start` so the model gets numbers.
dates = [datetime.strptime(label, "%Y-%m-%d") for label in data_SP.index]
days_since = [(day - start).days for day in dates]

# Compute the percentage change once and reuse it for the fit and the plot.
day_column = np.array(days_since)[:, np.newaxis]
pct_returns = data_SP['close'].pct_change(1)

# The first percentage change is NaN (there is no previous row), so the
# first observation is dropped from both the feature and the target.
model = LinearRegression(fit_intercept=True)
model.fit(day_column[1:], pct_returns.iloc[1:])
yfit = model.predict(day_column)

plt.figure()
plt.scatter(dates, yfit)
plt.scatter(dates, pct_returns)
plt.xlabel('date')
# The plotted quantity is the percentage change of the close, not the close
# itself, so the axis is labelled accordingly.
plt.ylabel('pct change of close')
plt.show()
Upvotes: 1
Reputation: 620
What I assume you are asking is to be able to plot your stock data over time. Like I suggested in the comments, your x-axis should be the dates and y-axis should be the closing price.
From there, we will simply plot the graph:
# Index of the frame on the x-axis, 'close' column on the y-axis.
plt.scatter(data_SP.index, data_SP['close'])
I had a few other problems with the imports in your code so in case you get those problems, I will post the full code I used here:
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
# pandas must be imported here: the next statement uses `pd`, which the
# snippet otherwise never defines (NameError).
import pandas as pd
# NOTE(review): presumably a compatibility shim needed by pandas_datareader,
# which is why it is placed before that import -- confirm against the
# installed pandas / pandas_datareader versions.
pd.core.common.is_list_like = pd.api.types.is_list_like
from pandas_datareader import data, wb

start = datetime(2017, 8, 1)
end = datetime(2018, 7, 30)
data_SP = data.DataReader('JPM', 'iex', start, end)

# Index of the frame on the x-axis, 'close' column on the y-axis.
plt.scatter(data_SP.index, data_SP['close'])
# Display the figure when this is run as a script, as the other snippets do.
plt.show()
You will have to reformat the x-axis to be able to see the dates, and you can make any other changes you want from there. If you want to use a regression model, you will have to use numerical data, not datetime data. (I'll post a link for you.)
Linear regression doesn't work on date data, so we need to convert the dates into numerical values. The following code converts a date into a numerical value:
import datetime as dt

# Parse the 'Date' column into datetimes, then replace each value with the
# integer returned by datetime.toordinal so it can be used as a numeric
# feature. `pd` and `data_df` are expected to exist already; this snippet
# does not define them.
data_df['Date'] = pd.to_datetime(data_df['Date']).map(dt.datetime.toordinal)
This is the first answer from the link (Full credit to Chandan)
Upvotes: 1