Reputation: 52323
sklearn classifiers accept pandas' `Timestamp` (= `datetime64[ns]`) as a column in X, as long as all of X's columns are of that type. But when there are both `Timestamp` and `float` columns, sklearn refuses to work with `Timestamp`.
Is there any workaround besides converting `Timestamp` into `int` using `astype(int)`? (I still need the original column to access `dt.year` etc., so ideally I would prefer not to create a duplicate column just to provide a feature to sklearn.)
import pandas as pd
from sklearn.linear_model import LinearRegression
# Minimal reproduction: a 100-row frame with one datetime column ('date')
# and two constant numeric columns ('a' as a feature, 'y' as the target).
test = pd.date_range('20000101', periods = 100)
test_df = pd.DataFrame({'date': test})
test_df['a'] = 1
test_df['y'] = 1
lr = LinearRegression()
# Every feature column is datetime: fit succeeds.
lr.fit(test_df[['date']], test_df['y']) # works fine
lr.fit(test_df[['date', 'date']], test_df['y']) # works fine
# Datetime column mixed with a numeric column: fit fails.
lr.fit(test_df[['date', 'a']], test_df['y']) # raises TypeError: float() argument must be a string or a number, not 'Timestamp'
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-90-0605fa5bcdfa> in <module>()
----> 1 lr.fit(test_df[['date', 'a']], test_df['y'])
/home/shoya/.pyenv/versions/3.5.0/envs/study-env/lib/python3.5/site-packages/sklearn/linear_model/base.py in fit(self, X, y, sample_weight)
434 n_jobs_ = self.n_jobs
435 X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
--> 436 y_numeric=True, multi_output=True)
437
438 if ((sample_weight is not None) and np.atleast_1d(
/home/shoya/.pyenv/versions/3.5.0/envs/study-env/lib/python3.5/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
521 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
522 ensure_2d, allow_nd, ensure_min_samples,
--> 523 ensure_min_features, warn_on_dtype, estimator)
524 if multi_output:
525 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
/home/shoya/.pyenv/versions/3.5.0/envs/study-env/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
402 # make sure we acually converted to numeric:
403 if dtype_numeric and array.dtype.kind == "O":
--> 404 array = array.astype(np.float64)
405 if not allow_nd and array.ndim >= 3:
406 raise ValueError("Found array with dim %d. %s expected <= 2."
TypeError: float() argument must be a string or a number, not 'Timestamp'
Apparently, when the dtypes are mixed, and the ndarray therefore has dtype `object`, sklearn attempts to convert the values to `float`, which fails with `Timestamp`. But when the dtypes are all `datetime64[ns]`, sklearn just leaves things unchanged.
Upvotes: 8
Views: 3754
Reputation: 4253
You want to fit on X and y, where X holds the features (two or more) and y is the target. Use your DatetimeIndex as a time series, not as a feature. In my example, I select earthquakes with mag > 7 and calculate the elapsed days between consecutive quakes. The elapsed days, depth, latitude and longitude are then fed to the linear regression model.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# NOTE(review): `df`, `np` and `plt` are not defined in this snippet. It
# presumably expects an earthquake DataFrame with a datetime index and the
# columns mag/latitude/longitude/depth, plus `import numpy as np` and
# `import matplotlib.pyplot as plt` -- confirm against the caller.

# Keep only the strong quakes, in chronological (ascending index) order.
events = df[df.mag > 7].sort_index()

# Work on a positional copy of the timestamps so that duplicate index labels
# cannot cause label-based assignment to touch more than one row.
stamps = events.index.to_series().reset_index(drop=True)

# Timestamp of the preceding quake; the first row has no predecessor and
# keeps its own timestamp.
events['previous'] = stamps.shift().fillna(stamps).to_numpy()
# Gap to the preceding quake (NaT for the first row).
events['time_delta'] = stamps.diff().to_numpy()
# Whole days since the preceding quake; the first row counts as 0.
events['elapsed_days'] = events['time_delta'].dt.days.fillna(0)

X = events[['latitude', 'longitude', 'elapsed_days', 'depth']]
y = np.nan_to_num(events['mag'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

lr = LinearRegression()
# Fit on the training split only, so the prediction on X_test below is made
# on rows the model has not seen.
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Predicted magnitude against elapsed days for the held-out rows.
fig, ax = plt.subplots()
ax.plot(X_test['elapsed_days'], y_pred)
plt.title('Magnitude Prediction')
plt.show()

# Magnitude and elapsed days over time, on twin y-axes.
fig, ax = plt.subplots()
ax.plot(events.index, np.nan_to_num(events['mag']))
plt.xticks(rotation=90)
plt.legend(['Magnitude'])
twin_ax = ax.twinx()
twin_ax.plot(events.index, events['elapsed_days'], color='red')
plt.legend(['Elapsed Days'], loc=1)
plt.show()
Upvotes: 0
Reputation: 1028
You can convert the column to a proper integer or float:
# Overwrite 'date' with its integer representation (nanoseconds since the
# Unix epoch for datetime64[ns]). 'int64' is spelled out because plain `int`
# maps to the platform C long, which is 32-bit on Windows with NumPy < 2 and
# cannot hold these values. After this the column no longer supports `.dt`.
test_df['date'] = test_df['date'].astype('int64')
Upvotes: 1