Reputation: 3895
I want to build a regression model with scikit-learn. My features are a mix of categorical and numerical columns. This is how I handled that:
# Pull the six input columns and the "shares" target out of the DataFrame,
# converting each straight to a NumPy array.
feature_columns = ["text", "title_len", "lead_len", "exclamation_question", "number_of_ent", "punct_count"]
features = df[feature_columns].to_numpy()
results = df["shares"].to_numpy()
print("Shape of Features:", features.shape)  # Shape of Features: (14706, 6)
print("Shape of Result:", results.shape)  # Shape of Result: (14706,)
# Build the column transformer: TF-IDF over column 0 (the "text" column),
# with every remaining column passed through unchanged.
text_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 4),
    vocabulary=vocabulary,
    lowercase=True,
)
transformerVectoriser = ColumnTransformer(
    transformers=[('text_vocab', text_vectorizer, 0)],
    remainder='passthrough',
)
# Hold out 25% of the samples as a test set; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    features, results, test_size=0.25, random_state=0
)
for label, split in (
    ("X Train Shape", x_train),  # X Train Shape (11029, 6)
    ("Y Train Shape", y_train),  # Y Train Shape (11029,)
    ("X Test Shape", x_test),  # X Test Shape (3677, 6)
    ("Y Test Shape", y_test),  # Y Test Shape (3677,)
):
    print(label, split.shape)

# Fit the transformer on the training split only, then apply it to the test split.
x_train = transformerVectoriser.fit_transform(x_train)
x_test = transformerVectoriser.transform(x_test)
for label, split in (
    ("X Train Vectorized Shape", x_train),  # X Train Vectorized Shape (11029, 1091)
    ("X Test Vectorized Shape", x_test),  # X Test Vectorized Shape (3677, 1091)
):
    print(label, split.shape)
This is how I created the models:
# Candidate regressors: RBF-kernel SVR at three values of the C parameter.
regression_models = [["SVR C1", SVR(kernel='rbf', gamma='scale', C=1.0)],
["SVR C2", SVR(kernel='rbf', gamma='scale', C=2.0)],
["SVR C5", SVR(kernel='rbf', gamma='scale', C=5.0)]]
# Fit each candidate on the training split and evaluate it on the test split.
# NOTE: the loop body's indentation appears to have been lost when this
# snippet was pasted; the lines below belong inside the for loop.
for regressor in regression_models:
name = regressor[0]
regressor = regressor[1]
print("Model Name:", name)
model = regressor.fit(x_train, y_train)
y_pred = model.predict(x_test)
# score() has the signature score(X, y): it calls predict(X) itself and then
# compares that prediction with y. Passing the 1-D y_test as X is what raises
# "Expected 2D array, got 1D array instead"; the call that matches the
# signature is model.score(x_test, y_test).
score = model.score(y_test, y_pred) # ERROR
print(score)
The error that I get:
ValueError Traceback (most recent call last)
<ipython-input-1351-c5fbe26b2474> in <module>
22 print(y_test)
23 print(y_test.shape)
---> 24 score = model.score(y_test, y_pred)
25 print(score)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/base.py in score(self, X, y, sample_weight)
551
552 from .metrics import r2_score
--> 553 y_pred = self.predict(X)
554 return r2_score(y, y_pred, sample_weight=sample_weight)
555
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/linear_model/_base.py in predict(self, X)
236 Returns predicted values.
237 """
--> 238 return self._decision_function(X)
239
240 _preprocess_data = staticmethod(_preprocess_data)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/linear_model/_base.py in _decision_function(self, X)
218 check_is_fitted(self)
219
--> 220 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
221 return safe_sparse_dot(X, self.coef_.T,
222 dense_output=True) + self.intercept_
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
692 # If input is 1D raise error
693 if array.ndim == 1:
--> 694 raise ValueError(
695 "Expected 2D array, got 1D array instead:\narray={}.\n"
696 "Reshape your data either using array.reshape(-1, 1) if "
ValueError: Expected 2D array, got 1D array instead:
array=[13. 8. 71. ... 43. 61. 55.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
What am I doing wrong?
If I print:
# Show the predictions and then the true targets, each followed by its shape.
for values in (y_pred, y_test):
    print(values)
    print(values.shape)
I'm getting this:
[ 81.54398324 43.34783895 111.73805915 ... 75.27910881 89.46342907
78.93812588]
(4235,)
[13. 8. 71. ... 43. 61. 55.]
(4235,)
Upvotes: 0
Views: 503
Reputation: 1176
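According to the documentation, `score` takes `X` and `y` as its inputs (the feature matrix and the true targets), not the true targets and the predictions.
It calls `predict(X)` internally, so the call should be changed to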
# score(X, y) runs predict(X) internally and returns the R^2 of that prediction against y.
score = model.score(x_test, y_test)
Alternatively, since you already have `y_pred`, you can compute the same score directly from the predictions:
# Same R^2 metric that score() uses, computed from predictions already made.
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)
Upvotes: 1