Reputation: 666
I am using an SVM, and my program gets stuck at model.fit(X_test, y_test), the call that fits the SVM model. The call never seems to return. How can I fix this? Here is my code:
# Train and evaluate a linear SVM classifier on the data in dataExtended.txt
import collections
from csv import reader
from math import sqrt, exp, pi
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals.six import StringIO
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import datasets, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Function to split the dataset
def splitdataset(balance_data, column_count):
    """Separate features from the target and make a train/test split.

    Column 0 of ``balance_data`` is used as the target; columns 1 up to
    (but not including) ``column_count`` are used as the features.

    Returns the tuple
    ``(X, Y, X_train, X_test, y_train, y_test)`` where 30% of the rows are
    held out for testing, using a fixed random state of 100.
    """
    # Column 0 holds the target, the remaining columns hold the features.
    features = balance_data.values[:, 1:column_count]
    target = balance_data.values[:, 0]

    # 70/30 split with a fixed seed so the partition is reproducible.
    partitions = train_test_split(
        features,
        target,
        test_size=0.3,
        random_state=100,
    )
    return (features, target, *partitions)
def importdata(path='dataExtended.txt'):
    """Load the comma-separated dataset and integer-encode its text columns.

    Parameters
    ----------
    path : str, optional
        Location of the comma-separated input file. Defaults to
        ``'dataExtended.txt'`` so that existing ``importdata()`` calls keep
        reading the same file as before.

    Returns
    -------
    tuple
        ``(balance_data, column_count)``: the loaded DataFrame, with its
        ``'gold'`` and ``'Program'`` columns replaced by their integer
        category codes, and the number of columns in the file.

    The dataset length, shape, column count and first rows are printed
    before the two columns are encoded.
    """
    balance_data = pd.read_csv(path, sep=',')
    _, column_count = balance_data.shape

    # Printing the dataset shape
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Number of columns ", column_count)

    # Printing the dataset observations
    print("Dataset: ", balance_data.head())

    # Replace each categorical column by its integer category codes.
    for column in ('gold', 'Program'):
        balance_data[column] = balance_data[column].astype('category').cat.codes

    return balance_data, column_count
# Driver code
def _confident_predictions(probs, y_true, threshold):
    """Keep only the samples whose predicted probability clears ``threshold``.

    For each probability row, the sample is labelled 0 when column 0 is at
    least ``threshold`` and column 1 is below it, labelled 1 in the mirrored
    case, and dropped otherwise.

    Returns ``(kept_true, kept_pred)``: two parallel lists holding the true
    labels and the thresholded predictions of the retained samples.
    """
    kept_true = []
    kept_pred = []
    for row, actual in zip(probs, y_true):
        p_zero, p_one = row[0], row[1]
        if p_zero >= threshold and p_one < threshold:
            label = 0
        elif p_zero < threshold and p_one >= threshold:
            label = 1
        else:
            # Neither column is decisively above the threshold: discard.
            continue
        kept_true.append(actual)
        kept_pred.append(label)
    return kept_true, kept_pred


def main():
    """Train a linear SVM, then report metrics on the held-out test split.

    The data is loaded with ``importdata`` and split with ``splitdataset``.
    The classifier is fitted on the training split and evaluated on the test
    split, first with its plain predictions and then again using only the
    samples whose predicted probability reaches a 0.7 threshold.
    """
    print("hey")
    # Building Phase
    data, column_count = importdata()
    _, _, X_train, X_test, y_train, y_test = splitdataset(data, column_count)

    # Create a svm Classifier.
    # probability=True is needed for predict_proba below; it makes fit()
    # slower because the estimates come from an internal cross-validation.
    model = svm.SVC(kernel='linear', probability=True)  # Linear Kernel
    print('before fitting')
    # Fit on the training split so the test split stays unseen for scoring.
    model.fit(X_train, y_train)
    print('fitting over')
    predicted = model.predict(X_test)
    print('prediction over')
    print(metrics.classification_report(y_test, predicted))
    print('classification over')
    print(metrics.confusion_matrix(y_test, predicted))

    # Second evaluation: only keep predictions the model is confident about.
    # NOTE(review): the 0/1 labels assume a binary target encoded as 0 and 1
    # - confirm against the data.
    probs = model.predict_proba(X_test)
    threshold = 0.7
    y_test_list, y_pred_list = _confident_predictions(probs, y_test, threshold)

    if not y_test_list:
        # The metric functions cannot score empty inputs.
        print('no prediction reached the probability threshold', threshold)
        return

    print('confusion matrix\n', confusion_matrix(y_test_list, y_pred_list))
    print('classification report\n', classification_report(y_test_list, y_pred_list))
    print('accuracy score', accuracy_score(y_test_list, y_pred_list))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_list, y_pred_list))
    mse = metrics.mean_squared_error(y_test_list, y_pred_list)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))


if __name__ == "__main__":
    main()
Upvotes: 2
Views: 2242
Reputation: 2663
This is most likely because the probability parameter is set to True when you initialize the model. As the docs explain:
probability: bool, default=False
Whether to enable probability estimates. This must be enabled prior to calling fit, will slow down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with predict.
This issue has been discussed on StackOverflow here and here.
Upvotes: 1