منى
منى

Reputation: 666

SVM stuck in fitting the model

I am using an SVM, and my program gets stuck in model.fit(X_test, y_test), the call that fits the SVM model. How can I fix this? Here is my code:

# Make Predictions with Naive Bayes On The Iris Dataset
import collections
from csv import reader
from math import sqrt, exp, pi

from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()

from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals.six import StringIO
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import datasets, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Function to split the dataset 
def splitdataset(balance_data, column_count):
    """Separate features from the target and make a 70/30 train/test split.

    Column 0 of ``balance_data.values`` is used as the target; columns 1 up
    to (but not including) ``column_count`` are used as the features.

    Returns the full feature matrix, the full target vector, and then the
    train/test partitions in the order
    ``X_train, X_test, y_train, y_test``.
    """
    values = balance_data.values
    features = values[:, 1:column_count]
    target = values[:, 0]

    # A fixed random_state keeps the partition reproducible between runs.
    partitions = train_test_split(
        features, target, test_size=0.3, random_state=100)
    X_train, X_test, y_train, y_test = partitions

    return features, target, X_train, X_test, y_train, y_test

def importdata(path='dataExtended.txt'):
    """Load the dataset and integer-encode its categorical columns.

    Args:
        path: Location of the comma-separated input file. Defaults to
            ``'dataExtended.txt'`` so existing ``importdata()`` calls keep
            reading the same file.

    Returns:
        A ``(balance_data, column_count)`` tuple: the loaded DataFrame with
        its ``'gold'`` and ``'Program'`` columns replaced by integer
        category codes, and the number of columns in the file.
    """
    balance_data = pd.read_csv(path, sep=',')
    column_count = balance_data.shape[1]

    # Printing the dataset shape
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Number of columns ", column_count)

    # Printing the dataset observations (shown before encoding, so the
    # raw category values are what gets displayed)
    print("Dataset: ", balance_data.head())

    # Replace each category value with its integer code.
    for column in ('gold', 'Program'):
        balance_data[column] = balance_data[column].astype('category').cat.codes

    return balance_data, column_count


# Driver code 
def _confident_predictions(probs, y_true, threshold):
    """Keep only the samples for which exactly one class clears ``threshold``.

    Args:
        probs: Per-sample probability rows; only the first two entries of
            each row are read.
        y_true: True labels, aligned with ``probs``.
        threshold: Minimum probability a class needs to be predicted.

    Returns:
        A ``(kept_true, kept_pred)`` pair of equally long lists. A sample is
        predicted 0 when only the first probability reaches the threshold,
        1 when only the second does, and is dropped otherwise.
    """
    kept_true = []
    kept_pred = []
    for row, label in zip(probs, y_true):
        first_ok = row[0] >= threshold
        second_ok = row[1] >= threshold
        if first_ok and not second_ok:
            kept_pred.append(0)
        elif second_ok and not first_ok:
            kept_pred.append(1)
        else:
            # Neither (or both) classes clear the threshold: not confident.
            continue
        kept_true.append(label)
    return kept_true, kept_pred


def main():
    """Train a linear SVM, then report metrics on the held-out test set.

    Two reports are printed: one for the plain ``predict`` output, and one
    restricted to the test samples whose predicted probability clears a
    0.7 confidence threshold.
    """
    print("hey")

    # Building Phase
    data, column_count = importdata()
    _, _, X_train, X_test, y_train, y_test = splitdataset(data, column_count)

    # Create a svm Classifier (linear kernel). probability=True is needed
    # for predict_proba below, but it slows fit down because it internally
    # runs 5-fold cross-validation.
    model = svm.SVC(kernel='linear', probability=True)

    print('before fitting')

    # Fit on the training split only; fitting on the test split would make
    # every metric below an evaluation on data the model has already seen.
    model.fit(X_train, y_train)
    print('fitting over')
    predicted = model.predict(X_test)
    print('prediction over')
    print(metrics.classification_report(y_test, predicted))
    print('classification over')
    print(metrics.confusion_matrix(y_test, predicted))

    # NOTE(review): the 0/1 labels produced below assume a binary problem
    # whose classes are encoded as 0 and 1 in predict_proba's column order
    # -- confirm against model.classes_.
    threshold = 0.7
    probs = model.predict_proba(X_test)
    y_test_list, y_pred_list = _confident_predictions(probs, y_test, threshold)

    if not y_pred_list:
        # The metric functions cannot score an empty set of predictions.
        print('no predictions met the confidence threshold', threshold)
        return

    print('confusion matrix\n', confusion_matrix(y_test_list, y_pred_list))
    print('classification report\n', classification_report(y_test_list, y_pred_list))
    print('accuracy score', accuracy_score(y_test_list, y_pred_list))

    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_list, y_pred_list))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test_list, y_pred_list))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_list, y_pred_list)))

# Run the driver only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Upvotes: 2

Views: 2242

Answers (1)

mac13k
mac13k

Reputation: 2663

This is most likely because the probability parameter is set to True when you initialize the model. As the docs explain:

probability: bool, default=False

Whether to enable probability estimates. This must be enabled prior to calling fit, will slow down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with predict.

This issue has been discussed on StackOverflow here and here.

Upvotes: 1

Related Questions