Why am I getting an error stating X has 538 features, but SVC is expecting 1291 features as input

Question

I am getting an error message stating: X has 538 features, but SVC is expecting 1291 features as input. I am new to python, kindly help me out. I have already referred to the answer to a similar question suggesting the use of from sklearn import * but it didn't work. This is the error message that I'm getting:

this is the code:

import matplotlib.pyplot as plt
import pandas as pd
import nltk
import numpy as np
from sklearn import svm


# from sklearn.feature_extraction.text import TfidfVectorizer
data=pd.read_excel(r"C:\Users\amgup\Downloads\classification\Model_Dataset.xlsx", usecols=['Category','Title','Description'])

# Combining title and description into one column
data['combined']= (data['Title']+' '+data['Description']).str.lower()
data=data.drop(columns=['Title','Description'])
data['Category'].isnull().any()
data = data.dropna(axis = 0, how ='any')
print(data)

# shuffling the order of the rows. sample() returns random rows from the data and frac specifies what fraction has to be returned
# frac=1 means the entire data in a random order
data=data.sample(frac=1)
print(data)


y=list(data['combined'])

#lemmatization
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tokens=[]
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
for x in y:
    x = str(x)
    x = x.lower()
    x = x.split()
    x = [lemmatizer.lemmatize(word) for word in x if word not in stop_words]
    x = ' '.join(x)
    tokens.append(x)

#stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed=[]
for x in tokens:
    x = str(x)
    x = x.lower()
    x = x.split()
    x = [ps.stem(word) for word in x if word not in stop_words]
    x = ' '.join(x)
    stemmed.append(x)
data['combined']=stemmed
print("Data after lemmatization and stemming:
{}".format(data))


#Splitting the data into training and testing and testing data in 80:20 ratio
from sklearn.model_selection import train_test_split
x=data['combined']# since combined is the feature we take it as x
y = data['Category']# since category is the output or label we'll take it as y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=42)
print("
Shape of x_train:
{}".format(x_train.shape))
print("
Shape of x_test:
{}".format(x_test.shape))
print("
Shape of y_train:
{}".format(y_train.shape))
print("
Shape of y_test:
{}".format(y_test.shape))

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train.values.astype('U'))
x_test = vectorizer.fit_transform(x_test.values.astype('U'))
print("
x_train:
 {} 

x_test:
 {} ".format(x_train,x_test))

# Using SVC to predict the data
clf = svm.SVC(kernel='linear', C = 1.0)
clf.fit(x_train,y_train)
y_predict = clf.predict(x_test)
print(classification_report(y_test, y_predict))

Why am I getting an error stating X has 538 features, but SVC is expecting 1291 features as input

Answers (1)

Related Questions