Reputation: 181
I have successfully trained and tested a Support Vector classifier model to classify each row based on title and abstract, using two user-defined functions (UDFs). The first UDF preprocesses the data and the second builds the model. To create the model I used df1, which was already classified.
I am stuck on how to apply this trained model to a new, unclassified dataframe, say df2. Any suggestions or help would be welcome.
See the user-defined functions for preprocessing and model building below:
def preprocessing(col, h_pct=1, l_pct=1):
    """Clean a Series of raw text for bag-of-words modelling.

    Pipeline: lowercase -> stem -> strip punctuation -> drop stopwords ->
    drop numeric tokens -> drop 1-character words -> optionally remove the
    most and least frequent words across the whole corpus.

    Parameters
    ----------
    col : pd.Series of str
        The text column to clean.
    h_pct : float
        Percentage (0-100) of the highest-frequency corpus words to remove;
        0 skips this step.
    l_pct : float
        Percentage (0-100) of the lowest-frequency corpus words to remove;
        0 skips this step.

    Returns
    -------
    pd.Series
        Cleaned strings, same index as `col`.
    """
    import re
    import pandas as pd
    from nltk.stem import SnowballStemmer
    from nltk.corpus import stopwords

    # Lower case
    lower = col.apply(str.lower)
    # Stemming
    stem = SnowballStemmer('english')
    stemmed = lower.apply(lambda x: ' '.join(stem.stem(word) for word in str(x).split()))
    # Replace punctuation with spaces
    rem_punc = stemmed.apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
    # Remove stopwords; a set gives O(1) membership tests
    stop_words = set(stopwords.words('english'))
    rem_stopwords = rem_punc.apply(lambda x: " ".join(w for w in x.split() if w not in stop_words))
    # Remove purely numeric tokens
    rem_num = rem_stopwords.apply(lambda x: " ".join(w for w in x.split() if not w.isdigit()))
    # BUG FIX: the original re-applied the punctuation regex here, so despite
    # the comment, words of length 1 were never actually removed.
    rem_lngth1 = rem_num.apply(lambda x: " ".join(w for w in x.split() if len(w) > 1))
    if h_pct != 0:
        # Remove the top h_pct% most frequent words in the corpus.
        # Build the token Series once instead of twice (the original
        # recomputed ' '.join(...).split() for the slice bound too).
        tokens = pd.Series(' '.join(rem_lngth1).split())
        high_freq = tokens.value_counts()[:int(tokens.count() * h_pct / 100)]
        # `w not in high_freq` tests membership against the Series index (the words).
        rem_high = rem_lngth1.apply(lambda x: " ".join(w for w in x.split() if w not in high_freq))
    else:
        rem_high = rem_lngth1
    if l_pct != 0:
        # Remove the bottom l_pct% least frequent words in the corpus.
        tokens = pd.Series(' '.join(rem_high).split())
        low_freq = tokens.value_counts()[:-int(tokens.count() * l_pct / 100):-1]
        rem_low = rem_high.apply(lambda x: " ".join(w for w in x.split() if w not in low_freq))
    else:
        rem_low = rem_high
    return rem_low
def prep_fit_pred(df, h_pct, l_pct, model, verbose=False):
    """Preprocess `df`, train `model` on a bag-of-words of title + abstract,
    and report hold-out accuracy.

    Relies on module-level names: `preprocessing`, `bow` (a
    CountVectorizer-like transformer), `train_test_split`, and
    `accuracy_score`.

    NOTE(review): mutates `df` in place by adding the columns
    'new_Abstract', 'concat', and 'concat_processed'.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Title', 'Abstract', and 'Category' columns.
    h_pct, l_pct : float
        Frequency-trim percentages passed to `preprocessing` for the abstract.
    model : estimator
        sklearn-style object with fit/predict.
    verbose : bool
        If True, print the hold-out accuracy.

    Returns
    -------
    (preds, acc, model)
        Predictions on the test split, accuracy in percent, and the
        fitted model.
    """
    df['new_Abstract'] = preprocessing(df['Abstract'], h_pct, l_pct)
    df['concat'] = df['Title'] + '\n' + df['new_Abstract']
    # Not removing high/low-frequency words from the headline (h_pct = l_pct = 0):
    # the headline carries more significance in determining the classification.
    df['concat_processed'] = preprocessing(df['concat'], 0, 0)
    X = df['concat_processed']
    y = df['Category']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42, stratify=y)
    # Fit the vectorizer on training text only, to avoid leaking test vocabulary.
    bow_xtrain = bow.fit_transform(X_train)
    bow_xtest = bow.transform(X_test)
    model.fit(bow_xtrain, y_train)
    preds = model.predict(bow_xtest)
    acc = accuracy_score(y_test, preds) * 100
    if verbose:
        # BUG FIX: `verbose` was accepted but ignored in the original version.
        print(f"Accuracy: {acc:.2f}%")
    return preds, acc, model
Upvotes: 0
Views: 41
Reputation: 770
In order to use a trained model (I assume you are using sklearn), you should preprocess the unlabeled data the same way you did the training data.
Then use bow and model to transform and predict, the same way you did for the test data. To merge bow and model into a single object, you can look at Pipeline.
It would look something like this:
def prep_fit_pred(df, h_pct, l_pct, bow, model, verbose=False):
    """Preprocess an unlabeled DataFrame and predict its categories.

    Uses the already-fitted `bow` vectorizer and `model` estimator;
    nothing is re-trained here.  Adds the intermediate columns
    'new_Abstract', 'concat', and 'concat_processed' to `df` in place.
    """
    # Clean the abstract, then prepend the raw title.
    df['new_Abstract'] = preprocessing(df['Abstract'], h_pct, l_pct)
    df['concat'] = df['Title'] + '\n' + df['new_Abstract']
    # The headline keeps all of its words regardless of frequency (0, 0),
    # because the headline is the strongest signal for the category.
    df['concat_processed'] = preprocessing(df['concat'], 0, 0)
    # Vectorize with the trained bag-of-words model and predict.
    features = bow.transform(df['concat_processed'])
    return model.predict(features)
Upvotes: 1