How to put the results of a Naive Bayes classifier into a table?

I recently started a Python sentiment-analysis training course. My first task is to process a dataset of IMDB reviews with two different models: lexical analysis with the sentiment-analysis pipeline (from transformers) and a Naive Bayes classifier. I need to create a comparison table with three columns: the review, the sentiment from the first model, and the sentiment from the second model. The first model is straightforward, because the pipeline returns a list that can easily be converted to CSV, but the Naive Bayes classifier has me stuck. I have done all the steps to clean the dataset and to train and run the model, but I have no idea how to show its results as a table; I don't know what to do after the last line of the code below. I've read the documentation but haven't found anything related to this. The example I was given (here) only shows an evaluation of the model's output, whereas I need the output itself. Excuse me if this question is too simple, but I really need help, because the course support is not very good.

The code is below.


import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords

from nltk.corpus import stopwords
# Fetch the NLTK resources this script relies on.
nltk.download('punkt')
nltk.download('stopwords')
# WordNetLemmatizer (used further down) needs the WordNet corpus; without it
# the first lemmatize() call raises LookupError.
# NOTE(review): some NLTK releases also require 'omw-1.4' -- confirm for the
# installed version.
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# English stop words, used below to filter tokens out of the cleaned reviews.
stop_words = stopwords.words('english')

# Contractions expanded by cleaning(). Keys are lowercase and use a straight
# apostrophe because the text is lowercased and its apostrophe variants are
# normalised before the lookup happens.
_CONTRACTIONS = {
    "isn't": "is not",
    "he's": "he is",
    "wasn't": "was not",
    "there's": "there is",
    "couldn't": "could not",
    "won't": "will not",
    "they're": "they are",
    "she's": "she is",
    "wouldn't": "would not",
    "haven't": "have not",
    "that's": "that is",
    "you've": "you have",
    "what's": "what is",
    "weren't": "were not",
    "we're": "we are",
    "hasn't": "has not",
    "you'd": "you would",
    "shouldn't": "should not",
    "let's": "let us",
    "they've": "they have",
    "you'll": "you will",
    "i'm": "i am",
    "we've": "we have",
    "it's": "it is",
    "don't": "do not",
    # The original mapped this to "I did", which is not a reading of "I'd";
    # "would" matches the existing "you'd" entry.
    "i'd": "i would",
}
# Whole-word match so that e.g. "he's" is not picked out of a longer token.
_CONTRACTION_RE = re.compile(
    r"\b(?:%s)\b" % "|".join(re.escape(key) for key in _CONTRACTIONS)
)
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_NUMBER_RE = re.compile(r"\b\d+\b")
_TAG_RE = re.compile(r'<.*?>+')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
# Typographic characters that string.punctuation does not cover:
# right single quote, left/right double quotes, horizontal ellipsis.
_TYPO_RE = re.compile('[\u2019\u201c\u201d\u2026]')
# NOTE(review): the last range (U+24C2..U+1F251) is far wider than emoji and
# also strips e.g. CJK text; kept as in the original -- confirm this is wanted.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def cleaning(text):
    """Normalise a raw review for bag-of-words processing.

    Steps, in order: lowercase, drop URLs, drop standalone numbers, replace
    HTML tags with a space, expand known contractions, strip punctuation,
    replace newlines with a space, strip typographic quotes/ellipsis and
    emoji.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned text. Whitespace is not collapsed, so runs of spaces may
        remain where something was removed.
    """
    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _NUMBER_RE.sub('', text)
    # Tags such as "<br />" separate words; replacing them with nothing would
    # glue the neighbouring words together.
    text = _TAG_RE.sub(' ', text)

    # Contractions must be expanded while the apostrophes still exist, i.e.
    # before punctuation is stripped. Curly (U+2019) and acute (U+00B4)
    # apostrophes are folded into the ASCII one first.
    text = text.replace('\u2019', "'").replace('\u00b4', "'")
    text = _CONTRACTION_RE.sub(lambda match: _CONTRACTIONS[match.group(0)], text)

    text = _PUNCT_RE.sub('', text)
    # A newline is a word separator too, so it becomes a space.
    text = text.replace('\n', ' ')
    text = _TYPO_RE.sub('', text)
    text = _EMOJI_RE.sub('', text)
    return text


# Read the first five rows of the IMDB dataset and expose the review column
# under the name 'text'.
df = pd.read_csv('IMDB Dataset.csv', sep=',', nrows=5)
df = df.rename(columns={'review': 'text'})

# Cleaned review text, one entry per row of df.
dt = df['text'].map(cleaning)

# data = []
# for i in dt:
#     data.append(i)
#
# from transformers import pipeline

# sentiment_pipeline = pipeline("sentiment-analysis")
# sent = sentiment_pipeline(data)
# sent = pd.DataFrame(sent)
# sent.drop('score', axis=1, inplace=True)
# sent.rename(columns={'label':'sentiment'}, inplace = True)
# tab = df
# tab['sentiment'] = sent['sentiment']

# Turn the cleaned Series into a DataFrame and attach the labels from df.
dt = dt.to_frame()
dt['sentiment'] = df['sentiment']


def _drop_stop_words(sentence):
    """Return sentence without the whitespace-separated tokens found in stop_words."""
    kept = [token for token in sentence.split() if token not in stop_words]
    return ' '.join(kept)


dt['no_sw'] = dt['text'].apply(_drop_stop_words)

from collections import Counter

# Token frequencies over all stop-word-free reviews.
cnt = Counter()
for text in dt["no_sw"].values:
    cnt.update(text.split())

# The ten most frequent tokens; they are stripped from every review below.
FREQWORDS = {word for word, _ in cnt.most_common(10)}


def remove_freqwords(text):
    """Return text with every token that appears in FREQWORDS removed."""
    kept = (word for word in str(text).split() if word not in FREQWORDS)
    return " ".join(kept)


dt["wo_stopfreq"] = dt["no_sw"].apply(remove_freqwords)

wordnet_lem = WordNetLemmatizer()


def _lemmatize_words(text):
    """Lemmatize each whitespace-separated token of text and rejoin with spaces."""
    return ' '.join(wordnet_lem.lemmatize(word) for word in text.split())


# lemmatize() takes a single word. Passing it a whole review makes it look the
# entire string up as one word, which effectively leaves the review unchanged,
# so the reviews are lemmatized token by token instead.
dt['wo_stopfreq_lem'] = dt['wo_stopfreq'].apply(_lemmatize_words)

# Keep only the label and the fully preprocessed text for the classifier.
nb = dt.drop(columns=['text', 'no_sw', 'wo_stopfreq'])
nb.columns = ['sentiment', 'review']
# Encode the label: "negative" -> 0, anything else -> 1.
nb['sentiment'] = [0 if label == "negative" else 1 for label in nb['sentiment']]

tokenized_review = nb['review'].apply(lambda x: x.split())

from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Bag-of-words counts over alphanumeric unigrams of the preprocessed reviews.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    tokenizer=token.tokenize,
    stop_words='english',
    ngram_range=(1, 1),
)
text_counts = cv.fit_transform(nb['review'])

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X, y = text_counts, nb['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=30,
)

from sklearn.naive_bayes import ComplementNB

# Fit a Complement Naive Bayes classifier on the training split.
CNB = ComplementNB()
CNB.fit(X_train, y_train)


Upvotes: -1

Views: 54

Answers (1)

Gurdev Singh
Gurdev Singh

Reputation: 2165

At this stage your machine learning model, based on the Naive Bayes algorithm, is ready to perform sentiment analysis on the input given to it.

from sklearn import metrics

# predicted holds the model's sentiment prediction for each row of X_test.
predicted = CNB.predict(X_test)
# Fraction of test rows whose prediction equals the true label; arguments are
# passed in scikit-learn's documented (y_true, y_pred) order.
accuracy_score = metrics.accuracy_score(y_test, predicted)

Upvotes: 0

Related Questions