Train the model first and Test multiple times

Question

I am trying to use a Python NLP script from my Qt GUI-based C++ application. The application runs the script through the command line:

// Directory used as the child process's working directory (where nlp.py is expected to live).
QString path = "D:/DS Project/Treegramming";
// Program to launch.
QString  command("py");
// Arguments: the script name followed by the text to classify.
QStringList params = QStringList() << "nlp.py";
params << text;
// NOTE(review): allocated with new, no parent, and never deleted in this snippet — confirm who owns it.
QProcess *process = new QProcess();
process->setWorkingDirectory(path);
process->start(command, params);
// Blocks until the script exits or the timeout elapses.
// NOTE(review): called with no argument, so Qt's default timeout applies (30 s per the
// QProcess docs — confirm); verify that is long enough for the script's run time.
process->waitForFinished();
// Collects what the script wrote to the process's current read channel (presumably stdout — confirm).
QString result = process->readAll();

The above works perfectly, but it takes about 40-50 seconds to execute, because it first trains the model and then tests it. I want to train the model once and then test it multiple times, as we do in a Jupyter Notebook. For that, I made a separate function for testing and tried to call it from the command line:

PS D:\DS Project\Treegramming> py nlp.py "test('it was amazing')"

But this again executes the whole script first and only then calls the function. Is there anything I can do to solve this?

python script:

# -*- coding: utf-8 -*-
"""
Created on Fri Dec  6 16:18:01 2019

@author: Muhammad Ahmed
"""

import nltk
import sys
import random
import re,string
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer

# Raw tweet strings read from NLTK's twitter_samples corpus files.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

# Tokenized form of the positive tweets.
# NOTE(review): none of these four names is read again in the visible script —
# confirm nothing else depends on them before removing.
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

def lemmatize_sentence(tokens):
    """Return the lemma of every token in ``tokens``, in order.

    Each token is tagged with ``pos_tag``; the tag picks the part-of-speech
    code handed to the lemmatizer: ``'n'`` for tags starting with ``NN``,
    ``'v'`` for tags starting with ``VB``, and ``'a'`` for everything else.
    """
    def pos_code(tag):
        # Map a tagger tag onto the code expected by the lemmatizer.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(word, pos_code(tag)) for word, tag in pos_tag(tokens)]

def remove_noise(tokens, stop_words=()):
    """Clean and normalize the tokens of one tweet.

    For every token, in order:
      1. strip URLs and ``@mention`` handles;
      2. lemmatize it, using ``'n'`` for ``NN*`` tags, ``'v'`` for ``VB*``
         tags and ``'a'`` otherwise (the tag is computed on the raw token);
      3. keep its lowercase form unless it is empty, is a substring of
         ``string.punctuation``, or its lowercase form is in ``stop_words``.

    Args:
        tokens: Token strings to clean; passed to ``pos_tag``.
        stop_words: Container of lowercase words to drop. Defaults to an
            empty tuple, i.e. no stop-word filtering.

    Returns:
        A list of cleaned, lowercased tokens.
    """
    # Built once per call instead of once per token; this also matches how
    # lemmatize_sentence() creates its lemmatizer.
    lemmatizer = WordNetLemmatizer()
    sentence = []
    for token, tag in pos_tag(tokens):
        # Remove links and @-handles before lemmatizing.
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # `in string.punctuation` is a substring test, kept as in the original.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            sentence.append(lowered)
    return sentence

def get_all_words(tokens_list):
    """Lazily yield every token from every token list in ``tokens_list``.

    Tokens come out in their original order, one inner list after another.
    """
    for sentence in tokens_list:
        yield from sentence

def get_tweets_for_model(tokens_list):
    """Lazily yield one feature dict per tweet.

    Each dict maps every token of the tweet to ``True``; duplicate tokens
    collapse into a single key.
    """
    for tweet_tokens in tokens_list:
        yield {token: True for token in tweet_tokens}

# --- Training section -------------------------------------------------------
# Everything below is top-level code, so it runs on every invocation of this
# script, before any function call requested on the command line.

# English stop words passed to remove_noise() below.
stop_words = stopwords.words('english')

# Tokenized positive / negative tweets from the twitter_samples corpus.
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

# Clean every tweet (strip links/handles, lemmatize, drop stop words).
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

# Flat generators over all cleaned tokens of each class.
all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )

# Word frequency distributions per class.
# NOTE(review): neither distribution is read again in the visible script.
freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )

# Generators of {token: True} feature dicts, one per tweet.
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Pair every feature dict with its class label.
pos_dataset = [(tweets,"Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets,"Negative") for tweets in negative_tokens_for_model]

# Mix both classes before splitting; no seed is set, so the split differs per run.
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)

# First 7000 shuffled examples train the model; the rest are held out.
# NOTE(review): test_data is not used again in the visible script.
train_data = dataset[:7000]
test_data = dataset[7000:]

# Module-level classifier used by test() below.
classifier = NaiveBayesClassifier.train(train_data)

def test(custom_tweet):
    """Classify ``custom_tweet`` and report the predicted label.

    The tweet is tokenized with ``word_tokenize``, cleaned with
    ``remove_noise`` (no stop words are passed), and classified by the
    module-level ``classifier``. The resulting label is printed and also
    written to ``result.txt`` in the current working directory, replacing
    any previous content.

    Args:
        custom_tweet: The text to classify.
    """
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    res = classifier.classify({token: True for token in custom_tokens})
    print(res)
    # The context manager closes the file even if the write raises.
    with open("result.txt", "w") as f:
        f.write(res)

# Evaluates the first command-line argument as a Python expression,
# e.g. "test('it was amazing')".
# NOTE(review): eval() on a command-line argument runs arbitrary code supplied
# by the caller — acceptable only when the caller is trusted; consider passing
# the tweet text as a plain argument and calling test() directly.
eval( sys.argv[1] );

Anwarvic · Accepted Answer

You need to create two Python scripts:

1. The first trains and saves the NaiveBayesClassifier.
2. The second loads the saved model and tests it.

To avoid repeating code, I will put the helper functions in a separate script called utils.py, which should look like this:

import re
import string
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_sentence(tokens):
    """Lemmatize ``tokens`` and return the lemmas as a list, in order.

    The part-of-speech code given to the lemmatizer is chosen from each
    token's ``pos_tag`` tag: ``'n'`` when the tag starts with ``NN``,
    ``'v'`` when it starts with ``VB``, otherwise ``'a'``.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in pos_tag(tokens):
        pos = 'n' if tag.startswith('NN') else 'v' if tag.startswith('VB') else 'a'
        lemmas.append(lemmatizer.lemmatize(word, pos))
    return lemmas

def remove_noise(tokens, stop_words=()):
    """Clean and normalize the tokens of one tweet.

    For every token, in order:
      1. strip URLs and ``@mention`` handles;
      2. lemmatize it, using ``'n'`` for ``NN*`` tags, ``'v'`` for ``VB*``
         tags and ``'a'`` otherwise (the tag is computed on the raw token);
      3. keep its lowercase form unless it is empty, is a substring of
         ``string.punctuation``, or its lowercase form is in ``stop_words``.

    Args:
        tokens: Token strings to clean; passed to ``pos_tag``.
        stop_words: Container of lowercase words to drop. Defaults to an
            empty tuple, i.e. no stop-word filtering.

    Returns:
        A list of cleaned, lowercased tokens.
    """
    # Built once per call instead of once per token; this also matches how
    # lemmatize_sentence() creates its lemmatizer.
    lemmatizer = WordNetLemmatizer()
    sentence = []
    for token, tag in pos_tag(tokens):
        # Remove links and @-handles before lemmatizing.
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # `in string.punctuation` is a substring test, kept as in the original.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            sentence.append(lowered)
    return sentence

def get_all_words(tokens_list):
    """Generate all tokens of all token lists, flattened in order."""
    for token_group in tokens_list:
        yield from token_group

def get_tweets_for_model(tokens_list):
    """Generate a ``{token: True}`` feature dict for each tweet.

    Repeated tokens within a tweet produce a single key.
    """
    for tweet_tokens in tokens_list:
        features = dict.fromkeys(tweet_tokens, True)
        yield features

Then let's create the training script. I will call it train.py, and it should look like this:

import random
import pickle
from utils import *
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
from nltk.corpus import twitter_samples


# Raw tweet strings and one tokenized copy of the positive tweets.
# NOTE(review): these four names are not read again in this script — confirm
# nothing else depends on them before removing.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

# remove_noise() tests every token against stop_words; a frozenset makes that
# lookup O(1) instead of a linear scan of the word list.
stop_words = frozenset(stopwords.words('english'))

# Tokenized positive / negative tweets from the twitter_samples corpus.
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Clean every tweet (strip links/handles, lemmatize, drop stop words).
positive_cleaned_tokens_list = [
    remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
]

# Flat generators over all cleaned tokens of each class.
all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)

# Word frequency distributions per class.
# NOTE(review): neither distribution is read again in this script.
freq_dis_pos = FreqDist(all_pos_words)
freq_dis_neg = FreqDist(all_neg_words)

# Generators of {token: True} feature dicts, one per tweet.
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Pair every feature dict with its class label.
pos_dataset = [(tweets, "Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets, "Negative") for tweets in negative_tokens_for_model]

# Mix both classes before splitting; no seed is set, so the split differs per run.
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)

# First 7000 shuffled examples train the model; the rest are held out.
# NOTE(review): test_data is not used again in this script.
train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

#### ADD THESE TO SAVE THE CLASSIFIER ####
# Persist the trained classifier so another script can load it without retraining.
with open("model.pickle", "wb") as fout:
    pickle.dump(classifier, fout)

Finally, the test script, test.py, should look like this:

import sys
import pickle
from nltk import classify
from nltk.tokenize import word_tokenize

from utils import remove_noise

#### ADD THESE TO LOAD THE CLASSIFIER ####
# Restores the classifier object stored in model.pickle, so this script does
# no training of its own.
# NOTE(review): pickle.load() can execute arbitrary code embedded in the file —
# only load a model.pickle that you produced yourself.
with open('model.pickle', 'rb') as fin:
    classifier = pickle.load(fin)


def test(custom_tweet):
    """Classify ``custom_tweet`` and report the predicted label.

    The tweet is tokenized with ``word_tokenize``, cleaned with
    ``remove_noise`` (no stop words are passed), and classified by the
    module-level ``classifier``. The resulting label is printed and also
    written to ``result.txt`` in the current working directory, replacing
    any previous content.

    Args:
        custom_tweet: The text to classify.
    """
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    res = classifier.classify({token: True for token in custom_tokens})
    print(res)
    # The context manager closes the file even if the write raises.
    with open("result.txt", "w") as f:
        f.write(res)

# Evaluates the first command-line argument as a Python expression,
# e.g. "test('it was amazing')".
# NOTE(review): eval() on a command-line argument runs arbitrary code supplied
# by the caller — acceptable only when the caller is trusted; consider passing
# the tweet text as a plain argument and calling test() directly.
eval( sys.argv[1] );

Now, run train.py once to train the Naive Bayes classifier; this creates a new file called model.pickle that holds the trained classifier. Then run test.py from your C++ application on your custom tweet. test.py loads the trained model from model.pickle and uses it to classify the given tweet, so no retraining happens on each call.

Train the model first and Test multiple times

Answers (1)

Related Questions