Reputation: 117
I'm new to machine learning and I'm writing my "hello world" using sklearn and nltk, but I have a problem with the prediction result: it always returns the same single value.
I am following a tutorial that contains errors, and I have been modifying it little by little until it finally produced a result, but the result is not the expected one.
Here is the tutorial link: https://towardsdatascience.com/text-classification-using-k-nearest-neighbors-46fa8a77acc5
Here is my current code (it always shows "Conditions" as the final result):
# Imports and one-time NLTK setup for the classifier script below.
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
# Download the NLTK data packages named below (each call hits the network
# the first time it runs on a machine).
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Information-content table built from the Genesis corpus.
# NOTE(review): `genesis_ic` is not referenced anywhere else in this listing.
genesis_ic = wn.ic(genesis, False, 0.0)
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score
from nltk.stem.wordnet import WordNetLemmatizer
class KNN_NLC_Classifer():
    """Nearest-neighbour text classifier based on WordNet synset similarity.

    Each document is mapped to a list of WordNet synsets; two documents are
    compared by averaging, in both directions, the best pairwise synset
    similarity.  ``predict`` returns the label of the single most similar
    training document.

    Args:
        k: Stored on the instance; only the 1-nearest-neighbour rule is
            implemented, so the value is currently not used by ``predict``.
        distance_type: ``'path'`` selects ``Synset.path_similarity``; any
            other value selects ``Synset.wup_similarity``.
    """

    def __init__(self, k=1, distance_type='path'):
        self.k = k
        self.distance_type = distance_type

    def fit(self, x_train, y_train):
        """Store the training documents and their labels (no model is built)."""
        self.x_train = x_train
        self.y_train = y_train

    def predict(self, x_test):
        """Return, for each test document, the label of its closest training document.

        Args:
            x_test: iterable of documents (anything ``str()`` can render).

        Returns:
            list with one entry of ``y_train`` per test document.  When no
            training document scores above 0, the first training label is
            returned for that document.
        """
        self.x_test = x_test
        # Tokenising and POS-tagging are the expensive steps, so convert every
        # training document exactly once instead of once per test document.
        train_synsets = [self.doc_to_synsets(doc) for doc in self.x_train]
        y_predict = []
        for test_doc in x_test:
            test_synsets = self.doc_to_synsets(test_doc)
            max_sim = 0
            max_index = 0
            for j, candidate in enumerate(train_synsets):
                temp = self._synsets_similarity(test_synsets, candidate)
                # Strict comparison: ties keep the earliest training document.
                if temp > max_sim:
                    max_sim = temp
                    max_index = j
            y_predict.append(self.y_train[max_index])
        return y_predict

    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets.

        Returns ``None`` when the tag's first letter has no WordNet equivalent
        (or the tag is empty).
        """
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        return tag_dict.get(tag[:1])

    def doc_to_synsets(self, doc):
        """Return the list of synsets found in a document.

        Tokenizes and tags the words in ``doc``, then takes the first synset
        for each word/tag combination.  Words without a synset for that
        combination are skipped.

        Args:
            doc: document to convert; it is passed through ``str()`` first.

        Returns:
            list of synsets
        """
        tokens = word_tokenize(str(doc) + ' ')
        # A lone token is tagged with a trailing space appended, exactly as
        # the original tutorial code does.
        if len(tokens) == 1:
            tags = nltk.pos_tag([tokens[0] + ' '])
        else:
            tags = nltk.pos_tag(tokens)
        synsets = []
        for token, (_, pos) in zip(tokens, tags):
            candidates = wn.synsets(token, self.convert_tag(pos))
            if candidates:
                synsets.append(candidates[0])
        return synsets

    def similarity_score(self, s1, s2, distance_type='path'):
        """Calculate the normalized similarity score of s1 onto s2.

        For each synset in s1, finds the largest similarity to any synset in
        s2.  Synsets of s1 with no positive similarity are ignored; the
        remaining best scores are averaged.

        Args:
            s1, s2: lists of synsets from doc_to_synsets
            distance_type: ``'path'`` for path similarity, anything else for
                Wu-Palmer similarity.

        Returns:
            mean of the best scores, or ``0.0`` when no synset of s1 has a
            positive similarity to s2 (instead of the NaN that averaging an
            empty list would produce).
        """
        s1_largest_scores = []
        for s1_synset in s1:
            max_score = 0
            for s2_synset in s2:
                if distance_type == 'path':
                    score = s1_synset.path_similarity(s2_synset, simulate_root=False)
                else:
                    score = s1_synset.wup_similarity(s2_synset)
                # The similarity methods return None when no path exists.
                if score is not None and score > max_score:
                    max_score = score
            if max_score != 0:
                s1_largest_scores.append(max_score)
        if not s1_largest_scores:
            return 0.0
        return np.mean(s1_largest_scores)

    def _synsets_similarity(self, synsets1, synsets2):
        """Symmetric similarity of two synset lists using the configured distance_type."""
        forward = self.similarity_score(synsets1, synsets2, self.distance_type)
        backward = self.similarity_score(synsets2, synsets1, self.distance_type)
        return (forward + backward) / 2

    def document_similarity(self, doc1, doc2):
        """Finds the symmetrical similarity between doc1 and doc2."""
        synsets1 = self.doc_to_synsets(doc1)
        synsets2 = self.doc_to_synsets(doc2)
        return self._synsets_similarity(synsets1, synsets2)
#doc1 = 'I like rains'
#doc2 = 'I like showers'
#x = KNN_NLC_Classifer()
#print("Test Similarity Score: ", x.document_similarity(doc1, doc2))
# 1. Load the dataset
# The demo training set published with the Watson NLC Classifier demo.
FILENAME = "https://raw.githubusercontent.com/watson-developer-cloud/natural-language-classifier-nodejs/master/training/weather_data_train.csv"
dataset = pd.read_csv(FILENAME, header=None)
dataset.rename(columns={0: 'text', 1: 'answer'}, inplace=True)
# Binary target: 1 for 'temperature' rows, 0 for everything else.
dataset['output'] = np.where(dataset['answer'] == 'temperature', 1, 0)
Num_Words = dataset.shape[0]
print(dataset.head())
print("\nSize of input file is ", dataset.shape)

array = dataset.values
# NOTE(review): column 2 is the numeric 'output' label and column 0 is the
# 'text' column, so X holds the labels and Y holds the sentences here.
# This is left exactly as posted, because it is the behaviour the question
# is asking about.
X = array[:, 2]
Y = array[:, 0]
validation_size = 0.20
seed = 7

# 4. Train the classifier
classifier = KNN_NLC_Classifer(k=1, distance_type='path')
x_train, y_train = X, Y
classifier.fit(x_train, y_train)

final_test_list = [
    'will it rain',
    'Is it hot outside?',
    'What is the expected high for today?',
    'Will it be foggy tomorrow?',
    'Should I prepare for sleet?',
    'Will there be a storm today?',
    'do we need to take umbrella today',
    'will it be wet tomorrow',
    'is it humid tomorrow',
    'what is the precipitation today',
    'is it freezing outside',
    'is it cool outside',
    "are there strong winds outside",
]

# Normalise every test sentence: keep letters only, lower-case, lemmatize
# each word, and join the words back with single spaces.
test_corpus = []
lmtzr = WordNetLemmatizer()
for sentence in final_test_list:
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    review = ' '.join(lmtzr.lemmatize(word) for word in words)
    test_corpus.append(review)

y_pred_final = classifier.predict(test_corpus)
output_df = pd.DataFrame(data={'text': final_test_list, 'code': y_pred_final})
# Map the numeric prediction back to a readable category name.
output_df['answer'] = np.where(output_df['code'] == 1, 'Temperature', 'Conditions')
print(output_df)
Upvotes: 0
Views: 429
Reputation: 383
In the tutorial, the method similarity_score()
tries to find the highest similarity for each synset in s1 and averages them. However, it does not take into account the words in s1 that have no similar synset in s2. It makes more sense to me to add zeros to s1_largest_scores
for those occasions.
Take two sentences "Will it be uncomfortably hot?"
and "will it rain"
for example. The method in the tutorial gives a similarity of 1, while the method that I propose gives a similarity of 0.53. The sentences belong to different categories, so we would like the similarity to be low.
Here is my code:
# Imports and one-time NLTK setup for the classifier script below.
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
import ssl
# Make the default HTTPS context an unverified one when the running Python
# exposes `ssl._create_unverified_context`; otherwise leave the default alone.
# NOTE(review): this turns off TLS certificate verification for every default
# HTTPS context created in this process, not only for the downloads below.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# Download the NLTK data packages named below (each call hits the network
# the first time it runs on a machine).
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Information-content table built from the Genesis corpus.
# NOTE(review): `genesis_ic` is not referenced anywhere else in this listing.
genesis_ic = wn.ic(genesis, False, 0.0)
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score
from nltk.stem.wordnet import WordNetLemmatizer
class KNN_NLC_Classifer():
    """Nearest-neighbour text classifier based on WordNet synset similarity.

    Documents are converted to lists of WordNet synsets and compared with a
    symmetric, averaged best-match similarity.  In this variant a synset
    that matches nothing in the other document contributes a score of 0, so
    unrelated words pull the similarity down.

    Args:
        k: Stored on the instance; only the 1-nearest-neighbour rule is
            implemented, so the value is currently not used by ``predict``.
        distance_type: ``'path'`` selects ``Synset.path_similarity``; any
            other value selects ``Synset.wup_similarity``.
    """

    def __init__(self, k=1, distance_type='path'):
        self.k = k
        self.distance_type = distance_type

    def fit(self, x_train, y_train):
        """Store the training documents and their labels (no model is built)."""
        self.x_train = x_train
        self.y_train = y_train

    def predict(self, x_test):
        """Return, for each test document, the label of its closest training document.

        Args:
            x_test: iterable of documents (anything ``str()`` can render).

        Returns:
            list with one entry of ``y_train`` per test document.  When no
            training document scores above 0, the first training label is
            returned for that document.
        """
        self.x_test = x_test
        # Convert each training document to synsets once up front; doing it
        # inside the inner loop repeats the costly tokenise/tag step for
        # every (test, train) pair.
        train_synsets = [self.doc_to_synsets(doc) for doc in self.x_train]
        y_predict = []
        for test_doc in x_test:
            test_synsets = self.doc_to_synsets(test_doc)
            max_sim = 0
            max_index = 0
            for j, candidate in enumerate(train_synsets):
                temp = self._synsets_similarity(test_synsets, candidate)
                # Strict comparison: ties keep the earliest training document.
                if temp > max_sim:
                    max_sim = temp
                    max_index = j
            y_predict.append(self.y_train[max_index])
        return y_predict

    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets.

        Returns ``None`` when the tag's first letter has no WordNet equivalent
        (or the tag is empty).
        """
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        return tag_dict.get(tag[:1])

    def doc_to_synsets(self, doc):
        """Return the list of synsets found in a document.

        Tokenizes and tags the words in ``doc``, then takes the first synset
        for each word/tag combination.  Words without a synset for that
        combination are skipped.

        Args:
            doc: document to convert; it is passed through ``str()`` first.

        Returns:
            list of synsets
        """
        tokens = word_tokenize(str(doc) + ' ')
        # A lone token is tagged with a trailing space appended, exactly as
        # the original tutorial code does.
        if len(tokens) == 1:
            tags = nltk.pos_tag([tokens[0] + ' '])
        else:
            tags = nltk.pos_tag(tokens)
        synsets = []
        for token, (_, pos) in zip(tokens, tags):
            candidates = wn.synsets(token, self.convert_tag(pos))
            if candidates:
                synsets.append(candidates[0])
        return synsets

    def similarity_score(self, s1, s2, distance_type='path'):
        """Calculate the normalized similarity score of s1 onto s2.

        For each synset in s1, finds the largest similarity to any synset in
        s2 (0 when nothing in s2 is comparable).  The scores are summed and
        divided by the number of synsets in s1.

        Args:
            s1, s2: lists of synsets from doc_to_synsets
            distance_type: ``'path'`` for path similarity, anything else for
                Wu-Palmer similarity.

        Returns:
            mean best-match score over s1, or ``0.0`` when s1 is empty
            (instead of the NaN that averaging an empty list would produce).
        """
        s1_largest_scores = []
        for s1_synset in s1:
            max_score = 0
            for s2_synset in s2:
                if distance_type == 'path':
                    score = s1_synset.path_similarity(s2_synset, simulate_root=False)
                else:
                    score = s1_synset.wup_similarity(s2_synset)
                # The similarity methods return None when no path exists.
                if score is not None and score > max_score:
                    max_score = score
            # Unmatched synsets are kept as 0 on purpose: they must lower the
            # average rather than be dropped from it.
            s1_largest_scores.append(max_score)
        if not s1_largest_scores:
            return 0.0
        return np.mean(s1_largest_scores)

    def _synsets_similarity(self, synsets1, synsets2):
        """Symmetric similarity of two synset lists using the configured distance_type."""
        forward = self.similarity_score(synsets1, synsets2, self.distance_type)
        backward = self.similarity_score(synsets2, synsets1, self.distance_type)
        return (forward + backward) / 2

    def document_similarity(self, doc1, doc2):
        """Finds the symmetrical similarity between doc1 and doc2."""
        synsets1 = self.doc_to_synsets(doc1)
        synsets2 = self.doc_to_synsets(doc2)
        return self._synsets_similarity(synsets1, synsets2)
# 1. Load the dataset
# The demo training set published with the Watson NLC Classifier demo.
FILENAME = "https://raw.githubusercontent.com/watson-developer-cloud/natural-language-classifier-nodejs/master/training/weather_data_train.csv"
dataset = pd.read_csv(FILENAME, header=None)
dataset.rename(columns={0: 'text', 1: 'answer'}, inplace=True)
# Binary target: 1 for 'temperature' rows, 0 for everything else.
dataset['output'] = np.where(dataset['answer'] == 'temperature', 1, 0)
Num_Words = dataset.shape[0]
print(dataset)
print("\nSize of input file is ", dataset.shape)

array = dataset.values
# Column 0 is the sentence text (features); column 2 is the numeric label.
X = array[:, 0]
Y = array[:, 2]
validation_size = 0.20
seed = 7

# 4. Train the classifier
classifier = KNN_NLC_Classifer(k=1, distance_type='path')
x_train, y_train = X, Y
classifier.fit(x_train, y_train)

final_test_list = [
    'will it rain',
    'Is it hot outside?',
    'What is the expected high for today?',
    'Will it be foggy tomorrow?',
    'Should I prepare for sleet?',
    'Will there be a storm today?',
    'do we need to take umbrella today',
    'will it be wet tomorrow',
    'is it humid tomorrow',
    'what is the precipitation today',
    'is it freezing outside',
    'is it cool outside',
    'are there strong winds outside',
]

# Normalise every test sentence: keep letters only, lower-case, lemmatize
# each word, and join the words back with single spaces.
test_corpus = []
lmtzr = WordNetLemmatizer()
for sentence in final_test_list:
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    review = ' '.join(lmtzr.lemmatize(word) for word in words)
    test_corpus.append(review)

y_pred_final = classifier.predict(test_corpus)
output_df = pd.DataFrame(data={'text': final_test_list, 'code': y_pred_final})
# Map the numeric prediction back to a readable category name.
output_df['answer'] = np.where(output_df['code'] == 1, 'Temperature', 'Conditions')
print(output_df)
And here is the result which I consider more reasonable:
text code answer
0 will it rain 0 Conditions
1 Is it hot outside? 1 Temperature
2 What is the expected high for today? 1 Temperature
3 Will it be foggy tomorrow? 1 Temperature
4 Should I prepare for sleet? 0 Conditions
5 Will there be a storm today? 1 Temperature
6 do we need to take umbrella today 0 Conditions
7 will it be wet tomorrow 1 Temperature
8 is it humid tomorrow 1 Temperature
9 what is the precipitation today 1 Temperature
10 is it freezing outside 1 Temperature
11 is it cool outside 1 Temperature
12 are there strong winds outside 0 Conditions
Upvotes: 2
Reputation: 383
After printing out x_train
and y_train
, you'll figure out the bug.
For some reason, your Y
is the feature while your X
is your label. If you changed the line x_train, y_train = X, Y
to x_train, y_train = Y, X
, it would work.
Upvotes: 1