Reputation: 65
I'm trying to write Python code that does aspect-based sentiment analysis of product reviews using a dependency parser. I created an example review:
"The Sound Quality is great but the battery life is bad."
The output is : [['soundquality', ['great']], ['batterylife', ['bad']]]
I can properly get the aspect and its adjective with this sentence, but when I change the text to:
"The Sound Quality is not great but the battery life is not bad."
The output still stays the same. How can I add negation handling to my code? And are there ways to improve what I currently have?
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import stanfordnlp
# Download the English stanfordnlp resources, then each NLTK package by name.
stanfordnlp.download('en')
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# Sample review, lower-cased before any tokenisation.
txt = "The Sound Quality is not great but the battery life is not bad.".lower()
sentList = nltk.sent_tokenize(txt)

# POS-tag each sentence on its own and gather every (token, tag) pair
# into one flat list.
taggedList = []
for line in sentList:
    txt_list = nltk.word_tokenize(line)  # tokenize sentence
    taggedList.extend(nltk.pos_tag(txt_list))  # perform POS-Tagging
print(taggedList)
def _merge_adjacent_nouns(tagged):
    """Join each pair of neighbouring 'NN' tokens into a single word.

    Args:
        tagged: sequence of (token, tag) pairs.

    Returns:
        A list of token strings in their original order. Two consecutive
        tokens that are both tagged 'NN' are concatenated (no separator)
        into one entry; every other token is kept unchanged. Pairs never
        overlap and no token is dropped, including the final one.
    """
    merged = []
    total = len(tagged)
    pos = 0
    while pos < total:
        starts_pair = (
            pos + 1 < total
            and tagged[pos][1] == 'NN'
            and tagged[pos + 1][1] == 'NN'
        )
        if starts_pair:
            merged.append(tagged[pos][0] + tagged[pos + 1][0])
            pos += 2  # both nouns are consumed by the merged entry
        else:
            merged.append(tagged[pos][0])
            pos += 1
    return merged


# Compound aspects such as "sound quality" become one token ("soundquality").
newwordList = _merge_adjacent_nouns(taggedList)
finaltxt = ' '.join(newwordList)
print(finaltxt)
# Filter English stop words out of the merged text, then POS-tag the rest.
# NOTE(review): if the stop-word list contains negation words such as
# "not", they are removed here and never reach the later feature step --
# confirm against the NLTK stop-word list.
stop_words = set(stopwords.words('english'))
new_txt_list = nltk.word_tokenize(finaltxt)
wordsList = [token for token in new_txt_list if token not in stop_words]
taggedList = nltk.pos_tag(wordsList)
# Run the default stanfordnlp pipeline over the merged text.
nlp = stanfordnlp.Pipeline()
doc = nlp(finaltxt)

# One [dependent text, head, relation] entry per dependency edge of the
# first sentence. A head index of 0 is stored as-is; any other index is
# replaced by the word at that 1-based position in newwordList.
dep_node = []
for dep_edge in doc.sentences[0].dependencies:
    head = dep_edge[0].index
    if int(head) != 0:
        head = newwordList[int(head) - 1]
    dep_node.append([dep_edge[2].text, head, dep_edge[1]])
print(dep_node)
# POS tags whose words are kept as candidate features.
feature_tags = ('JJ', 'NN', 'JJR', 'NNS', 'RB')

featureList = []
categories = []
totalfeatureList = []
for word, tag in taggedList:
    if tag not in feature_tags:
        continue
    featureList.append([word, tag])
    totalfeatureList.append([word, tag])  # stores all the features for every sentence
    categories.append(word)
print(featureList)
print(categories)
# Dependency relations that are allowed to link a feature to another word.
linking_relations = ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"]

# For every feature word, collect the word on the other end of each
# qualifying dependency edge that touches it.
fcluster = []
for feature in featureList:
    target = feature[0]
    related = []
    for dependent, head, relation in dep_node:
        if relation not in linking_relations:
            continue
        if dependent == target:
            related.append(head)
        elif head == target:
            related.append(dependent)
    fcluster.append([target, related])
print(fcluster)
# Map each feature word to its POS tag; a repeated word keeps its last tag.
dic = {feature[0]: feature[1] for feature in featureList}

# Keep only the clusters whose feature word is tagged 'NN'.
finalcluster = [cluster for cluster in fcluster if dic[cluster[0]] == 'NN']
print(finalcluster)
Upvotes: 1
Views: 822
Reputation: 25249
You may wish to try spaCy. The following pattern will catch `is` or `are`, optionally followed by `no` or `not`, optionally followed by `very`, and then an adjective:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
doc = nlp('The product is very good')

# "is"/"are", an optional "no"/"not", an optional "very", then an adjective.
mood_pattern = [
    {"LOWER": {"IN": ["is", "are"]}},
    {"LOWER": {"IN": ["no", "not"]}, "OP": "?"},
    {"LOWER": "very", "OP": "?"},
    {"POS": "ADJ"},
]
matcher = Matcher(nlp.vocab)
# NOTE(review): the (key, None, pattern) call form may depend on the
# installed spaCy version -- verify against the Matcher.add documentation.
matcher.add("mood", None, mood_pattern)

output = []
for nc in doc.noun_chunks:
    # Look at up to three tokens after the right edge of the chunk's root.
    window_start = nc.root.right_edge.i + 1
    d = doc[window_start:window_start + 3]
    matches = matcher(d)
    if not matches:
        continue
    _, start, end = matches[0]
    # start + 1 skips the first matched token (the "is"/"are").
    output.append((nc.text, d[start + 1:end].text))
print(output)
[('The product', 'very good')]
Alternatively, you may broaden the matching pattern with information from the dependency parser, which adds a definition of an adjectival phrase:
# "is"/"are", an optional "no"/"not", an optional token with the advmod
# dependency, then a token with the acomp dependency.
dep_pattern = [
    {"LOWER": {"IN": ["is", "are"]}},
    {"LOWER": {"IN": ["no", "not"]}, "OP": "?"},
    {"DEP": "advmod", "OP": "?"},
    {"DEP": "acomp"},
]
matcher = Matcher(nlp.vocab, validate=True)
# NOTE(review): the (key, None, pattern) call form may depend on the
# installed spaCy version -- verify against the Matcher.add documentation.
matcher.add("mood", None, dep_pattern)

output = []
for nc in doc.noun_chunks:
    # Look at up to three tokens after the right edge of the chunk's root.
    window_start = nc.root.right_edge.i + 1
    d = doc[window_start:window_start + 3]
    matches = matcher(d)
    if not matches:
        continue
    _, start, end = matches[0]
    # start + 1 skips the first matched token (the "is"/"are").
    output.append((nc.text, d[start + 1:end].text))
print(output)
[('The product', 'very good')]
Upvotes: 1