Reputation: 323
The problem I am experiencing is that my code never matches individual words/tokens against the stop-word list: each item it iterates over is a whole line (sentence) from the file rather than a single word, so nothing ever matches. How can I get individual tokens from the original text so I can compare them against the stop words and remove them?
from nltk.corpus import stopwords
import string, os

def remove_stopwords(ifile):
    processed_word_list = []
    stopword = stopwords.words("urdu")
    text = open(ifile, 'r').readlines()
    for word in text:
        print(word)
        if word not in stopword:
            processed_word_list.append('*')
    print(processed_word_list)
    return processed_word_list

if __name__ == "__main__":
    print("Input file path: ")
    ifile = input()
    remove_stopwords(ifile)
Upvotes: 2
Views: 2201
Reputation: 7530
Try this instead:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string, os, ast

def remove_stopwords(ifile):
    processed_word_list = []
    stopword = stopwords.words("urdu")
    words = ast.literal_eval(open(ifile, 'r').read())
    for word in words:
        print(word)
        if word not in stopword:
            processed_word_list.append('*')
        else:
            processed_word_list.append(word)
    print(processed_word_list)
    return processed_word_list

if __name__ == "__main__":
    print("Input file path: ")
    ifile = input()
    remove_stopwords(ifile)
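Note that ast.literal_eval only works if the file already contains a Python-style list of words. If the file holds plain Urdu text, you can instead split each line into tokens with word_tokenize (imported above but unused) and filter those. A minimal sketch, assuming the same custom "urdu" stop-word list is available, the punkt tokenizer data is installed, and the file is plain UTF-8 text:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords_from_text(ifile):
    # Load the Urdu stop-word list once, as a set for fast lookup
    stop_words = set(stopwords.words("urdu"))
    processed_word_list = []
    with open(ifile, 'r', encoding='utf-8') as f:
        for line in f:
            # Split the line into individual tokens instead of
            # comparing the whole line against the stop words
            for word in word_tokenize(line):
                # Keep only the words that are not stop words
                if word not in stop_words:
                    processed_word_list.append(word)
    print(processed_word_list)
    return processed_word_list

if __name__ == "__main__":
    remove_stopwords_from_text(input("Input file path: "))

word_tokenize with its default settings splits mostly on whitespace and punctuation, which is usually adequate for space-separated Urdu text; a dedicated Urdu tokenizer may handle it better.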
Upvotes: 2