user3778289

Reputation: 323

How to read tokens from a file one by one in Python?

The problem I am experiencing is that my code does not give me individual words/tokens to match against the stop words and remove from the original text. Instead, it gives me whole sentences, so I cannot match them with the stop words. Please show me a way to get individual tokens so I can match them against the stop words and remove them.

from nltk.corpus import stopwords
import string, os
def remove_stopwords(ifile):
    processed_word_list = []
    stopword = stopwords.words("urdu")
    text = open(ifile, 'r').readlines()
    for word in text:
        print(word)
        if word not in stopword:
            processed_word_list.append('*')
            print(processed_word_list)
            return processed_word_list

if __name__ == "__main__":
    print ("Input file path: ")
    ifile = input()
    remove_stopwords(ifile)
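
For illustration, here is a minimal sketch of what I am seeing (assuming the input file is plain text; "sample.txt" is only a placeholder name): readlines() returns whole lines, so each word in my loop is really a full sentence.

# Sketch of the observed behaviour: readlines() yields whole lines, not tokens
lines = open("sample.txt", 'r').readlines()
for line in lines:
    print(repr(line))  # prints an entire sentence/line, not a single word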

Upvotes: 2

Views: 2201

Answers (1)

M3RS

Reputation: 7530

Try this instead. It reads the whole file at once, parses its contents into a list of words with ast.literal_eval, and checks every word instead of returning from inside the loop on the first iteration:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string, os, ast
def remove_stopwords(ifile):
    processed_word_list = []
    stopword = stopwords.words("urdu")
    # The file is expected to contain a Python list literal, e.g. ['word1', 'word2'];
    # ast.literal_eval parses that text into an actual list of word strings
    words = ast.literal_eval(open(ifile, 'r').read())
    for word in words:
        print(word)
        if word not in stopword:
            processed_word_list.append('*')   # replace words that are not stop words with '*'
        else:
            processed_word_list.append(word)  # keep the stop word itself
    print(processed_word_list)
    return processed_word_list

if __name__ == "__main__":
    print ("Input file path: ")
    ifile = input()
    remove_stopwords(ifile)
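
Note that ast.literal_eval assumes the file contains a Python list literal such as ['word1', 'word2', ...]. If your file is plain text instead, you can split each line into tokens with word_tokenize. Here is a minimal sketch of that variant; the function name is only illustrative, and it assumes NLTK's punkt tokenizer data has been downloaded via nltk.download('punkt'):

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords_plain_text(ifile):
    # Same replacement logic as above, applied to tokens from plain text
    processed_word_list = []
    stopword = stopwords.words("urdu")
    with open(ifile, 'r', encoding='utf-8') as f:
        for line in f:
            for word in word_tokenize(line):  # individual tokens, not whole lines
                if word not in stopword:
                    processed_word_list.append('*')
                else:
                    processed_word_list.append(word)
    print(processed_word_list)
    return processed_word_list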

Upvotes: 2
