Reputation: 1
y'all. I've been trying to remove stopwords from a list of text extracted from a PDF, but whenever I use NLTK to remove those stopwords (either from the list itself or into a new list), the TXT file I write out still contains the original list unchanged. I made a separate program just to test whether the stopwords filtering even works, and it works fine there, but for some reason not in this case.
Is there also a better method to do this? Any help would be much appreciated.
import PyPDF2 as pdf
import nltk
from nltk.corpus import stopwords
stopping_words = set(stopwords.words('english'))
stop_words = list(stopping_words)
# creating an object
file = open("C:\\Users\\Name\\Documents\\Data Analytics Club\\SampleBook-English2-Reading.pdf", "rb")
# creating a pdf reader object
fileReader = pdf.PdfFileReader(file)
# print the number of pages in pdf file
textData = []
for pages in fileReader.pages:
    theText = pages.extractText()
    # for char in theText:
    #     theText.replace(char, "\n")
    textData.append(theText)
final_list = []
for i in textData:
    if i in stopwords.words('english'):
        textData.remove(i)
    final_list.append(i.strip('\n'))
# filtered_word_list = final_list[:]  # make a copy of the word_list
# for word in final_list:  # iterate over word_list
#     if word in stopwords.words('english'):
#         final_list.remove(word)  # remove word from filtered_word_list if it is a stopword
# filtered_words = [word for word in final_list if word not in stop_words]
# [s.strip('\n') for s in theText]
# [s.replace('\n', '') for s in theText]
# text_data = []
# for elem in textData:
#     text_data.extend(elem.strip().split('n'))
# for line in textData:
#     textData.append(line.strip().split('\n'))
#--------------------------------------------------------------------
import os.path
save_path = "C:\\Users\\Name\\Documents\\Data Analytics Club"
name_of_file = input("What is the name of the file: ")
completeName = os.path.join(save_path, name_of_file + ".txt")
file1 = open(completeName, "w")
# file1.write(str(final_list))
for line in final_list:
    file1.write(line)
file1.close()
Upvotes: 0
Views: 568
Reputation: 1314
The problem is in this line:

if i in stopwords.words('english'):
    textData.remove(i)

You are only removing a single occurrence of that word: list.remove() deletes just the first matching element of the list.
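A quick self-contained demonstration of that behaviour (plain Python, no NLTK needed):

```python
# list.remove() deletes only the first matching element,
# so duplicates later in the list survive.
words = ["the", "cat", "the", "mat"]
words.remove("the")
print(words)  # ['cat', 'the', 'mat'] -- the second "the" is still there
```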
What you probably want to do instead is filter the list and assign the result back (filter returns a new sequence rather than modifying textData in place):

Python 2

textData = filter(lambda x: x != i, textData)

Python 3

textData = list(filter(lambda x: x != i, textData))
EDIT
So I realized quite a bit late that you are actually iterating over the list that you are removing elements from. You would not want to do that: removing elements from a list while iterating over it makes the iterator skip items.
Instead, what you would want to do is:
for i in set(textData):
    if i in stopwords.words('english'):
        pass
    else:
        final_list.append(i.strip('\n'))
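A runnable sketch of that loop, with a tiny hardcoded stopword set standing in for stopwords.words('english') so it works without NLTK (the items here carry '\n' like the extracted lines, so the membership check strips it first):

```python
# Tiny stand-in for NLTK's English stopword list.
stop_words = {"the", "a", "is", "of"}

textData = ["the\n", "quick\n", "fox\n", "the\n", "a\n", "lazy\n"]

final_list = []
# Iterate over a separate collection (a set of the unique items),
# so we never mutate the list we are looping over.
for i in set(textData):
    if i.strip('\n') in stop_words:
        pass
    else:
        final_list.append(i.strip('\n'))

print(sorted(final_list))  # ['fox', 'lazy', 'quick']
```

Note that set() also deduplicates the non-stopwords, which is fine if you only care about the unique content words but loses word counts otherwise.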
EDIT 2
So apparently the issue comes from the extraction loop, which needs to split each page's text into lines before collecting it:

for pages in fileReader.pages:
    theText = pages.extractText()
    words = theText.splitlines()
    textData.extend(words)
However, for the file I tested this against, it still gave issues with spacing and merged words in the same sentence, such as 'sameuserwithinacertaintimeinterval(typicallysettoa' and 'bedirectionaltocapturethefactthatonestorywasclicked'.
That being said, the issue lies within PyPDF2's text extraction itself, so you may wish to resort to another reader. Comment if it still doesn't help.
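Whichever reader you end up with, the extracted page text still has to be broken into individual words before the stopword check: in the original loop each item of textData was a whole page (or a whole line after splitlines()), which never equals a single stopword. A minimal sketch with a hardcoded sample string and stopword set, so it runs without a PDF or NLTK:

```python
stop_words = {"the", "a", "is", "of"}  # stand-in for stopwords.words('english')

# Pretend this string came from pages.extractText().
theText = "the quick fox\nis on a mat"

# str.split() with no argument breaks on any whitespace
# (spaces and newlines alike), yielding actual words.
words = theText.lower().split()
filtered = [w for w in words if w not in stop_words]
print(filtered)  # ['quick', 'fox', 'on', 'mat']
```

This still can't recover words that the reader merged together in the first place, but it makes the stopword comparison operate on words rather than whole lines.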
Upvotes: 1