Reputation: 61
I have some code here that loops through 10 files in a directory. Each file has perhaps thousands of lines. The code then filters some of the words out of these files, line by line. I understand that this can take a while, but could my code be improved in some way to make this process faster? Am I making a coding mistake somewhere that causes a bottleneck? Any help or advice would be greatly appreciated! Here is my code:
import os
def remove_stop_words(string, stopwords_list):
    """Return *string* with its stop words removed, terminated by a newline.

    The input is split on whitespace; every word whose lower-cased form
    matches a lower-cased entry of *stopwords_list* is dropped. The
    remaining words are joined with single spaces and a trailing ``'\\n'``
    is appended, so an empty or all-stop-word input yields ``'\\n'``.

    Args:
        string: The text (typically one line of a file) to filter.
        stopwords_list: Iterable of stop words; matching is case-insensitive.

    Returns:
        The filtered text followed by a newline character.
    """
    # Lower-case the stop words once and keep them in a set, so each word
    # costs a single hash lookup instead of a rescan of the whole stop-word
    # list (with a fresh ``.lower()`` on every entry) per word.
    lowered_stopwords = {stop_word.lower() for stop_word in stopwords_list}
    kept_words = [
        word for word in string.split()
        if word.lower() not in lowered_stopwords
    ]
    return ' '.join(kept_words) + '\n'
def get_stop_words_list(stopwords_path):
    """Load the stop words stored in the file at *stopwords_path*.

    The whole file is read and split on any whitespace, so the words may be
    separated by spaces, tabs or newlines.

    Args:
        stopwords_path: Path of the text file containing the stop words.

    Returns:
        A list of the whitespace-separated tokens found in the file.
    """
    with open(stopwords_path, 'r') as stopwords_file:
        contents = stopwords_file.read()
    return contents.split()
def main(
    input_location='C:/Users/User/Desktop/mini_mouse',
    output_location='C:/Users/User/Desktop/test/',
    stop_words_path='C:/Users/User/Desktop/NLTK-stop-word-list.txt',
):
    """Filter stop words out of every file found under *input_location*.

    For each file found by walking *input_location*, a file named
    ``<name>_filtered`` is written into *output_location* containing the
    same lines with the stop words removed.

    Args:
        input_location: Directory tree whose files are filtered.
        output_location: Directory that receives the ``*_filtered`` files.
        stop_words_path: Text file holding the whitespace-separated stop words.
    """
    stopwords = get_stop_words_list(stop_words_path)
    for root, dirs, files in os.walk(input_location):
        for name in files:
            # Full path of the input file currently being processed.
            file_path = os.path.join(root, name)
            # Output file that receives the filtered copy of this input file.
            new_file_path = os.path.join(output_location, name) + '_filtered'
            # Open the output file once per input file rather than once per
            # line: re-opening and closing it for every line was the main
            # bottleneck. Append mode is kept, so existing content in the
            # output file is preserved and new lines are added after it.
            with open(file_path, 'r') as f, open(new_file_path, 'a') as output_file:
                for line in f:  # stream the input line by line
                    output_file.write(remove_stop_words(line, stopwords))


if __name__ == "__main__":
    main()
Upvotes: 2
Views: 57
Reputation: 985
Here's a solution for writing file by file, rather than line by line:
# Filter every file under input_location and write each result in one go.
for root, dirs, files in os.walk(input_location):
    for name in files:
        # Full path of the input file currently being processed.
        file_path = os.path.join(root, name)
        with open(file_path, 'r') as f:
            # remove_stop_words already ends each filtered line with '\n', so
            # no extra newline is added here (that would double-space the
            # output). Collecting the lines in a list also avoids repeated
            # string concatenation.
            filtered_lines = [remove_stop_words(line, stopwords) for line in f]
        # Output file that receives the filtered copy of this input file.
        new_file_path = os.path.join(output_location, name) + '_filtered'
        with open(new_file_path, 'a') as output_file:
            # Write the whole filtered file with a single call.
            output_file.writelines(filtered_lines)
Upvotes: 2