Reputation: 15
I am trying to write a class that searches for all files on my computer with a specific extension. To make the process faster I use threading: it searches all hard disks at the same time.
I know it finds all the paths, because they show up when I print them with print(file_path).
But the values are never appended to self.allfiles,
and I do not know why.
Here is the code:
from concurrent import futures
import time
import win32api
import os
class SearchThreader():
    """Search every logical drive for ".mp3" files.

    The search is started from __init__, which calls fullThreadSearch().
    """

    def __init__(self):
        # Intended to hold the paths of all matching files.
        self.allfiles = []
        # The drive string is split on NUL characters; the [:-1] slice drops
        # the last piece of the split.
        self.harddisks = win32api.GetLogicalDriveStrings().split('\000')[:-1]
        # Skip the folders that shouldn't have files with this extension.
        # NOTE(review): there is no comma after "libs", so Python joins it
        # with "Scripts" into the single entry "libsScripts"; neither "libs"
        # nor "Scripts" is actually excluded.
        # NOTE(review): "ProgrammData" / "Programm Files" look like
        # misspellings of the Windows folder names -- TODO confirm.
        self.exlude = {
            "$SysReset", "AMD", "inetpub", "NVIDIA", "PerfLogs",
            "Windows.old", "Windows", "ProgrammData",
            "Programm Files (x86)", "Programm Files",
            "Doc", "Fotos", "Lib", "lib", "libs"
            "Scripts", "Tools", "bin", "Config", "Logs", "log",
            "mods", "win"
        }
        self.fullThreadSearch()

    def SearchHarddisk(self, hd):
        """Walk the tree rooted at `hd` and record every path ending in ".mp3"."""
        for root, dirs, files in os.walk(hd, topdown=True):
            # Assigning to dirs[:] edits the list in place, which is how a
            # top-down os.walk is told not to descend into excluded folders.
            dirs[:] = [d for d in dirs if d not in self.exlude]
            for f_name in files:
                file_path = os.path.join(root, f_name)
                if file_path.endswith(".mp3"):
                    # NOTE(review): this method is submitted to a
                    # ProcessPoolExecutor, so the append happens on the worker
                    # process's copy of the object; the list in the parent
                    # process is not changed and nothing is returned.
                    self.allfiles.append(file_path)
                    print(file_path)

    def fullThreadSearch(self):
        """Submit one SearchHarddisk call per drive to a process pool."""
        with futures.ProcessPoolExecutor(max_workers=len(self.harddisks)) as thr:
            for harddisk in self.harddisks:
                # The future returned by submit() is discarded, so neither a
                # result nor an exception from the worker is ever inspected.
                thr.submit(self.SearchHarddisk, harddisk)
if __name__ == "__main__":
    # Time the complete search: constructing SearchThreader runs it.
    began = time.time()
    searcher = SearchThreader()
    print(searcher.allfiles)
    elapsed = time.time() - began
    print(elapsed)
Upvotes: 0
Views: 65
Reputation: 123393
As mentioned in @Trap's answer, you need to return the results from the SearchHarddisk() method and collect them in fullThreadSearch(), instead of trying to append them to self.allfiles inside SearchHarddisk(). This is because each invocation of SearchHarddisk() runs in its own process, with its own address space, so there's effectively a different self.allfiles list object in each one.
Here's something with those changes made that seems to work on my Windows machine. Note that I based it on the sample code shown in the ProcessPoolExecutor Example section of the documentation which uses the ProcessPoolExecutor.map()
method instead of calling ProcessPoolExecutor.submit()
repeatedly.
import concurrent.futures as futures
import os
import time
import win32api
class SearchThreader():
    """Collect the paths of all ``.mp3`` files on every logical drive.

    The search runs as part of construction: once ``SearchThreader()``
    returns, ``allfiles`` holds every matching path that was found.
    Each drive is walked in its own worker process.
    """

    # Folder names that shouldn't contain files with this extension; a
    # directory whose name is listed here is never descended into.
    # Fixes over the earlier literal: the comma after "libs" was missing
    # (which silently produced the single entry "libsScripts"), and the
    # "Program Files" / "ProgramData" names were misspelled so they never
    # matched the real folders.
    EXCLUDED_DIRS = frozenset({
        "$SysReset", "AMD", "inetpub", "NVIDIA", "PerfLogs",
        "Windows.old", "Windows", "ProgramData",
        "Program Files (x86)", "Program Files",
        "Doc", "Fotos", "Lib", "lib", "libs",
        "Scripts", "Tools", "bin", "Config", "Logs", "log",
        "mods", "win",
    })

    def __init__(self):
        self.allfiles = []
        # Split the NUL-separated drive string; [:-1] drops the last piece.
        self.harddisks = win32api.GetLogicalDriveStrings().split('\000')[:-1]
        # Per-instance, mutable copy so callers can still adjust it.
        self.exlude = set(self.EXCLUDED_DIRS)
        self.fullThreadSearch()

    def SearchHarddisk(self, hd):
        """Return the paths of all ``.mp3`` files below ``hd``.

        Runs inside a worker process, so the matches are returned to the
        caller rather than stored on ``self`` (the worker only has a copy
        of this object).

        Args:
            hd: Root directory (normally a drive such as ``"C:\\"``).

        Returns:
            A list of full paths, in ``os.walk`` order.
        """
        allfiles = []  # Local variable.
        for root, dirs, files in os.walk(hd, topdown=True):
            # In-place edit: tells a top-down os.walk which folders to skip.
            dirs[:] = [d for d in dirs if d not in self.exlude]
            for f_name in files:
                # Compare case-insensitively so "SONG.MP3" is found as well.
                if f_name.lower().endswith(".mp3"):
                    file_path = os.path.join(root, f_name)
                    allfiles.append(file_path)  # Append to local list.
                    print(file_path)
        return allfiles  # Return all found on this harddisk.

    def fullThreadSearch(self):
        """Search all drives in parallel and gather the results in ``allfiles``."""
        with futures.ProcessPoolExecutor() as executor:
            # map() yields the per-drive lists in the same order as harddisks.
            for harddisk, matching_files in zip(
                    self.harddisks,
                    executor.map(self.SearchHarddisk, self.harddisks)):
                print('harddisk: {}, matching_files: {}'.format(harddisk, matching_files))
                self.allfiles.extend(matching_files)
if __name__ == "__main__":
    # Time the complete search: constructing SearchThreader runs it.
    began = time.time()
    searcher = SearchThreader()
    print(searcher.allfiles)
    elapsed = time.time() - began
    print(elapsed)
Upvotes: 1
Reputation: 228
I've never used the ProcessPoolExecutor class, but I think your error is due to the fact that self.allfiles is not shared across the processes that are created. Your SearchHarddisk method should return a value, and after the processes are done, you have to gather all the results and append them to self.allfiles. This is what I would have done, but since I'm not running Windows, I can't test it, so I'm not sure it will work.
from concurrent import futures
import time
import win32api
import os
class SearchThreader():
    """Collect the paths of all ``.mp3`` files on every logical drive.

    The search runs as part of construction: once ``SearchThreader()``
    returns, ``allfiles`` is a flat list of every matching path.
    Each drive is walked in its own worker process.
    """

    # Folder names that shouldn't contain files with this extension; a
    # directory whose name is listed here is never descended into.
    # Fixes over the earlier literal: the comma after "libs" was missing
    # (which silently produced the single entry "libsScripts"), and the
    # "Program Files" / "ProgramData" names were misspelled so they never
    # matched the real folders.
    EXCLUDED_DIRS = frozenset({
        "$SysReset", "AMD", "inetpub", "NVIDIA", "PerfLogs",
        "Windows.old", "Windows", "ProgramData",
        "Program Files (x86)", "Program Files",
        "Doc", "Fotos", "Lib", "lib", "libs",
        "Scripts", "Tools", "bin", "Config", "Logs", "log",
        "mods", "win",
    })

    def __init__(self):
        self.allfiles = []
        # Split the NUL-separated drive string; [:-1] drops the last piece.
        self.harddisks = win32api.GetLogicalDriveStrings().split('\000')[:-1]
        # Per-instance, mutable copy so callers can still adjust it.
        self.exlude = set(self.EXCLUDED_DIRS)
        self.fullThreadSearch()

    def SearchHarddisk(self, hd):
        """Return the paths of all ``.mp3`` files below ``hd``.

        Runs inside a worker process, so the matches are returned to the
        caller rather than stored on ``self`` (the worker only has a copy
        of this object).

        Args:
            hd: Root directory (normally a drive such as ``"C:\\"``).

        Returns:
            A list of full paths, in ``os.walk`` order.
        """
        # Must not be called ``files``: that name is rebound by the os.walk
        # loop below, which would discard the results and make the inner
        # loop append to the very list it is iterating over.
        matches = []
        for root, dirs, files in os.walk(hd, topdown=True):
            # In-place edit: tells a top-down os.walk which folders to skip.
            dirs[:] = [d for d in dirs if d not in self.exlude]
            for f_name in files:
                # Compare case-insensitively so "SONG.MP3" is found as well.
                if f_name.lower().endswith(".mp3"):
                    file_path = os.path.join(root, f_name)
                    matches.append(file_path)
                    print(file_path)
        return matches

    def fullThreadSearch(self):
        """Search all drives in parallel and gather the results in ``allfiles``."""
        if not self.harddisks:
            # ProcessPoolExecutor rejects max_workers=0 with a ValueError.
            return
        with futures.ProcessPoolExecutor(max_workers=len(self.harddisks)) as thr:
            future_objects = [thr.submit(self.SearchHarddisk, harddisk)
                              for harddisk in self.harddisks]
            # Each result is one drive's list; extend() keeps allfiles flat
            # instead of turning it into a list of lists.
            for future in future_objects:
                self.allfiles.extend(future.result())
if __name__ == "__main__":
    # Time the complete search: constructing SearchThreader runs it.
    began = time.time()
    searcher = SearchThreader()
    print(searcher.allfiles)
    elapsed = time.time() - began
    print(elapsed)
Upvotes: 0