Reputation:
I'm trying to remove directories from os.walk (I don't need the files from those dirs)
My code:
def findit(root, exclude_files=[], exclude_dirs=[]):
    """Return a generator of file paths found under root, minus exclusions.

    exclude_files is a list of shell-style wildcards matched against each
    file name; exclude_dirs is a list of directory paths compared against
    each directory that os.walk visits.
    """
    # Translate every wildcard to a regex and OR them into a single pattern.
    exclude_files = (fnmatch.translate(i) for i in exclude_files)
    exclude_files = '('+')|('.join(exclude_files)+')'
    exclude_files = re.compile(exclude_files)
    # Normalize the excluded paths so they can be compared with walked paths.
    exclude_dirs = (os.path.normpath(i) for i in exclude_dirs)
    exclude_dirs = (os.path.normcase(i) for i in exclude_dirs)
    exclude_dirs = set(exclude_dirs)
    return (os.path.join(r,f)
            for r,_,f in os.walk(root)
            # Only the directory r itself is tested here. os.walk still
            # descends into its subdirectories, and their paths are not in
            # exclude_dirs, so their files are still produced.
            if os.path.normpath(os.path.normcase(r)) not in exclude_dirs
            for f in f
            if not exclude_files.match(os.path.normcase(f)))
Filtering the files works, but when I try to filter out c:/windows it still shows files from the Windows directories. Am I missing something?
# Example call: walk the whole C: drive, skipping some file types and
# three directories.
filelist = list(
    findit(
        'c:/',
        exclude_files=['*.dll', '*.dat', '*.log', '*.exe'],
        exclude_dirs=['c:/windows', 'c:/program files', 'c:/else'],
    )
)
Upvotes: 0
Views: 2234
Reputation: 9
You can use the "continue" keyword to skip an iteration while traversing with os.walk(pathName):
for dirpath, dirnames, filenames in os.walk(pathName):
    # Supply a regular expression (or plain string) as `pattern` to pick the
    # folders to skip; it is searched for anywhere in the directory path.
    found = re.search(pattern, dirpath)
    # Skip this directory when the pattern produced a non-empty match.
    if found is not None and found.group(0):
        continue
Upvotes: 0
Reputation: 1253
Reading the reply above made me wonder: it seemed to me that os.walk was missing and that the root parameter was not being used as needed. Also, the case where either of the optional arguments is an empty list should work. I suggest a slight variation with fewer namespace look-ups and with wildcard exclusion for directories at each directory level:
import os
import re
import fnmatch
import os.path
def findit(root, exclude_files=None, exclude_dirs=None, exclude_dirs_wc=None):
    """Generate all files found under root excluding some.

    Args:
        root: Directory at which the walk starts.
        exclude_files: Iterable of Unix shell-style wildcards; a file whose
            name matches any of them is skipped in every directory.
        exclude_dirs: Iterable of directory paths starting at root (no
            wildcards); each one is skipped together with everything
            below it.
        exclude_dirs_wc: Iterable of Unix shell-style wildcards; at every
            level, a sub-directory whose name matches is not descended into.

    Any of the three optional arguments may be None or empty, which
    excludes nothing.

    Yields:
        The joined path of each file that survives the exclusions.
    """
    # Less namespace look-up.
    join = os.path.join
    normpath = os.path.normpath
    normcase = os.path.normcase

    def make_exclude_regex_from(wildcards):
        """Return one compiled regex matching any wildcard, or None."""
        # Normalize the wildcards exactly like the names they are matched
        # against (as fnmatch.fnmatch does), so that '*.DLL' still matches
        # on case-insensitive platforms.
        translated = [fnmatch.translate(normcase(w)) for w in wildcards or ()]
        if not translated:
            return None
        return re.compile("({})".format(")|(".join(translated)))

    exclude_files = make_exclude_regex_from(exclude_files)
    exclude_dirs_wc = make_exclude_regex_from(exclude_dirs_wc)
    # Private set, so pruning it below never touches the caller's object.
    pending_dirs = {normcase(normpath(d)) for d in exclude_dirs or ()}

    for current, dirs, files in os.walk(root):
        if pending_dirs:
            current_dir = normpath(normcase(current))
            if current_dir in pending_dirs:
                # Prune set of dirs to exclude.
                pending_dirs.discard(current_dir)
                # Disregard sub-directories.
                dirs[:] = []  # IN PLACE, since os.walk reads this list.
                continue
        if exclude_dirs_wc is not None:
            # IN PLACE, so os.walk does not descend into the matches.
            dirs[:] = [d for d in dirs
                       if not exclude_dirs_wc.match(normcase(d))]
        for f in files:
            if exclude_files is None or not exclude_files.match(normcase(f)):
                yield join(current, f)
Upvotes: 0
Reputation: 1124100
When filtering out directories, you are not preventing os.walk()
from going into subdirectories.
You'll need to clear the dirs
list for this to happen:
def findit(root, exclude_files=None, exclude_dirs=None):
    """Yield the path of every file under root that is not excluded.

    Args:
        root: Directory at which the walk starts.
        exclude_files: Iterable of shell-style wildcards (fnmatch syntax);
            a file whose name matches any of them is skipped. None or an
            empty iterable excludes nothing.
        exclude_dirs: Iterable of directory paths, spelled the way os.walk
            reports them for this root; each listed directory is skipped
            together with everything below it. None or an empty iterable
            excludes nothing.

    Yields:
        os.path.join(directory, filename) for each remaining file.
    """
    # Normalize the wildcards the same way as the names they are matched
    # against (fnmatch.fnmatch does this too), so '*.DLL' still matches on
    # case-insensitive platforms.
    patterns = [fnmatch.translate(os.path.normcase(p))
                for p in exclude_files or ()]
    # With no wildcards the joined regex would be '()', which matches every
    # name and would filter out all files, so skip the filter entirely.
    if patterns:
        is_excluded = re.compile('(' + ')|('.join(patterns) + ')').match
    else:
        is_excluded = None
    skipped_dirs = {os.path.normcase(os.path.normpath(d))
                    for d in exclude_dirs or ()}

    for current, dirs, files in os.walk(root):
        if os.path.normpath(os.path.normcase(current)) in skipped_dirs:
            # exclude this dir and subdirectories
            dirs[:] = []
            continue
        for f in files:
            if is_excluded is None or not is_excluded(os.path.normcase(f)):
                yield os.path.join(current, f)
The dirs[:] = []
assignment clears the list in place; it removes all dirnames from the list. As this list is shared with os.walk()
and the latter uses this list to subsequently visit sub-directories, this effectively stops os.walk()
from visiting those subdirectories.
Upvotes: 4