user861555
user861555

Reputation:

Python OS.WALK Remove Directories

I'm trying to remove directories from os.walk (I don't need the files from those dirs)

My code:

def findit(root, exclude_files=[], exclude_dirs=[]):
    """Return a lazy iterator over file paths found by walking ``root``.

    ``exclude_files`` holds shell-style wildcards (``fnmatch`` syntax); a
    file whose case-normalised name matches any of them is left out.
    ``exclude_dirs`` holds directory paths; files located directly in a
    walked directory whose normalised path is in that collection are left
    out as well.

    The wildcard regex and the directory set are built when the function
    is called; the directory tree itself is only read as the returned
    iterator is consumed.
    """
    # Combine all wildcards into one alternation: "(pat1)|(pat2)|...".
    name_patterns = [fnmatch.translate(pat) for pat in exclude_files]
    skip_name = re.compile('(' + ')|('.join(name_patterns) + ')').match

    # Normalise the excluded paths once so each lookup is a set membership test.
    skip_dirs = set(
        os.path.normcase(os.path.normpath(path)) for path in exclude_dirs
    )

    tree = os.walk(root)

    def matches():
        for folder, _, names in tree:
            if os.path.normpath(os.path.normcase(folder)) in skip_dirs:
                continue
            for name in names:
                if skip_name(os.path.normcase(name)):
                    continue
                yield os.path.join(folder, name)

    return matches()

Filtering the files works, but when I try to filter out c:/windows, it still shows files from the Windows directories. Am I missing something?

   # Collect every remaining path on drive C: into a list.
   filelist = list(findit(
       'c:/',
       exclude_files=['*.dll', '*.dat', '*.log', '*.exe'],
       exclude_dirs=['c:/windows', 'c:/program files', 'c:/else'],
   ))

Upvotes: 0

Views: 2234

Answers (3)

You can use the keyword "continue" to skip the iteration while traversing using os.walk("pathName")

for dirpath, dirnames, filenames in os.walk(pathName):
    # Supply a regular expression (or a plain string) in `pattern` that
    # identifies the folder to skip; it is searched for in the path of the
    # directory currently being visited.
    hit = re.search(pattern, dirpath)
    # Skip this directory when the pattern matched a non-empty piece of its path.
    if hit is not None and hit.group(0):
        continue

Upvotes: 0

FredrikHedman
FredrikHedman

Reputation: 1253

Reading the reply above made me wonder. It seemed to me that the os.walk call was missing and that the root parameter was not being used as needed. Also, the case where either of the optional arguments is an empty list should work. I suggest a slight variation with fewer namespace look-ups and with wildcard exclusion of directories at each directory level:

import os
import re
import fnmatch
import os.path


def findit(root, exclude_files=[], exclude_dirs=[], exclude_dirs_wc=[]):
    """Generate all files found under root excluding some.

    Excluded files are given as an iterable of Unix shell-style wildcards
    that exclude matches in each directory.  Excluded directories are
    assumed to be paths starting at root; no wildcards.  Directory
    wildcards at each level can be supplied.

    Any of the three exclusion arguments may be empty, and each may be
    any iterable (list, tuple, set, generator, ...); they are only read,
    never modified.  Being a generator function, nothing is evaluated
    until the first path is requested.

    """
    # Less namespace look-up.
    join = os.path.join
    normpath = os.path.normpath
    normcase = os.path.normcase

    def make_exclude_regex_from(patterns):
        """Return one regex matching any of the wildcards, or None if none."""
        translated = [fnmatch.translate(p) for p in patterns]
        if not translated:
            # No patterns: signal "exclude nothing" instead of compiling
            # an empty alternation, which would match every name.
            return None
        return re.compile("({})".format(")|(".join(translated)))

    exclude_files = make_exclude_regex_from(exclude_files)
    exclude_dirs_wc = make_exclude_regex_from(exclude_dirs_wc)
    # Private, normalised copy: safe to shrink below without touching the
    # caller's collection.
    exclude_dirs = {normcase(normpath(d)) for d in exclude_dirs}

    for current, dirs, files in os.walk(root):
        current_dir = normpath(normcase(current))
        if current_dir in exclude_dirs:
            # Prune set of dirs to exclude.
            exclude_dirs.discard(current_dir)
            # Disregard sub-directories.  Must be done IN PLACE: os.walk
            # reads this very list to decide where to descend next.
            dirs[:] = []
            continue
        if exclude_dirs_wc:
            # IN PLACE for the same reason as above.
            dirs[:] = [d for d in dirs
                       if not exclude_dirs_wc.match(normcase(d))]
        for f in files:
            if exclude_files and exclude_files.match(normcase(f)):
                continue
            yield join(current, f)

Upvotes: 0

Martijn Pieters
Martijn Pieters

Reputation: 1124100

When filtering out directories, you are not preventing os.walk() from going into subdirectories.

You'll need to clear the dirs list for this to happen:

def findit(root, exclude_files=[], exclude_dirs=[]):
    """Yield the path of every file under ``root`` that is not excluded.

    ``exclude_files`` is an iterable of shell-style wildcards (``fnmatch``
    syntax) tested against each case-normalised file name.
    ``exclude_dirs`` is an iterable of directory paths; a listed directory
    is skipped together with everything below it.  Both arguments are only
    read, never modified, and either may be empty.
    """
    file_patterns = [fnmatch.translate(i) for i in exclude_files]
    if file_patterns:
        is_excluded_file = re.compile(
            '(' + ')|('.join(file_patterns) + ')').match
    else:
        # Without patterns the alternation would be "()", which matches
        # every name and would hide all files; exclude nothing instead.
        is_excluded_file = None
    exclude_dirs = set(
        os.path.normcase(os.path.normpath(i)) for i in exclude_dirs)
    for current, dirs, files in os.walk(root):
        if os.path.normpath(os.path.normcase(current)) in exclude_dirs:
            # exclude this dir and subdirectories; the list is cleared in
            # place because os.walk() consults it to pick what to visit next
            dirs[:] = []
            continue
        for f in files:
            if is_excluded_file is None or not is_excluded_file(os.path.normcase(f)):
                yield os.path.join(current, f)

The dirs[:] = [] assignment clears the list in place; it removes all dirnames from the list. As this list is shared with os.walk() and the latter uses this list to subsequently visit sub-directories, this effectively stops os.walk() from visiting those subdirectories.

Upvotes: 4

Related Questions