Python - how to optimize iterator in file parsing

Question

I get files that have NTFS audit permissions and I'm using Python to parse them. The raw CSV files list the path and then which groups have which access, such as this type of pattern:

E:\DIR A, CREATOR OWNER FullControl
E:\DIR A, Sales FullControl
E:\DIR A, HR Full Control
E:\DIR A\SUBDIR, Sales FullControl
E:\DIR A\SUBDIR, HR FullControl

My code parses the file to output this:

File Access for: E:\DIR A
CREATOR OWNER,FullControl
Sales,FullControl
HR,FullControl

File Access For: E:\DIR A\SUBDIR
Sales,FullControl
HR,FullControl

I'm new to generators but I'd like to use them to optimize my code. Nothing I've tried seems to work, so here is the original code (I know it's ugly). It works but it's very slow. The only way I can do this is by parsing out the paths first, putting them in a list, making a set so that they're unique, then iterating over that list, matching each path against the paths in the second list, and listing all of the items found. Like I said, it's ugly but works.

import os, codecs, sys
reload(sys)
sys.setdefaultencoding('utf8')  # to prevent cp-932 errors on screen

file = "aud.csv"
outfile = "access-2.csv"


# Read the audit export once, collecting the folder of every row in
# ``filelist`` and a "folder,user-and-access" string in ``accesslist``.
filelist = []
accesslist = []
with codecs.open(file,"r",'utf-8-sig') as infile:
    for line in infile:
        newline = line.split(',')
        # Drop the double quotes around each field.
        folder = newline[0].replace("\"","")
        # The last field keeps the line's trailing newline, so entries in
        # ``accesslist`` are already newline-terminated.
        user = newline[1].replace("\"","")
        filelist.append(folder)
        accesslist.append(folder+","+user)

# Unique folder paths, in sorted order.
newfl = sorted(set(filelist))

def makeFile():
    """Append a per-folder access listing to ``outfile``.

    For every folder in ``newfl`` (from index 1 on) a
    "File access for: <folder>" heading is written, followed by each
    entry of ``accesslist`` whose folder part equals that folder.
    """
    print("Starting, please wait")
    # Open the output once: every codecs 'utf-8-sig' writer emits a BOM on
    # its first write, so reopening per folder scatters BOMs through the file.
    with codecs.open(outfile,"a",'utf-8-sig') as output:
        # NOTE(review): starts at index 1, so newfl[0] is never written --
        # presumably to skip a header row; confirm against the input file.
        for i in range(1,len(newfl)):
            searchItem = str(newfl[i])
            outtext = ("\nFile access for: "+ searchItem + "\n")
            output.write(outtext)
            # Full scan of accesslist for each folder (the slow part).
            for item in accesslist:
                searchBreak = item.split(",")
                searchTarg = searchBreak[0]
                if searchItem == searchTarg:
                    # Remove the domain prefix from the account name.
                    searchBreaknew = searchBreak[1].replace("FSA-INC01S\\","")
                    searchBreaknew = str(searchBreaknew)
                    # Turn "user access" into "user,access", then restore
                    # the one account name that itself contains a space.
                    searchBreaknew = searchBreaknew.replace(" ",",")
                    searchBreaknew = searchBreaknew.replace("CREATOR,OWNER","CREATOR OWNER")
                    # No "\n" added: the entry still carries the newline
                    # of the input line it was read from.
                    output.write(searchBreaknew)

How should I optimize this?

EDIT:

Here is an edited version. It works MUCH faster, though I'm sure it can still be fixed:

import os, codecs, sys, csv
reload(sys)
sys.setdefaultencoding('utf8')

file = "aud.csv"
outfile = "access-3.csv"


# Parse the audit export with the csv module, collecting the folder of
# every row in ``filelist`` and a "folder,user-and-access" string in
# ``accesslist``.
filelist = []
accesslist = []
with codecs.open(file,"r",'utf-8-sig') as csvinfile:
    auditfile = csv.reader(csvinfile, delimiter=",")
    for line in auditfile:
        folder = line[0]
        # Remove the domain prefix from the account name.
        user = line[1].replace("FSA-INC01S\\","")
        filelist.append(folder)
        accesslist.append(folder+","+user)

# Unique folder paths, in sorted order.
newfl = sorted(set(filelist))

def makeFile():
    """Append a per-folder access listing to ``outfile``.

    For every folder in ``newfl`` (from index 1 on) a
    "File access for: <folder>" heading is written, followed by one line
    per entry of ``accesslist`` whose folder part equals that folder.
    """
    print("Starting, please wait")
    # Open the output once: every codecs 'utf-8-sig' writer emits a BOM on
    # its first write, so reopening per folder scatters BOMs through the file.
    with codecs.open(outfile,"a",'utf-8-sig') as output:
        # NOTE(review): starts at index 1, so newfl[0] is never written --
        # presumably to skip a header row; confirm against the input file.
        for i in xrange(1,len(newfl)):
            searchItem = str(newfl[i])
            outtext = ("\nFile access for: "+ searchItem + "\n")
            accessUserlines = []
            # Full scan of accesslist for each folder.
            for item in accesslist:
                searchBreak = item.split(",")
                if searchItem == searchBreak[0]:
                    # Turn "user access" into "user,access", then undo the
                    # comma inserted between "CREATOR" and "OWNER".
                    # NOTE(review): "R,O" would also match other names
                    # ending in R followed by a word starting with O.
                    searchBreaknew = str(searchBreak[1]).replace(" ",",")
                    searchBreaknew = searchBreaknew.replace("R,O","R O")
                    accessUserlines.append(searchBreaknew+"\n")
            output.write(outtext)
            output.write("".join(accessUserlines))

stovfl · Accepted Answer

I was misled by the .csv file extension you used.
Your expected output isn't compatible with CSV, because a newline (`\n`) is not possible inside a record.
Proposal using a generator returning record by record:

class Audit(object):
    """Group access entries by folder and yield one text record per folder.

    ``fieldnames`` is a two-item sequence: the row key holding the folder
    path, then the row key holding the "account access" string.
    """

    def __init__(self, fieldnames):
        self.fieldnames = fieldnames
        # Maps folder path -> list of [account, access] lists.
        self.__access = {}

    def append(self, row):
        """Parse one row (a mapping keyed by ``fieldnames``) and store it."""
        folder = row[self.fieldnames[0]]
        access = row[self.fieldnames[1]].strip(' ')
        # Remove the domain prefix from the account name.
        access = access.replace("FSA-INC01S\\", "")
        access = access.split(' ')
        # Three tokens mean one of the two fields contained a space;
        # glue it back together so exactly two fields remain.
        if len(access) == 3:
            if access[0] == 'CREATOR':
                access[0] += ' ' + access[1]
                del access[1]
            elif access[1] == 'Full':
                access[1] += ' ' + access[2]
                del access[2]

        self.__access.setdefault(folder, []).append(access)

    # Generator for class Audit
    def __iter__(self):
        """Yield, per folder in sorted order, a text record.

        A record is the folder path on its own line, one comma-joined
        line per stored access entry, and a trailing blank line.
        """
        for folder in sorted(self.__access):
            lines = [folder]
            lines.extend(','.join(access) for access in self.__access[folder])
            yield '\n'.join(lines) + '\n\n'

How to use it:

def main():
    """Read the audit CSV named by ``file`` and write grouped records to ``outfile``."""
    import csv
    import io
    audit = Audit(['Folder', 'Accesslist'])

    # The csv module expects its input file to be opened with newline=''
    # so that it can handle line endings itself.
    with io.open(file, "r", newline='', encoding='utf-8') as csv_in:
        # DictReader takes the column names from the first row.
        # NOTE(review): this assumes the file starts with a header row
        # naming the columns 'Folder' and 'Accesslist' -- confirm.
        for row in csv.DictReader(csv_in, delimiter=","):
            audit.append(row)

    # newline='' writes the '\n' characters of each record unchanged.
    with io.open(outfile, 'w', newline='', encoding='utf-8') as txt_out:
        for record in audit:
            txt_out.write(record)

Tested with Python:3.4.2 - csv:1.0

Python - how to optimize iterator in file parsing

Answers (1)

Related Questions