Quick Silver
Quick Silver

Reputation: 433

Finding md5 of files recursively in directory in python

I want to find the md5sum of files starting with "10" ( could be exe, doc, pdf etc) hence not checking the file extension but only the start two digits. So far I've a script to traverse through the directory and print out all such files but couldn't get the checksum to be printed for each of them:

def print_files(file_directory, file_extensions=['10']):                          
''' Print files in file_directory with extensions in file_extensions, recursively. '''

# Get the absolute path of the file_directory parameter                               
  file_directory = os.path.abspath(file_directory)                                      

# Get a list of files in file_directory                                               
  file_directory_files = os.listdir(file_directory)                                     

# Traverse through all files                                                          
  for filename in file_directory_files:                                                 
    filepath = os.path.join(file_directory, filename)                                 

    # Check if it's a normal file or directory                                        
    if os.path.isfile(filepath):                                                      

        # Check if the file has an extension of typical video files
        for file_extension in file_extensions:                     
            # Not a reqd file, ignore                              
            #if not filepath.endswith(file_extension):             
            if not filename.startswith(file_extension) or len(filename) != 19:
                continue                                                      

            # We have got a '10' file!                  
            print_files.counter += 1                                          


            ## TRYING TO READ AND PRINT MD5 USING HASHLIB/ DOESNT WORK###
            hasher = hashlib.md5()                                            
            with open(filename, 'rb') as afile:                               
               buf = afile.read(65536)                                        
               while len(buf) > 0:                                            
                   hasher.update(buf)                                         
                   buf = afile.read(65536)                                    


            # Print it's name                                                 
            print('{0}'.format(filepath))                                     
            print hasher('{0}.format(filepath)').hexdigest() 
            print '\n'                                       
    elif os.path.isdir(filepath):                            
        # We got a directory, enter into it for further processing
        print_files(filepath)   
if __name__ == '__main__':                                                                

 # Directory argument supplied             
  if len(sys.argv) == 2:                                                        
    if os.path.isdir(sys.argv[1]):                                            
        file_directory = sys.argv[1]                                          
    else:                                                                     
        print('ERROR: "{0}" is not a directory.'.format(sys.argv[1]))         
        exit(1)                                                               
else:                                                                         
    # Set file directory to CWD                
    file_directory = os.getcwd()                                              

print('\n -- Looking for Required Files in "{0}" --   \n'.format(file_directory))

# Set the number of processed files equal to zero                             
print_files.counter = 0                                                       

# Start Processing                                                            
print_files(file_directory)                                                   

# We are done. Exit now.   

'

Upvotes: 2

Views: 6139

Answers (3)

Quick Silver
Quick Silver

Reputation: 433

Got it fixed with this line

print hashlib.md5(open('{0}'.format(filepath)).read()).hexdigest()

I wasnt reading the file but just passing hashlib.md5. Thanks Matt for the insight.

Upvotes: 0

Matt Davidson
Matt Davidson

Reputation: 738

I'd recommend that you do not solve this recursively, but instead make use of os.walk() to traverse the directory structure. The following code could be the body of your print_files function.

file_directory = os.path.abspath(file_directory)
paths_to_hash = []

for root, dirs, filenames in os.walk(file_directory, topdown=False):
    for i, dir in enumerate(dirs):
        for filename in filenames[i]:
            if filenames[:2] == '10':
                paths_to_hash += [os.path.abspath('{0}/{1}/{2}'.format(root, dir, filename)]

for path in paths_to_hash:
    hash = hashlib.md5(open(path, 'rb').read()).digest())
    print 'hash: {0} for path: {1}'.format(hash, path)

Upvotes: 3

CristiFati
CristiFati

Reputation: 41116

The line printing the hasher should be:

print('{0}'.format(hasher.hexdigest()))

Upvotes: 0

Related Questions