user3832061

Reputation: 527

How can I calculate a hash for a filesystem-directory using Python?

I'm using this code to calculate the hash value for a file:

import hashlib

m = hashlib.md5()
with open("calculator.pdf", 'rb') as fh:
    while True:
        data = fh.read(8192)
        if not data:
            break
        m.update(data)
    hash_value = m.hexdigest()

    print hash_value

When I tried it on a folder "folder", I got:

IOError: [Errno 13] Permission denied: folder

How could I calculate the hash value for a folder?

Upvotes: 33

Views: 44586

Answers (9)

winderland

Reputation: 518

Here is a method where you can select what file types to hash within a path recursively:

import pathlib
import hashlib

def getHashForFileTypesInPath(path, listFileTypes):
    print(f"Generating hash for files matching {listFileTypes} in path {path}")
    # Get list of all paths that match the given types in path, recursively
    matchedPaths = []
    for f in listFileTypes:
        matchedPaths.extend(list(path.rglob(f)))
    matchedPaths.sort()
    
    # Get hash of each matched file. 
    # Keep updating hash object with each file's hash to ultimately get a
    #     single combined hash for all files in path
    # If any file's hash changes, overall combined hash will change.
    m = hashlib.sha1()
    for p in matchedPaths:
        with open(p, 'rb') as f:
            # hashlib.file_digest() requires Python 3.11+
            digest = hashlib.file_digest(f, 'sha1')
        m.update(digest.digest())
    hashSha1 = m.hexdigest()
    return hashSha1


path = pathlib.Path(r"C:\temp\temp")
# listFileTypes = ['*.jpg', '*.pdf'] # Can provide a list of types
listFileTypes = ['*.*'] # Can hash every type (note: '*.*' only matches names that contain a dot)
hashSha1 = getHashForFileTypesInPath(path, listFileTypes)
print(f"hash: {hashSha1}")

Output:

Generating hash for files matching ['*.*'] in path C:\temp\temp
hash: 621714d8751c2f7471cd81cc664c98299ba50ec2

Upvotes: 1

Hongbo Miao

Reputation: 49694

Thank you so much, @danmou, for your answer! I made two more improvements to speed it up further:

  • Update byte_block to 1 MiB (a further increase gave no additional speed-up)
  • Change MD5 to XXH128, which is about 50 times faster than MD5

from pathlib import Path

import xxhash


def calculate_file_xxh128(
    file_path: Path, xxh128_hash: xxhash.xxh3_128
) -> xxhash.xxh3_128:
    with open(file_path, "rb") as file:
        # Read 1 MiB at a time to use less memory
        while byte_block := file.read(1024 * 1024):
            xxh128_hash.update(byte_block)
    return xxh128_hash

def calculate_dir_xxh128(
    dir_path: Path, xxh128_hash: xxhash.xxh3_128
) -> xxhash.xxh3_128:
    for path in sorted(Path(dir_path).iterdir(), key=lambda p: str(p).lower()):
        xxh128_hash.update(path.name.encode())
        if path.is_file():
            xxh128_hash = calculate_file_xxh128(path, xxh128_hash)
        elif path.is_dir():
            xxh128_hash = calculate_dir_xxh128(path, xxh128_hash)
    return xxh128_hash

def get_file_xxh128(file_path: Path) -> str:
    return calculate_file_xxh128(file_path, xxhash.xxh128()).hexdigest()

def get_dir_xxh128(dir_path: Path) -> str:
    return calculate_dir_xxh128(dir_path, xxhash.xxh128()).hexdigest()

Usage

file_xxh128 = get_file_xxh128(path_to_file)
dir_xxh128 = get_dir_xxh128(path_to_dir)

The reason for switching from MD5 to XXH128 is based on this benchmark. I compared XXH128 and MD5 speed locally and got the same result. XXH128 is slightly slower than XXH3, but it is more future-proof, with an even lower chance of collisions.

Hash Name             Width  Bandwidth (GB/s)  Small Data Velocity  Quality  Comment
XXH3 (SSE2)           64     31.5 GB/s         133.1                10
XXH128 (SSE2)         128    29.6 GB/s         118.1                10
RAM sequential read   N/A    28.0 GB/s         N/A                  N/A      for reference
City64                64     22.0 GB/s         76.6                 10
T1ha2                 64     22.0 GB/s         99.0                 9        Slightly worse collisions
City128               128    21.7 GB/s         57.7                 10
XXH64                 64     19.4 GB/s         71.0                 10
SpookyHash            64     19.3 GB/s         53.2                 10
Mum                   64     18.0 GB/s         67.0                 9        Slightly worse collisions
XXH32                 32     9.7 GB/s          71.9                 10
City32                32     9.1 GB/s          66.0                 10
Murmur3               32     3.9 GB/s          56.1                 10
SipHash               64     3.0 GB/s          43.2                 10
FNV64                 64     1.2 GB/s          62.7                 5        Poor avalanche properties
Blake2                256    1.1 GB/s          5.1                  10       Cryptographic
SHA1                  160    0.8 GB/s          5.6                  10       Cryptographic but broken
MD5                   128    0.6 GB/s          7.8                  10       Cryptographic but broken
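
For a quick sanity check, here is a minimal sketch (not from the original answer) of how one might compare MD5 and XXH128 throughput locally; the 64 MiB zero-filled buffer is an arbitrary choice for illustration, and the xxhash package must be installed:

import hashlib
import time

import xxhash

data = bytes(64 * 1024 * 1024)  # 64 MiB of zero bytes as throwaway test input

for name, factory in [("md5", hashlib.md5), ("xxh128", xxhash.xxh128)]:
    start = time.perf_counter()
    factory(data).hexdigest()
    elapsed = time.perf_counter() - start
    print(f"{name}: {len(data) / elapsed / 1e9:.2f} GB/s")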

Upvotes: 1

danmou

Reputation: 371

Here is an implementation that uses pathlib.Path instead of relying on os.walk. It sorts the directory contents before iterating so it should be repeatable on multiple platforms. It also updates the hash with the names of files/directories, so adding empty files and directories will change the hash.

Version with type annotations (Python 3.6 or above):

import hashlib
from _hashlib import HASH as Hash
from pathlib import Path
from typing import Union


def md5_update_from_file(filename: Union[str, Path], hash: Hash) -> Hash:
    assert Path(filename).is_file()
    with open(str(filename), "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash.update(chunk)
    return hash


def md5_file(filename: Union[str, Path]) -> str:
    return str(md5_update_from_file(filename, hashlib.md5()).hexdigest())


def md5_update_from_dir(directory: Union[str, Path], hash: Hash) -> Hash:
    assert Path(directory).is_dir()
    for path in sorted(Path(directory).iterdir(), key=lambda p: str(p).lower()):
        hash.update(path.name.encode())
        if path.is_file():
            hash = md5_update_from_file(path, hash)
        elif path.is_dir():
            hash = md5_update_from_dir(path, hash)
    return hash


def md5_dir(directory: Union[str, Path]) -> str:
    return str(md5_update_from_dir(directory, hashlib.md5()).hexdigest())

Without type annotations:

import hashlib
from pathlib import Path


def md5_update_from_file(filename, hash):
    assert Path(filename).is_file()
    with open(str(filename), "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash.update(chunk)
    return hash


def md5_file(filename):
    return md5_update_from_file(filename, hashlib.md5()).hexdigest()


def md5_update_from_dir(directory, hash):
    assert Path(directory).is_dir()
    for path in sorted(Path(directory).iterdir(), key=lambda p: str(p).lower()):
        hash.update(path.name.encode())
        if path.is_file():
            hash = md5_update_from_file(path, hash)
        elif path.is_dir():
            hash = md5_update_from_dir(path, hash)
    return hash


def md5_dir(directory):
    return md5_update_from_dir(directory, hashlib.md5()).hexdigest()

Condensed version if you only need to hash directories:

def md5_update_from_dir(directory, hash):
    assert Path(directory).is_dir()
    for path in sorted(Path(directory).iterdir(), key=lambda p: str(p).lower()):
        hash.update(path.name.encode())
        if path.is_file():
            with open(path, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash.update(chunk)
        elif path.is_dir():
            hash = md5_update_from_dir(path, hash)
    return hash


def md5_dir(directory):
    return md5_update_from_dir(directory, hashlib.md5()).hexdigest()

Usage: md5_hash = md5_dir("/some/directory")

Upvotes: 25

Mangu Singh Rajpurohit

Reputation: 11410

Use the checksumdir Python package to calculate the checksum/hash of a directory. It's available at https://pypi.python.org/pypi/checksumdir

Usage:

import checksumdir
directory_hash = checksumdir.dirhash("c:\\temp")
print(directory_hash)

Upvotes: 23

Eyal

Reputation: 41

Use the checksumdir package: https://pypi.org/project/checksumdir/

from checksumdir import dirhash

directory = '/path/to/directory/'
md5hash = dirhash(directory, 'md5')

Upvotes: 2

Joe Flack

Reputation: 974

I have built further on Andy's answer.

The following is a Python 3 rather than Python 2 implementation. It uses SHA1, handles some cases where encoding is needed, is linted, and includes some docstrings.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""dir_hash: Return SHA1 hash of a directory.
- Copyright (c) 2009 Stephen Akiki, 2018 Joe Flack
- MIT License (http://www.opensource.org/licenses/mit-license.php)
- http://akiscode.com/articles/sha-1directoryhash.shtml
"""
import hashlib
import os


def update_hash(running_hash, filepath, encoding=''):
    """Update running SHA1 hash, factoring in hash of given file.

    Side Effects:
        running_hash.update()
    """
    if encoding:
        file = open(filepath, 'r', encoding=encoding)
        for line in file:
            hashed_line = hashlib.sha1(line.encode(encoding))
            hex_digest = hashed_line.hexdigest().encode(encoding)
            running_hash.update(hex_digest)
        file.close()
    else:
        file = open(filepath, 'rb')
        while True:
            # Read file in as little chunks.
            buffer = file.read(4096)
            if not buffer:
                break
            # hexdigest() returns str, so encode it before updating the hash
            running_hash.update(hashlib.sha1(buffer).hexdigest().encode())
        file.close()


def dir_hash(directory, verbose=False):
    """Return SHA1 hash of a directory.

    Args:
        directory (string): Path to a directory.
        verbose (bool): If True, prints progress updates.

    Raises:
        FileNotFoundError: If directory provided does not exist.

    Returns:
        string: SHA1 hash hexdigest of a directory.
    """
    sha_hash = hashlib.sha1()

    if not os.path.exists(directory):
        raise FileNotFoundError

    for root, dirs, files in os.walk(directory):
        for names in files:
            if verbose:
                print('Hashing', names)
            filepath = os.path.join(root, names)
            try:
                update_hash(running_hash=sha_hash,
                            filepath=filepath)
            except TypeError:
                update_hash(running_hash=sha_hash,
                            filepath=filepath,
                            encoding='utf-8')

    return sha_hash.hexdigest()

Upvotes: 1

Bryson Tyrrell

Reputation: 425

I'm not a fan of how the recipe referenced in the answer was written. I have a much simpler version that I'm using:

import hashlib
import os


def hash_directory(path):
    digest = hashlib.sha1()

    for root, dirs, files in os.walk(path):
        for names in files:
            file_path = os.path.join(root, names)

            # Hash the path and add to the digest to account for empty files/directories
            digest.update(hashlib.sha1(file_path[len(path):].encode()).digest())

            # Per @pt12lol - if the goal is uniqueness over repeatability, this is an alternative method using 'hash'
            # digest.update(str(hash(file_path[len(path):])).encode())

            if os.path.isfile(file_path):
                with open(file_path, 'rb') as f_obj:
                    while True:
                        buf = f_obj.read(1024 * 1024)
                        if not buf:
                            break
                        digest.update(buf)

    return digest.hexdigest()

I found exceptions were usually being thrown whenever something like an alias was encountered (it shows up in os.walk(), but you can't directly open it). The os.path.isfile() check takes care of those issues.
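
For example, a dangling symlink is one such entry: os.walk() lists it, but it can't be opened, and os.path.isfile() follows the link and reports False, so the check above skips it. A minimal sketch (hypothetical file names, Unix-style symlink):

import os

# Create a symlink whose target does not exist (hypothetical names for illustration).
os.symlink('/nonexistent/target', 'dangling_link')

print(os.path.islink('dangling_link'))   # True  - the directory entry exists
print(os.path.isfile('dangling_link'))   # False - the target can't be opened

os.remove('dangling_link')               # clean up the example link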

If there is an actual file within a directory I'm attempting to hash and it can't be opened, skipping that file and continuing is not a good solution, since that affects the outcome of the hash. Better to abort the hash attempt altogether. Here, the try statement is wrapped around the call to my hash_directory() function:

>>> try:
...   print(hash_directory('/tmp'))
... except:
...   print('Failed!')
... 
e2a075b113239c8a25c7e1e43f21e8f2f6762094
>>> 

Upvotes: 7

Andy

Reputation: 50540

This Recipe provides a nice function to do what you are asking. I've modified it to use the MD5 hash instead of SHA1, as your original question asks.

def GetHashofDirs(directory, verbose=0):
  import hashlib, os
  SHAhash = hashlib.md5()
  if not os.path.exists (directory):
    return -1

  try:
    for root, dirs, files in os.walk(directory):
      for names in files:
        if verbose == 1:
          print 'Hashing', names
        filepath = os.path.join(root,names)
        try:
          f1 = open(filepath, 'rb')
        except:
          # You can't open the file for some reason; skip it
          # (note: f1 was never opened here, so there is nothing to close)
          continue

        while 1:
          # Read file in as little chunks
          buf = f1.read(4096)
          if not buf : break
          SHAhash.update(hashlib.md5(buf).hexdigest())
        f1.close()

  except:
    import traceback
    # Print the stack traceback
    traceback.print_exc()
    return -2

  return SHAhash.hexdigest()

You can use it like this:

print GetHashofDirs('folder_to_hash', 1)

The output looks like this, as it hashes each file:

...
Hashing file1.cache
Hashing text.txt
Hashing library.dll
Hashing vsfile.pdb
Hashing prog.cs
5be45c5a67810b53146eaddcae08a809

The returned value from this function call comes back as the hash: in this case, 5be45c5a67810b53146eaddcae08a809.

Upvotes: 10

omichael

Reputation: 31

I keep seeing this code propagated through various forums.

The ActiveState recipe answer works but, as Antonio pointed out, it is not guaranteed to be repeatable across filesystems, because they may not present the files in the same order (try it). One fix is to change

for root, dirs, files in os.walk(directory):
  for names in files:

to

for root, dirs, files in os.walk(directory):
  for names in sorted(files): 

(Yes, I'm being lazy here. This sorts only the filenames and not the directories, but the same principle applies; see the sketch below.)
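
For completeness, here is a minimal sketch (not part of the original recipe) that applies the same principle to directories as well: sorting dirs in place makes os.walk() descend into subdirectories in a deterministic order, and sorted(files) fixes the file order within each directory:

import hashlib
import os


def md5_dir_sorted(directory):
    # Hypothetical helper combining both fixes for a repeatable directory hash.
    md5 = hashlib.md5()
    for root, dirs, files in os.walk(directory):
        dirs.sort()                 # deterministic descent into subdirectories
        for name in sorted(files):  # deterministic file order within each directory
            path = os.path.join(root, name)
            with open(path, 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b''):
                    md5.update(chunk)
    return md5.hexdigest()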

Upvotes: 3
