Reputation: 753
I have two zip files. I want to check whether everything in them (the file names and each file's contents) is the same.
There is a similar question, but its answer does not support zip files.
Does anyone have a good idea?
Upvotes: 3
Views: 3820
Reputation: 4418
Here's my stab at it. It may be sufficient to just make sure the ZipFiles contain the same items and that the items have matching CRC32s. (What is the chance that two ZipFiles being compared have files with the same name and same CRC32 but are different files?) If that is good enough, omit the loop that compares the file contents.
from zipfile import ZipFile
BUFSIZE = 1024


def are_equivalent(filename1, filename2, *, bufsize=None):
    """Return True if two zip archives would expand into the same files.

    The archives are compared without extracting anything to disk: first the
    sets of member names, then each member's recorded size and CRC-32, and
    finally the decompressed bytes of every member, read in chunks.

    Args:
        filename1: Path (or file-like object) of the first archive.
        filename2: Path (or file-like object) of the second archive.
        bufsize: Number of bytes to read per chunk when comparing member
            contents. Defaults to the module-level ``BUFSIZE``.

    Returns:
        True when both archives hold the same member names with identical
        contents, False otherwise.

    Raises:
        ValueError: If ``bufsize`` is not positive.
    """
    chunk_size = BUFSIZE if bufsize is None else bufsize
    if chunk_size <= 0:
        # read(0) always returns b'' and would make every pair look equal.
        raise ValueError("bufsize must be a positive number of bytes")

    with ZipFile(filename1, 'r') as zip1, ZipFile(filename2, 'r') as zip2:
        # Index items in the ZipFiles by filename. For duplicate filenames, a
        # later item in the ZipFile will overwrite an earlier item; just like
        # a later file will overwrite an earlier file with the same name when
        # extracting.
        zipinfo1 = {info.filename: info for info in zip1.infolist()}
        zipinfo2 = {info.filename: info for info in zip2.infolist()}

        # Cheap check first: do the archives contain the same names?
        if zipinfo1.keys() != zipinfo2.keys():
            return False

        # Still cheap: compare the recorded uncompressed size and the 32-bit
        # CRC of the uncompressed data for every member.
        for name, info1 in zipinfo1.items():
            info2 = zipinfo2[name]
            if info1.file_size != info2.file_size or info1.CRC != info2.CRC:
                return False

        # Names, sizes and CRCs match; confirm by comparing the actual bytes.
        # 'ZipFile.open()' returns a file object whose 'read()' accepts a
        # maximum byte count, so large members are never loaded whole.
        for name, info1 in zipinfo1.items():
            with zip1.open(info1) as file1, \
                    zip2.open(zipinfo2[name]) as file2:
                while True:
                    buffer1 = file1.read(chunk_size)
                    buffer2 = file2.read(chunk_size)
                    if buffer1 != buffer2:
                        return False
                    if not buffer1:
                        # Both streams hit end-of-file at the same point.
                        break

    return True
Upvotes: 3
Reputation: 106
I tried using Python's built-in zipfile module.
from zipfile import ZipFile
def _zip_fingerprint(path):
    """Map each member name in the archive at *path* to (file_size, CRC)."""
    with ZipFile(path, 'r') as archive:
        return {
            info.filename: (info.file_size, info.CRC)
            for info in archive.infolist()
        }


def compare(file1, file2):
    """Report whether two zip archives hold the same files.

    Two archives match when they contain the same member names and every
    member has the same uncompressed size and CRC-32. Member order and
    compression settings are ignored. The check relies on the metadata
    recorded in each archive; member bytes are not read.

    Args:
        file1: Path of the first archive.
        file2: Path of the second archive.

    Returns:
        True or False for the comparison, or an explanatory message string
        when either archive does not exist.

    NOTE: the message string is truthy, so callers must test the result with
    ``is True`` rather than relying on plain truthiness.
    """
    try:
        f1 = _zip_fingerprint(file1)
        f2 = _zip_fingerprint(file2)
    except FileNotFoundError:
        return f"Either file at {file1} or {file2} does not exist"
    return f1 == f2
# Example run: compare two archives at fixed relative paths and print the
# outcome, which is either the boolean result or the "does not exist" message.
f = compare(
    file1='storage/1.zip',
    file2='storage/2.zip',
)
print(f)
I don't know whether this is the correct approach (if it is not, please correct me).
Upvotes: 0
Reputation: 753
It seems zip archives will have different hashes even if you zip identical items. We can divide the task into two parts: first unzip the archives, then compare the extracted folders.
import os
import filecmp
import zipfile
def are_dir_trees_equal(dir1, dir2):
    """Return True if two directory trees have the same names and contents.

    The trees are equal when every level holds the same entry names, each
    shared name is the same kind of entry on both sides, and every pair of
    files is byte-for-byte identical.

    Args:
        dir1: Path of the first directory.
        dir2: Path of the second directory.

    Returns:
        True if the trees match, False on any difference or if an entry
        could not be compared.
    """
    # ignore=[] turns off dircmp's default ignore list, so no entry name is
    # silently skipped during the comparison.
    dirs_cmp = filecmp.dircmp(dir1, dir2, ignore=[])

    # Entries on one side only, names whose type differs between the sides
    # (e.g. file vs. directory), or files that could not be compared.
    if (dirs_cmp.left_only or dirs_cmp.right_only
            or dirs_cmp.common_funny or dirs_cmp.funny_files):
        return False

    # shallow=False forces a content comparison instead of trusting os.stat().
    _, mismatch, errors = filecmp.cmpfiles(
        dir1, dir2, dirs_cmp.common_files, shallow=False)
    if mismatch or errors:
        return False

    # Recurse into every subdirectory present on both sides.
    return all(
        are_dir_trees_equal(os.path.join(dir1, common_dir),
                            os.path.join(dir2, common_dir))
        for common_dir in dirs_cmp.common_dirs
    )
BASE_PATH = '/Users/Documents/'
model1 = os.path.join(BASE_PATH, 'test1.zip')
model2 = os.path.join(BASE_PATH, 'test2.zip')

# Step 1: unpack both archives next to each other under BASE_PATH.
with zipfile.ZipFile(model1, "r") as zip_ref:
    zip_ref.extractall(BASE_PATH)
with zipfile.ZipFile(model2, "r") as zip_ref:
    zip_ref.extractall(BASE_PATH)

# Step 2: compare the extracted folders.
# NOTE(review): assumes each archive unpacks into a single top-level folder
# named after the archive (test1/, test2/) - confirm for your archives.
# splitext() strips only the final ".zip"; split('.')[0] would cut the path
# at the first dot, including one inside a directory name.
folder1 = os.path.splitext(model1)[0]
folder2 = os.path.splitext(model2)[0]
is_equal = are_dir_trees_equal(folder1, folder2)
print(is_equal)
Upvotes: 0