Reputation: 118
I am having trouble writing a function that compares two zip files (to check whether they are really the same, not only by name). Here is an example of my code:
def validate_zip_files(self):
    """Download a zip over FTP and compare it with a local reference zip.

    Connects to the FTP server at 192.168.0.1:2323, changes into the remote
    ``test`` directory, downloads the remote file ``test`` to ``test.zip`` in
    the current working directory and compares it byte-for-byte with
    ``/home/user/file/text.zip``.

    Returns:
        bool: True when both files have identical content, False otherwise.
    """
    host = '192.168.0.1'
    port = 2323
    username = '123'
    password = '123'
    downloaded_path = 'test.zip'
    reference_path = '/home/user/file/text.zip'

    ftp = FTP()
    ftp.connect(host, port)
    try:
        ftp.login(username, password)
        ftp.cwd('test')
        print(ftp.pwd())
        # Close the local file before comparing so that every downloaded
        # byte is flushed to disk.
        with open(downloaded_path, 'wb') as local_file:
            ftp.retrbinary('RETR test', local_file.write)
    finally:
        # Always release the connection, even if the transfer failed.
        ftp.close()

    # filecmp.cmp expects paths, not file objects.  Never reopen the files
    # with 'wb' here: that mode truncates them to zero bytes.
    # shallow=False forces a content comparison instead of trusting os.stat.
    return filecmp.cmp(downloaded_path, reference_path, shallow=False)
One of the problems is that the second zip is in a different location ('/home/user/file/text.zip'), while I am downloading the first zip file into the directory where my Python script is. I am also not 100% sure that filecmp.cmp works with .zip files.
Any ideas would be great :) Thanks.
Upvotes: 5
Views: 4419
Reputation: 450
If you want performance, you might want to open both files and compare the number of files, their names and the CRCs already embedded in the zip files, using the zipfile module.
This will be significantly faster for large files.
import zipfile
def compare_zip_crc(zipfile1, zipfile2):
    """Tell whether two zip archives list the same members with the same CRCs.

    Only the metadata stored in each archive is inspected; no member is
    decompressed.  Members are matched by position, so the same entries
    stored in a different order count as a difference.

    Args:
        zipfile1: Path (or file object) of the first archive.
        zipfile2: Path (or file object) of the second archive.

    Returns:
        bool: True when the name lists are equal and every pair of members
        has the same CRC, False otherwise.
    """
    with zipfile.ZipFile(zipfile1, 'r') as first:
        with zipfile.ZipFile(zipfile2, 'r') as second:
            # List equality also fails when the member counts differ.
            if first.namelist() != second.namelist():
                return False
            paired_members = zip(first.infolist(), second.infolist())
            return all(left.CRC == right.CRC for left, right in paired_members)
# Example usage
zip_file1 = "/yourpath/file1.zip"
zip_file2 = "/yourpath/file2.zip"
# Pass the two path variables defined just above; the names zipfile1 and
# zipfile2 only exist as parameters inside compare_zip_crc.
if compare_zip_crc(zip_file1, zip_file2):
    print("The ZIPs are identical.")
else:
    print("The ZIPs are different.")
Upvotes: 0
Reputation: 1
For the comparison, I use this in my integration tests:
def assert_zip_files_are_equal(filepath_a, filepath_b):
    """
    Assert that two zip archives hold the same members with the same bytes.

    The sorted member names of both archives must match, and every member
    must decompress to identical content.  An AssertionError is raised on
    the first mismatch; nothing is returned on success.
    """
    with ZipFile(filepath_a, "r") as archive_a, ZipFile(filepath_b, "r") as archive_b:
        names_a = sorted(archive_a.namelist())
        names_b = sorted(archive_b.namelist())
        assert names_a == names_b
        for member in names_a:
            with archive_a.open(member) as stream_a, archive_b.open(member) as stream_b:
                assert stream_a.read() == stream_b.read()
Upvotes: 0
Reputation: 367
See my gist, which compares two zip files by their contents and generates a patch file from one zip to the other. For example, if two zip files share an entry but its content differs, the gist will find it; if they have different entries, the gist will detect that as well. The gist ignores differences in modification time. That said, if you only care about a shallow comparison, then hashlib could be a better choice.
For your reference, code from the gist:
import os
import argparse
import collections
import tempfile
import zipfile
import filecmp
import shutil
import shlex
# Result of comparing two archives by member name: three collections of
# member names, in the order (to_rm, to_cmp, to_add).
ZipCmpResult = collections.namedtuple('ZipCmpResult', 'to_rm to_cmp to_add')
def make_parser():
    """Build the command-line parser for the patch tool.

    Returns:
        argparse.ArgumentParser: Parser accepting ``--oldfile``,
        ``--newfile`` and ``--toname``, each defaulting to a path inside
        the ``share`` directory.
    """
    parser = argparse.ArgumentParser(
        description='Make patch zip file from two similar zip files.')
    # (option flag, default file name inside "share"), in help-output order.
    option_defaults = (
        ('--oldfile', 'old.zip'),
        ('--newfile', 'new.zip'),
        ('--toname', 'patch'),
    )
    for flag, basename in option_defaults:
        parser.add_argument(
            flag,
            default=os.path.join('share', basename),
            help='default: %(default)s')
    return parser
def zipcmp(old, new):
    """Classify the member names of two archives.

    Args:
        old: Path of the original archive.
        new: Path of the updated archive.

    Returns:
        ZipCmpResult: ``to_rm`` holds names found only in ``old``,
        ``to_cmp`` names found in both, ``to_add`` names found only in
        ``new``.  All three are sets.
    """
    def member_names(archive_path):
        # Read the name list and close the archive straight away.
        with zipfile.ZipFile(archive_path) as archive:
            return set(archive.namelist())

    before = member_names(old)
    after = member_names(new)
    return ZipCmpResult(
        to_rm=before - after,
        to_cmp=before & after,
        to_add=after - before)
def compare_files(old, new, cmpresult):
    """Find shared members whose content differs between two archives.

    Every name in ``cmpresult.to_cmp`` is extracted from both archives into
    a temporary directory and the two extracted files are compared byte by
    byte.  Names whose content differs are added to ``cmpresult.to_add``.

    Args:
        old: Path of the original archive.
        new: Path of the updated archive.
        cmpresult: Object with a ``to_cmp`` iterable of member names and a
            mutable ``to_add`` set, which is updated in place.

    Returns:
        None.
    """
    with tempfile.TemporaryDirectory() as tmpdir, \
            zipfile.ZipFile(old) as zinfile_old, \
            zipfile.ZipFile(new) as zinfile_new:
        old_dest = os.path.join(tmpdir, 'old')
        new_dest = os.path.join(tmpdir, 'new')
        os.mkdir(old_dest)
        os.mkdir(new_dest)
        for filename in cmpresult.to_cmp:
            # extract() sanitises member names (leading slashes and ".."
            # components are dropped), so the file may not end up at
            # os.path.join(dest, filename).  Compare the paths it returns.
            old_path = zinfile_old.extract(filename, path=old_dest)
            new_path = zinfile_new.extract(filename, path=new_dest)
            if not filecmp.cmp(old_path, new_path, shallow=False):
                cmpresult.to_add.add(filename)
def mkpatch(new, cmpresult, to_name):
    """Write the patch archive and the removal script.

    Creates ``<to_name>.zip`` containing every member of ``new`` listed in
    ``cmpresult.to_add``, and ``<to_name>.sh``, a shell script with one
    ``rm`` line per name in ``cmpresult.to_rm``.

    Args:
        new: Path of the updated archive the members are copied from.
        cmpresult: Object exposing ``to_add`` and ``to_rm`` iterables of
            member names.
        to_name: Output path without extension.

    Returns:
        None.
    """
    patch_zip = to_name + '.zip'
    removal_script = to_name + '.sh'

    with zipfile.ZipFile(new) as source, \
            zipfile.ZipFile(patch_zip, 'w') as patch:
        for member in cmpresult.to_add:
            with source.open(member) as reader:
                with patch.open(member, 'w') as writer:
                    shutil.copyfileobj(reader, writer)

    with open(removal_script, 'w', encoding='utf-8') as script:
        script.write('#!/bin/sh\n')
        # Names are shell-quoted so spaces and metacharacters stay literal.
        script.writelines(
            'rm {}\n'.format(shlex.quote(member))
            for member in cmpresult.to_rm)
def main():
    """Run the tool: parse arguments, diff the archives, write the patch."""
    options = make_parser().parse_args()
    old_zip, new_zip = options.oldfile, options.newfile
    # Start from the name-level diff, then add shared members whose content
    # differs to the set of members to ship.
    diff = zipcmp(old_zip, new_zip)
    compare_files(old_zip, new_zip, diff)
    mkpatch(new_zip, diff, options.toname)


if __name__ == '__main__':
    main()
Upvotes: 1
Reputation: 1858
Rather than comparing the files directly, I would go ahead and compare hashed values of the files. This eliminates the dependency on filecmp, which might - as you said - not work with zipped files.
import hashlib
def _sha256_digest(path, chunk_size=65536):
    """Return the SHA-256 digest of the file at ``path``, read in chunks."""
    digest = hashlib.sha256()
    # The context manager closes the handle; chunked reads keep memory use
    # bounded for large archives.
    with open(path, 'rb') as stream:
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.digest()


def compare_files(a, b):
    """Tell whether two files have identical content.

    Args:
        a: Path of the first file.
        b: Path of the second file.

    Returns:
        bool: True when the SHA-256 digests of both files match.
    """
    return _sha256_digest(a) == _sha256_digest(b)
Upvotes: 9