Reputation: 140
I want to compare two large files(5GB+) and find if they are the same or not. One solution I considered is hashing both with crypto and then comparing the hashes. But this would take a lot of time since I will have to go through the entire files instead of stopping when a difference is found.
Another solution I thought was to compare the file as they are being streamed with fs.createReadStream()
and break when a difference is found.
stream.on('data', (data) => {
//compare the data from this stream with the other stream
})
But I am not quite sure how I can have two streams that are synchronized.
Upvotes: 4
Views: 4387
Reputation: 708206
As requested in your comments, if you want to see how an implementation can be written to do this, here's one. Here's how it works:
Here's the code:
const fs = require('fs');
const fsp = fs.promises;
// resolves to true or false
async function compareFiles(fname1, fname2) {
const kReadSize = 1024 * 8;
let h1, h2;
try {
h1 = await fsp.open(fname1);
h2 = await fsp.open(fname2);
const [stat1, stat2] = await Promise.all([h1.stat(), h2.stat()]);
if (stat1.size !== stat2.size) {
return false;
}
const buf1 = Buffer.alloc(kReadSize);
const buf2 = Buffer.alloc(kReadSize);
let pos = 0;
let remainingSize = stat1.size;
while (remainingSize > 0) {
let readSize = Math.min(kReadSize, remainingSize);
let [r1, r2] = await Promise.all([h1.read(buf1, 0, readSize, pos), h2.read(buf2, 0, readSize, pos)]);
if (r1.bytesRead !== readSize || r2.bytesRead !== readSize) {
throw new Error("Failed to read desired number of bytes");
}
if (buf1.compare(buf2, 0, readSize, 0, readSize) !== 0) {
return false;
}
remainingSize -= readSize;
pos += readSize;
}
return true;
} finally {
if (h1) {
await h1.close();
}
if (h2) {
await h2.close();
}
}
}
// sample usage
compareFiles("temp.bin", "temp2.bin").then(result => {
console.log(result);
}).catch(err => {
console.log(err);
});
This could be sped up a bit by opening and closing the files in parallel using Promise.allSettled()
to track when they are both open and then both closed, though because of the complications if one succeeds in opening and the other doesn't and you don't want to leak the one opened file handle, it takes a bit more code to do that perfectly so I kept it simpler here.
And, if you really wanted to optimize for performance, it would be worth testing larger buffers to see if it makes things faster or not.
It's also possible that buf1.equals(buf2)
might be faster than buf1.compare(buf2)
, but you have to make sure that a partial buffer read at the end of the file still works properly when using that since .equals()
always compares the entire buffer. You could build two versions and compare their performance.
Here's a more complicated version that opens and closes the files in parallel and might be slightly faster:
const fs = require('fs');
const fsp = fs.promises;
async function compareFiles(fname1, fname2) {
const kReadSize = 1024 * 8;
let h1, h2;
try {
let openResults = await Promise.allSettled([fsp.open(fname1), fsp.open(fname2)]);
let err;
if (openResults[0].status === "fulfilled") {
h1 = openResults[0].value;
} else {
err = openResults[0].reason;
}
if (openResults[1].status === "fulfilled") {
h2 = openResults[1].value;
} else {
err = openResults[1].reason;
}
// after h1 and h2 are set (so they can be properly closed)
// throw any error we got
if (err) {
throw err;
}
const [stat1, stat2] = await Promise.all([h1.stat(), h2.stat()]);
if (stat1.size !== stat2.size) {
return false;
}
const buf1 = Buffer.alloc(kReadSize);
const buf2 = Buffer.alloc(kReadSize);
let pos = 0;
let remainingSize = stat1.size;
while (remainingSize > 0) {
let readSize = Math.min(kReadSize, remainingSize);
let [r1, r2] = await Promise.all([h1.read(buf1, 0, readSize, pos), h2.read(buf2, 0, readSize, pos)]);
if (r1.bytesRead !== readSize || r2.bytesRead !== readSize) {
throw new Error("Failed to read desired number of bytes");
}
if (buf1.compare(buf2, 0, readSize, 0, readSize) !== 0) {
return false;
}
remainingSize -= readSize;
pos += readSize;
}
return true;
} finally {
// does not return file close errors
// but does hold resolving the promise until the files are closed
// or had an error trying to close them
// Since we didn't write to the files, a close error would be fairly
// unprecedented unless the disk went down
const closePromises = [];
if (h1) {
closePromises.push(h1.close());
}
if (h2) {
closePromises.push(h2.close());
}
await Promise.allSettled(closePromises);
}
}
compareFiles("temp.bin", "temp2.bin").then(result => {
console.log(result);
}).catch(err => {
console.log(err);
});
Upvotes: 9
Reputation: 151896
There are certainly libraries that do this, and file-sync-cmp is very popular (270k weekly downloads). It does the comparison in the simplest way, by reading the same number of bytes from the two files in different buffers, and then comparing the buffers byte by byte.
There's also a more modern library, filecompare, "using native Promises and native BufferTools (alloc and Buffer comparisons)".
Whenever practical, don't reinvent the wheel :)
Upvotes: 2
Reputation: 668
Since the difference might be at the very end of the files, I guess calculating a hash of the files is the most (yet costly) straightforward and secure process.
Did you try the MD5-File npm package and get some performance indicators?
Upvotes: 0