Rafael
Rafael

Reputation: 140

Best practice for comparing two large files in Node.js

I want to compare two large files (5GB+) and find out whether they are identical. One solution I considered is hashing both with crypto and then comparing the hashes. But this would take a lot of time, since I would have to read both files in their entirety instead of stopping as soon as a difference is found.
Another solution I thought of was to compare the files as they are being streamed with fs.createReadStream() and stop as soon as a difference is found.

stream.on('data', (data) => {
   // Compare this chunk with the corresponding chunk from the other stream
})

But I am not quite sure how I can have two streams that are synchronized.

Upvotes: 4

Views: 4387

Answers (3)

jfriend00
jfriend00

Reputation: 708206

As requested in your comments, if you want to see how an implementation can be written to do this, here's one. Here's how it works:

  1. Open each of the two files
  2. Compare the two file sizes. If not the same, resolve false.
  3. Allocate two 8k buffers (you can choose the size of buffer to use)
  4. Read 8k of each file (or less if not 8k left in the file) into your buffers
  5. Compare those two buffers. If not identical, resolve false.
  6. When you finish comparing all the bytes, resolve true

Here's the code:

const fs = require('fs');
const fsp = fs.promises;

/**
 * Compares two files byte-for-byte, stopping at the first difference.
 *
 * The file sizes are compared first; the contents are only read when the
 * sizes match.
 *
 * @param {string} fname1 - Path of the first file.
 * @param {string} fname2 - Path of the second file.
 * @param {number} [bufferSize=8192] - Number of bytes read from each file per iteration.
 * @returns {Promise<boolean>} Resolves to true when both files have the same
 *     size and content, otherwise false.
 * @throws {RangeError} If bufferSize is not a positive integer.
 * @throws {Error} If a file cannot be opened, read or closed, or if a file
 *     ends before the size reported by stat() has been read.
 */
async function compareFiles(fname1, fname2, bufferSize = 1024 * 8) {
    // A zero-sized buffer would make the read loop below spin forever.
    if (!Number.isInteger(bufferSize) || bufferSize <= 0) {
        throw new RangeError("bufferSize must be a positive integer");
    }

    // A single read() may return fewer bytes than requested without that being
    // an error, so keep reading until the chunk is full or end-of-file is hit.
    // Returns the number of bytes actually placed in `buffer`.
    async function readChunk(handle, buffer, length, position) {
        let filled = 0;
        while (filled < length) {
            const { bytesRead } = await handle.read(buffer, filled, length - filled, position + filled);
            if (bytesRead === 0) {
                break;
            }
            filled += bytesRead;
        }
        return filled;
    }

    let h1;
    let h2;
    try {
        h1 = await fsp.open(fname1);
        h2 = await fsp.open(fname2);
        const [stat1, stat2] = await Promise.all([h1.stat(), h2.stat()]);
        if (stat1.size !== stat2.size) {
            return false;
        }
        const buf1 = Buffer.alloc(bufferSize);
        const buf2 = Buffer.alloc(bufferSize);
        let pos = 0;
        let remainingSize = stat1.size;
        while (remainingSize > 0) {
            const readSize = Math.min(bufferSize, remainingSize);
            const [count1, count2] = await Promise.all([
                readChunk(h1, buf1, readSize, pos),
                readChunk(h2, buf2, readSize, pos),
            ]);
            // Only reachable when a file ended early, i.e. it shrank after stat().
            if (count1 !== readSize || count2 !== readSize) {
                throw new Error("Failed to read desired number of bytes");
            }
            // Compare only the bytes just read; the tail of the buffers may hold
            // stale data from a previous, larger chunk.
            if (buf1.compare(buf2, 0, readSize, 0, readSize) !== 0) {
                return false;
            }
            remainingSize -= readSize;
            pos += readSize;
        }
        return true;
    } finally {
        // Nested finally so the second handle is closed even when closing the
        // first one fails.
        try {
            if (h1) {
                await h1.close();
            }
        } finally {
            if (h2) {
                await h2.close();
            }
        }
    }
}

// Example invocation: print the comparison outcome, or the error if the
// comparison could not be completed.
compareFiles("temp.bin", "temp2.bin")
    .then((areIdentical) => {
        console.log(areIdentical);
    })
    .catch((error) => {
        console.log(error);
    });

This could be sped up a bit by opening and closing the files in parallel using Promise.allSettled() to track when they are both open and then both closed, though because of the complications if one succeeds in opening and the other doesn't and you don't want to leak the one opened file handle, it takes a bit more code to do that perfectly so I kept it simpler here.

And, if you really wanted to optimize for performance, it would be worth testing larger buffers to see if it makes things faster or not.

It's also possible that buf1.equals(buf2) might be faster than buf1.compare(buf2), but you have to make sure that a partial buffer read at the end of the file still works properly when using that since .equals() always compares the entire buffer. You could build two versions and compare their performance.


Here's a more complicated version that opens and closes the files in parallel and might be slightly faster:

const fs = require('fs');
const fsp = fs.promises;

/**
 * Compares two files byte-for-byte, stopping at the first difference.
 * Both files are opened in parallel and closed in parallel.
 *
 * The file sizes are compared first; the contents are only read when the
 * sizes match.
 *
 * @param {string} fname1 - Path of the first file.
 * @param {string} fname2 - Path of the second file.
 * @param {number} [bufferSize=8192] - Number of bytes read from each file per iteration.
 * @returns {Promise<boolean>} Resolves to true when both files have the same
 *     size and content, otherwise false.
 * @throws {RangeError} If bufferSize is not a positive integer.
 * @throws {Error} If a file cannot be opened or read, or if a file ends before
 *     the size reported by stat() has been read. When both files fail to open,
 *     the error for fname1 is thrown. Errors from closing are not reported.
 */
async function compareFiles(fname1, fname2, bufferSize = 1024 * 8) {
    // A zero-sized buffer would make the read loop below spin forever.
    if (!Number.isInteger(bufferSize) || bufferSize <= 0) {
        throw new RangeError("bufferSize must be a positive integer");
    }

    // A single read() may return fewer bytes than requested without that being
    // an error, so keep reading until the chunk is full or end-of-file is hit.
    // Returns the number of bytes actually placed in `buffer`.
    async function readChunk(handle, buffer, length, position) {
        let filled = 0;
        while (filled < length) {
            const { bytesRead } = await handle.read(buffer, filled, length - filled, position + filled);
            if (bytesRead === 0) {
                break;
            }
            filled += bytesRead;
        }
        return filled;
    }

    let h1;
    let h2;
    try {
        const [open1, open2] = await Promise.allSettled([fsp.open(fname1), fsp.open(fname2)]);
        // Record every handle that did open before throwing, so the finally
        // block can close it and no descriptor is leaked.
        if (open1.status === "fulfilled") {
            h1 = open1.value;
        }
        if (open2.status === "fulfilled") {
            h2 = open2.value;
        }
        const failedOpen = [open1, open2].find((outcome) => outcome.status === "rejected");
        if (failedOpen) {
            throw failedOpen.reason;
        }

        const [stat1, stat2] = await Promise.all([h1.stat(), h2.stat()]);
        if (stat1.size !== stat2.size) {
            return false;
        }
        const buf1 = Buffer.alloc(bufferSize);
        const buf2 = Buffer.alloc(bufferSize);
        let pos = 0;
        let remainingSize = stat1.size;
        while (remainingSize > 0) {
            const readSize = Math.min(bufferSize, remainingSize);
            const [count1, count2] = await Promise.all([
                readChunk(h1, buf1, readSize, pos),
                readChunk(h2, buf2, readSize, pos),
            ]);
            // Only reachable when a file ended early, i.e. it shrank after stat().
            if (count1 !== readSize || count2 !== readSize) {
                throw new Error("Failed to read desired number of bytes");
            }
            // Compare only the bytes just read; the tail of the buffers may hold
            // stale data from a previous, larger chunk.
            if (buf1.compare(buf2, 0, readSize, 0, readSize) !== 0) {
                return false;
            }
            remainingSize -= readSize;
            pos += readSize;
        }
        return true;
    } finally {
        // Does not report file close errors, but does hold off settling the
        // returned promise until both files are closed or failed to close.
        // Since the files were only read, a close error would be very unusual.
        const closePromises = [];
        if (h1) {
            closePromises.push(h1.close());
        }
        if (h2) {
            closePromises.push(h2.close());
        }
        await Promise.allSettled(closePromises);
    }
}

// Example invocation: print the comparison outcome, or the error if the
// comparison could not be completed.
compareFiles("temp.bin", "temp2.bin")
    .then((areIdentical) => {
        console.log(areIdentical);
    })
    .catch((error) => {
        console.log(error);
    });

Upvotes: 9

Dan Dascalescu
Dan Dascalescu

Reputation: 151896

There are certainly libraries that do this, and file-sync-cmp is very popular (270k weekly downloads). It does the comparison in the simplest way, by reading the same number of bytes from the two files in different buffers, and then comparing the buffers byte by byte.

There's also a more modern library, filecompare, "using native Promises and native BufferTools (alloc and Buffer comparisons)".

Whenever practical, don't reinvent the wheel :)

Upvotes: 2

Olivier Lépine
Olivier Lépine

Reputation: 668

Since the difference might be at the very end of the files, I guess calculating a hash of each file is the most straightforward and reliable (yet costly) approach.

Did you try the MD5-File npm package and get some performance indicators?

Upvotes: 0

Related Questions