mpen

Reputation: 282875

Fast checksum hashing?

Writing a simple program that will find exact duplicate files on my computer, but it's a bit slow. Is there any way I can speed this up?

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Getting files...");
            var files = Directory.GetFiles(@"D:\Photos", "*", SearchOption.AllDirectories);
            var alg = new HMACMD5();
            var dict = new Dictionary<string,List<string>>();
            Console.WriteLine("Computing hashes...");
            int i=0;
            int cursorLeft = Console.CursorLeft;
            int cursorTop = Console.CursorTop;
            foreach(var fileName in files)
            {
                Console.SetCursorPosition(cursorLeft,cursorTop);
                Console.Write("Hashing file {0}/{1}", ++i, files.Length);
                using(var stream = new BufferedStream(File.OpenRead(fileName),1024*1024*5))
                {
                    var hash = alg.ComputeHash(stream);
                    var str = BitConverter.ToString(hash);
                    if (!dict.ContainsKey(str)) dict[str] = new List<string>();
                    dict[str].Add(fileName);
                }
            }
            Console.WriteLine();

            foreach(var dupe in dict.Where(p => p.Value.Count >= 2))
            {
                Console.WriteLine(string.Join(", ", dupe.Value));
            }

            Console.WriteLine("Done!");
            Console.ReadLine();
        }
    }
}

Possible optimizations:

  1. Avoid converting the byte array to a string first. I tried this, but it doesn't work, presumably because Dictionary compares byte[] keys by reference rather than by content.
  2. A faster hashing algorithm? (See the sketch below.)
  3. A different stream, or buffer size? I tried a much larger buffer (1024^3 bytes, which is a gigabyte rather than a megabyte), but that seemed to slow it down if anything.

Or is this just an inherently slow thing to do?
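
For point 2, one easy thing to try is a plain (unkeyed) MD5 instance instead of HMACMD5. For large files the speed difference is probably small, since HMAC's extra work is only a second, short hash pass, but it also makes the digests stable across runs. A minimal sketch; the class and method names here are just illustrative:

using System.IO;
using System.Security.Cryptography;

static class HashSketch
{
    // Plain MD5 digest of a file: the same contents always give the same 16 bytes.
    public static byte[] GetMd5(string fileName)
    {
        using (var md5 = MD5.Create())
        using (var stream = new BufferedStream(File.OpenRead(fileName), 1024 * 1024))
            return md5.ComputeHash(stream);
    }
}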


I remembered that Dictionary can accept an IEqualityComparer, so I can write my own byte[] comparer.

A lot of the algorithms I found on the internet tend to compare the byte length first, which I don't need to do because I know it's always going to be 16 bytes. They also tend to compare 1 byte at a time, but I'm on a 64-bit machine, so why not compare 8 at a time?

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Getting files...");
            string dir = @"D:\Photos";
            var files = Directory.GetFiles(dir, "*", SearchOption.AllDirectories);
            var alg = new HMACMD5();
            var dict = new Dictionary<byte[], List<string>>(new Md5Comparer());
            Console.WriteLine("Computing hashes...");
            int i = 0;
            int cursorLeft = Console.CursorLeft;
            int cursorTop = Console.CursorTop;
            foreach (var fileName in files)
            {
                Console.SetCursorPosition(cursorLeft, cursorTop);
                Console.Write("Hashing file {0}/{1}", ++i, files.Length);
                using (var stream = new BufferedStream(File.OpenRead(fileName), 1024 * 1024 * 5))
                {
                    var hash = alg.ComputeHash(stream);
                    if (!dict.ContainsKey(hash)) dict[hash] = new List<string>();
                    dict[hash].Add(fileName);
                }
            }
            Console.WriteLine();

            using (var sw = new StreamWriter(Path.Combine(dir, "duplicates.txt")))
            {
                i = 0;
                foreach (var dupe in dict.Where(p => p.Value.Count >= 2))
                {
                    sw.WriteLine("Duplicate {0}", ++i);
                    foreach(var fn in dupe.Value)
                    {
                        sw.WriteLine("- {0}", fn);
                    }
                }
            }

            Console.WriteLine("Done!");
            //Console.ReadLine();
        }
    }

    class Md5Comparer : IEqualityComparer<byte[]>
    {
        public bool Equals(byte[] x, byte[] y)
        {
            var xi = BitConverter.ToInt64(x, 0);
            var yi = BitConverter.ToInt64(y, 0);
            if (xi != yi) return false;
            xi = BitConverter.ToInt64(x, 8);
            yi = BitConverter.ToInt64(y, 8);
            return xi == yi;
        }

        public int GetHashCode(byte[] obj)
        {
            return obj[0];
        }
    }
}

I'm not sure how much faster this is... I haven't done any benchmarking, but it certainly doesn't seem any slower.
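
One thing that might matter more than the 8-byte Equals: GetHashCode above only uses the first byte of the digest, so the dictionary can spread entries across at most 256 hash codes. Since MD5 output is effectively uniform, taking the first four bytes is just as cheap and should give a much better bucket distribution. A sketch of that alternative (not benchmarked):

// Alternative GetHashCode for Md5Comparer: use the first 4 bytes of the
// digest instead of 1, so entries spread across many more hash buckets.
public int GetHashCode(byte[] obj)
{
    return BitConverter.ToInt32(obj, 0);
}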


New code, thanks to @spender:

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    class Program
    {
        static void Main(string[] args)
        {
            var watch = Stopwatch.StartNew();
            const string dir = @"D:\Photos";
            var md5Comparer = new Md5Comparer();

            var dupeGroups = Directory.EnumerateFiles(dir, "*", SearchOption.AllDirectories)
                .Select(fn => new FileInfo(fn))
                .GroupBy(fi => fi.Length)
                .Where(g => g.Count() > 1)
                .SelectMany(g => g
                   .GroupBy(fi => GetHash(fi.FullName), md5Comparer)
                   .Where(g2 => g2.Count() > 1));

            using (var sw = new StreamWriter(Path.Combine(dir, "duplicates.txt")))
            {
                int i = 0;
                foreach (var dupeGroup in dupeGroups)
                {
                    sw.WriteLine("Duplicate {0}", ++i);
                    foreach(FileInfo fi in dupeGroup)
                    {
                        sw.WriteLine("- {0}", fi.FullName);
                    }
                }
            }

            Console.WriteLine("{0:0.000} seconds", watch.ElapsedMilliseconds / 1000d); // 22.068 seconds to process 10K files, 37 GB, 463 dupes
            Console.ReadLine();
        }

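        // Note: a parameterless HMACMD5 picks a random key per instance, so
        // digests are only comparable within a single run; an unkeyed
        // MD5.Create() would give digests that are also stable across runs.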
        static readonly HMACMD5 md5Hasher = new HMACMD5();
        public static byte[] GetHash(string fileName)
        {
            using(var stream = File.OpenRead(fileName))
                return md5Hasher.ComputeHash(stream);
        }
    }

    class Md5Comparer : IEqualityComparer<byte[]>
    {
        public bool Equals(byte[] x, byte[] y)
        {
            var xi = BitConverter.ToInt64(x, 0);
            var yi = BitConverter.ToInt64(y, 0);
            if (xi != yi) return false;
            xi = BitConverter.ToInt64(x, 8);
            yi = BitConverter.ToInt64(y, 8);
            return xi == yi;
        }

        public int GetHashCode(byte[] obj)
        {
            return obj[0];
        }
    }
}

Down to 22-70 seconds from 360+ seconds. Quite the improvement!

Upvotes: 3

Views: 3541

Answers (1)

spender

Reputation: 120450

You missed a massive optimization: if the sizes don't match, the files can't be duplicates.

Besides, matching hashes does not guarantee matching contents.

EDIT

OK, it's not that hard to find size-based dupes, which you can then check to see if they are really dupes:

For instance:

var files = 
    Directory
        .GetFiles(
            @"C:\Users\Administrator\Downloads",
            "*", 
            SearchOption.AllDirectories);
var fileGroups=
    files
        .Select(fn => new FileInfo(fn))
        .GroupBy(fi => fi.Length)
        .Where(g => g.Count()>1);

You could take it further by making a hashing function for a given filename

int GetHash(string fileName)  // fill in your definition

Then...

var fg2 =
        fileGroups
            .SelectMany(
                g => g
                    .GroupBy(fi => GetHash(fi.FullName))
                    .Where(gg=>gg.Count()>1));

foreach(var dupeGroup in fg2)
{
    //everything in this group is probably duplicate
    //although you'd need to do byte by byte to be sure
    foreach(FileInfo dupe in dupeGroup)
    {

    }
}
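
The byte-by-byte confirmation mentioned in the comment could look something like the sketch below. FilesAreEqual is an illustrative name; it streams both files in fixed-size chunks and stops at the first mismatch:

static bool FilesAreEqual(string path1, string path2)
{
    const int bufferSize = 64 * 1024;
    using (var fs1 = File.OpenRead(path1))
    using (var fs2 = File.OpenRead(path2))
    {
        if (fs1.Length != fs2.Length) return false;
        var buf1 = new byte[bufferSize];
        var buf2 = new byte[bufferSize];
        int read1;
        while ((read1 = fs1.Read(buf1, 0, bufferSize)) > 0)
        {
            // FileStream normally fills the buffer except at end of file,
            // and the lengths already match, so the reads stay in step.
            int read2 = fs2.Read(buf2, 0, bufferSize);
            if (read1 != read2) return false;
            for (int i = 0; i < read1; i++)
                if (buf1[i] != buf2[i]) return false;
        }
        return true;
    }
}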

By prefiltering the candidates by size, you're going to massively reduce the amount of hashing required.

Upvotes: 5
