Reputation:
I've got a funky bug that's driving me nuts. Can anyone help me find it? Try calling the function with two words that differ only by a missing last character ("garble" vs "garbl"). The function is returning 0 instead of the expected 1. It's supposed to return 1, right?
I've tried fiddling with the array bounds but that's only causing IndexOutOfRangeExceptions
.
public static class FuzzyStringMatcher
{
private const int DELETION = 0;
private const int INSERTION = 1;
private const int SUBSTITUTION = 2;
private const int TRANSPOSITION = 3;
private const int COST_OF_DELETION = 1;
private const int COST_OF_INSERTION = 1;
private const int COST_OF_TRANSPOSITION = 1;
private const int COST_OF_SUBSTITUTION = 1;
public static int Compute_DamerauLevenshtein_Distance(string a, string b)
{
int[,] rows = new int[a.Length + 1, b.Length + 1];
int cost_ratio;
int[] calculations = new int[4];
//
// Init the array
//
for (int i = 0; i < rows.GetUpperBound(0); i++)
rows[i, 0] = i;
for (int i = 0; i < rows.GetUpperBound(1); i++)
rows[0, i] = i;
for (int aidx = 1; aidx < rows.GetUpperBound(0); aidx++)
{
for (int bidx = 1; bidx < rows.GetUpperBound(1); bidx++)
{
if (a[aidx - 1] == b[bidx - 1])
cost_ratio = 0;
else
cost_ratio = 1;
calculations[DELETION] = rows[aidx - 1, bidx] + COST_OF_DELETION;
calculations[INSERTION] = rows[aidx, bidx - 1] + COST_OF_INSERTION;
calculations[SUBSTITUTION] = rows[aidx - 1, bidx - 1] + cost_ratio * COST_OF_SUBSTITUTION;
calculations[TRANSPOSITION] = int.MaxValue;
if (aidx > 1 && bidx > 1 && a[aidx] == b[bidx - 1] && a[aidx - 1] == b[bidx])
calculations[TRANSPOSITION] = rows[aidx - 2, bidx - 2] + cost_ratio * COST_OF_TRANSPOSITION;
rows[aidx, bidx] = calculations.Min();
}
}
int score = rows[rows.GetUpperBound(0) - 1, rows.GetUpperBound(1) - 1];
if (a.Contains(b) || b.Contains(a))
score = score / 2;
return score;
}
}
My implementation is based off the algorithm given in the Wikipedia page on Damerau-Levenshtein-Distance
Upvotes: 2
Views: 500
Reputation: 89222
This isn't in the Wikipedia article:
if (a.Contains(b) || b.Contains(a))
score = score / 2;
Since it's true for your example -- and integer division of 1/2 == 0, then that could be it.
Upvotes: 2
Reputation: 34427
+1 to Lou Franco. But beside that, it seems like you have lots of index issues (note that all 4 for
cycles in wiki sample are inclusive, and when 1 is subtracted from aidx/bidx you actually need to subtract 2 because in wiki sample indexes in strings start at 1). My version:
public static int Compute_DamerauLevenshtein_Distance2(string a, string b)
{
int[,] rows = new int[a.Length + 1, b.Length + 1];
int cost_ratio;
int[] calculations = new int[4];
for(int i = 0; i <= rows.GetUpperBound(0); i++)
rows[i, 0] = i;
for(int i = 1; i <= rows.GetUpperBound(1); i++)
rows[0, i] = i;
for(int aidx = 1; aidx <= rows.GetUpperBound(0); aidx++)
{
for(int bidx = 1; bidx <= rows.GetUpperBound(1); bidx++)
{
if(a[aidx - 1] == b[bidx - 1])
cost_ratio = 0;
else
cost_ratio = 1;
calculations[DELETION] = rows[aidx - 1, bidx] + COST_OF_DELETION;
calculations[INSERTION] = rows[aidx, bidx - 1] + COST_OF_INSERTION;
calculations[SUBSTITUTION] = rows[aidx - 1, bidx - 1] + cost_ratio * COST_OF_SUBSTITUTION;
calculations[TRANSPOSITION] = int.MaxValue;
if(aidx > 1 && bidx > 1 && a[aidx - 1] == b[bidx - 2] && a[aidx - 2] == b[bidx - 1])
calculations[TRANSPOSITION] = rows[aidx - 2, bidx - 2] + cost_ratio * COST_OF_TRANSPOSITION;
rows[aidx, bidx] = calculations.Min();
}
}
int score = rows[rows.GetUpperBound(0), rows.GetUpperBound(1)];
return score;
}
Upvotes: 3