Reputation: 1259
I have a program that I am writing in Java and have to do 2 things, find the longest common sub-sequence and align the common characters. The LCS works just fine but the align part just loops away or do nothing.
I try to do this algorithm which I found on Wikipedia
function printDiff(C[0..m,0..n], X[1..m], Y[1..n], i, j)
if i > 0 and j > 0 and X[i] = Y[j]
printDiff(C, X, Y, i-1, j-1)
print " " + X[i]
else if j > 0 and (i = 0 or C[i,j-1] ≥ C[i-1,j])
printDiff(C, X, Y, i, j-1)
print "+ " + Y[j]
else if i > 0 and (j = 0 or C[i,j-1] < C[i-1,j])
printDiff(C, X, Y, i-1, j)
print "- " + X[i]
else
print ""
Here is the code I wrote (I removed the LCS part)
static char[] input1 = "ABCDE".toCharArray();
static char[] input2 = "ACDC".toCharArray();
static int M = input1.length;
static int N = input2.length;
static int[][] opt = new int[M + 1][N + 1];
public static void printDiff(int opt[][], char input1[], char input2[]) {
int i = 0, j = 0;
while (i < input1.length && j < input2.length) {
if (i > 0 && j > 0 && input1[i] == input2[j]) {
System.out.print(" " + input1[i]);
i++;
j++;
} else if (j > 0 && (i == 0 || opt[i][j - 1] >= opt[i - 1][j])) {
System.out.print("+ " + input2[j]);
j++;
} else if (i > 0 && (j == 0 || opt[i][j - 1] < opt[i - 1][j])) {
System.out.print("- " + input1[i]);
i++;
} else {
System.out.print("");
}
}
}
Upvotes: 0
Views: 1399
Reputation: 3451
Here is a version which returns the diffs of all the longest common subsequences (basically backtracks using the cached table - similar to the approach taken to get to all longest common subsequences in All Longest Common Subsequences) (or, you may refer to my blog@: http://codingworkout.blogspot.com/2014/07/longest-common-subsequence.html)
for ex, for GAC and AGCAT, it returns => { { "[G][A]C", "[G]A[C]", "G[A][C]" }, {"A[G]C[A]T", "A[G][C]AT", "[A]G[C]AT"} where GA, GC and AC are longest common subsequences...
string[][] GetDiffs(string A, string B, int aIndex, int bIndex,
int[][] DP_LCS_AllPrefixes_Cache)
{
if((aIndex == 0) && (bIndex ==0))
{
return null;
}
if (DP_LCS_AllPrefixes_Cache[aIndex][bIndex] == 0)
{
var r = new string[2][];
r[0] = new string[] { A.Substring(0, aIndex) };
r[1] = new string[] { B.Substring(0, bIndex) };
return r;
}
if (A[aIndex - 1] == B[bIndex - 1])
{
var r = this.GetDiffs(A, B, aIndex - 1, bIndex - 1,
DP_LCS_AllPrefixes_Cache);
string ch = string.Format("[{0}]", A[aIndex - 1]);
if (r == null)
{
r = new string[2][];
r[0] = new string[] { ch };
r[1] = new string[] { ch };
}
else
{
r[0] = r[0].Select(s => s + ch).ToArray();
r[1] = r[1].Select(s => s + ch).ToArray();
}
return r;
}
int lcs_up_direction = DP_LCS_AllPrefixes_Cache[aIndex - 1][bIndex];
int lcs_left_direction = DP_LCS_AllPrefixes_Cache[aIndex][bIndex - 1];
string[][] lcs_up = null, lcs_left = null;
if (lcs_up_direction == lcs_left_direction)
{
lcs_up = this.GetDiffs(A, B, aIndex - 1, bIndex,
DP_LCS_AllPrefixes_Cache);
lcs_left = this.GetDiffs(A, B, aIndex, bIndex - 1,
DP_LCS_AllPrefixes_Cache);
}
else if (lcs_up_direction > lcs_left_direction)
{
lcs_up = this.GetDiffs(A, B, aIndex - 1, bIndex,
DP_LCS_AllPrefixes_Cache);
}
else
{
lcs_left = this.GetDiffs(A, B, aIndex, bIndex - 1, DP_LCS_AllPrefixes_Cache);
}
char a = A[aIndex - 1], b = B[bIndex - 1];
string[][] rl = new string[2][];
rl[0] = new string[0];
rl[1] = new string[0];
if(lcs_up != null)
{
//we moved upward, that is we accepted that they differ with 'A' at aIndex-1 (a)
rl[0] = lcs_up[0].Select(s => s + a.ToString()).ToArray();
rl[1] = lcs_up[1];
}
if (lcs_left != null)
{
//we moved left, that is we accepted that they differ with 'B' at bIndex-1 (b)
rl[0] = rl[0].Union(lcs_left[0]).ToArray(); ;
rl[1] = rl[1].Union(lcs_left[1].Select(s => s + b.ToString())).ToArray();
}
return rl.ToArray();
}
where the caller is
string[][] GetDiffs(string A, string B, int[][] DP_LCS_AllPrefixes_Cache)
{
var r = this.GetDiffs(A, B, A.Length, B.Length,
DP_LCS_AllPrefixes_Cache);
return r;
}
And the DP method which captures LCS lengths to backtrack
public int[][] LCS_OfAllPrefixes_Length(string A, string B)
{
A.ThrowIfNullOrWhiteSpace("a");
B.ThrowIfNullOrWhiteSpace("b");
int[][] DP_LCS_AllPrefixes_Cache = new int[A.Length+1][];
for(int i = 0;i<DP_LCS_AllPrefixes_Cache.Length; i++)
{
DP_LCS_AllPrefixes_Cache[i] = new int[B.Length + 1];
}
for (int rowIndexOfCache = 1; rowIndexOfCache <= A.Length; rowIndexOfCache++)
{
for (int columnIndexOfCache = 1; columnIndexOfCache <= B.Length; columnIndexOfCache++)
{
//LCS(Ai, Bj) = 0 if i <=0, or j <= 0
// LCS(Ai, Bj) + 1 if Ai == Bj
// Max(LCS(Ai-1, Bj), LCS(Ai, Bj-1))
if(A[rowIndexOfCache-1] == B[columnIndexOfCache-1])
{
DP_LCS_AllPrefixes_Cache[rowIndexOfCache][columnIndexOfCache] = DP_LCS_AllPrefixes_Cache[rowIndexOfCache - 1][columnIndexOfCache - 1] + 1;
}
else
{
DP_LCS_AllPrefixes_Cache[rowIndexOfCache][columnIndexOfCache] = Utilities.Max(DP_LCS_AllPrefixes_Cache[rowIndexOfCache - 1][columnIndexOfCache],
DP_LCS_AllPrefixes_Cache[rowIndexOfCache][columnIndexOfCache - 1]);
}
}
}
return DP_LCS_AllPrefixes_Cache;
}
TestMethod
[TestMethod]
public void LCS_Tests()
{
string A = "GAC", B = "AGCAT";
var DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B);
Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 2);
var lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache);
Assert.IsNotNull(lcs_sequences);
var diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache);
Assert.IsNotNull(diffs);
Assert.IsTrue(diffs.Length == 2);
Assert.IsTrue(diffs[0].Length == lcs_sequences.Length);
Assert.IsTrue(diffs[1].Length == lcs_sequences.Length);
Assert.IsTrue(lcs_sequences.Any(s => "AC".Equals(s)));
Assert.IsTrue(lcs_sequences.Any(s => "GC".Equals(s)));
Assert.IsTrue(lcs_sequences.Any(s => "GA".Equals(s)));
var DP_LCS_AllPrefixes_Subsequences_Cache = this.LCS_OfAllPrefixes_Subsequences(A, B);
Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Length == 2);
Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
.Any(s => "AC".Equals(s)));
Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
.Any(s => "GC".Equals(s)));
Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
.Any(s => "GA".Equals(s)));
A = "ABCDGH"; B = "AEDFHR";
DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B);
Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 3);
lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache);
Assert.IsNotNull(lcs_sequences);
diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache);
Assert.IsNotNull(diffs);
Assert.IsTrue(diffs.Length == 2);
Assert.IsTrue(diffs[0].Length == lcs_sequences.Length);
Assert.IsTrue(diffs[1].Length == lcs_sequences.Length);
Assert.IsTrue(lcs_sequences.Any(s => "ADH".Equals(s)));
DP_LCS_AllPrefixes_Subsequences_Cache = this.LCS_OfAllPrefixes_Subsequences(A, B);
Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Length == 3);
Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
.Any(s => "ADH".Equals(s)));
A = "AGGTAB"; B = "GXTXAYB";
DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B);
Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 4);
lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache);
Assert.IsNotNull(lcs_sequences);
diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache);
Assert.IsNotNull(diffs);
Assert.IsTrue(diffs.Length == 2);
Assert.IsTrue(diffs[0].Length == 2);
Assert.IsTrue(diffs[1].Length == lcs_sequences.Length);
Assert.IsTrue(lcs_sequences.Any(s => "GTAB".Equals(s)));
DP_LCS_AllPrefixes_Subsequences_Cache = this.LCS_OfAllPrefixes_Subsequences(A, B);
Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Length == 4);
Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
.Any(s => "GTAB".Equals(s)));
A = "ABCDEF"; B = "UVWXYZ";
DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B);
Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 0);
lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache);
diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache);
Assert.IsNotNull(diffs);
Assert.IsTrue(diffs.Length == 2);
Assert.IsTrue(diffs[0].Length == 1);
Assert.IsTrue(diffs[1].Length == 1);
}
Upvotes: 1
Reputation: 51565
I rewrote your code to use the Wikipedia algorithm. In other words, I used recursion rather than a where clause. I had to change one of the if conditions because Java is zero index based and the Wikipedia algorithm is one index based.
I had to add the LCS function back in so that I could calculate the int[][]opt
.
I added parenthesis to the if statements to make sure that the operations were done in the order I wanted them done.
I also fixed the output. The Wikipedia algorithm had "+ "
and "- "
as output. That appears to be a typo. The output should be " +"
and " -"
, respectively.
Here's my version of the code.
public class PrintDiff {
char[] input1 = "ABCDE".toCharArray();
char[] input2 = "ACDC".toCharArray();
int M = input1.length;
int N = input2.length;
public void run() {
int[][] opt = lcsLength(input1, input2);
printDiff(opt, input1, input2, M - 1, N - 1);
}
public int[][] lcsLength(char[] input1, char[] input2) {
int[][] opt = new int[M][N];
for (int i = 1; i < input1.length; i++) {
for (int j = 1; j < input2.length; j++) {
if (input1[i] == input2[j]) {
opt[i][j] = opt[i - 1][j - 1] + 1;
} else {
opt[i][j] = Math.max(opt[i][j - 1], opt[i - 1][j]);
}
}
}
return opt;
}
public void printDiff(int opt[][], char input1[], char input2[], int i,
int j) {
if ((i >= 0) && (j >= 0) && (input1[i] == input2[j])) {
printDiff(opt, input1, input2, i - 1, j - 1);
System.out.print(" " + input1[i]);
} else if ((j > 0) && ((i == 0) || (opt[i][j - 1] >= opt[i - 1][j]))) {
printDiff(opt, input1, input2, i, j - 1);
System.out.print(" +" + input2[j]);
} else if ((i > 0) && ((j == 0) || (opt[i][j - 1] < opt[i - 1][j]))) {
printDiff(opt, input1, input2, i - 1, j);
System.out.print(" -" + input1[i]);
} else {
System.out.print("");
}
}
public static void main(String[] args) {
new PrintDiff().run();
}
}
And here's my output.
A -B C D -E +C
Upvotes: 2