Reputation: 6753
I have written the below code for LCS. It works for many cases but breaks for the one below. I do not understand where my code is breaking. Please help. The code is in C#
namespace LongestCommonSubsequenceBF
{
class Program
{
static void Main(string[] args)
{
string B = "AAACCGTGAGTTATTCGTTCTAGAA";
string A = "CACCCCTAAGGTACCTTTGGTTC";
//find LCS in A,B starting from index 0 of each
int longestCommonSubsequence = LCS(A, B, 0, 0);
Console.WriteLine(longestCommonSubsequence);
Console.Read();
}
//Find the longest common subsequnce starting from index1 in A and index2 in B
//Pass A as shorter string
public static int LCS(String A, String B, int index1, int index2)
{
int max = 0;
if (index1 == A.Length)
{
//You have reached beyond A and thus no subsequence
return 0;
}
if (index2 == B.Length)
{ //you may reach end of 2nd string. LCS from that end is 0
return 0;
}
for (int i = index1; i < A.Length ; i++)
{
int exist = B.IndexOf(A[i],index2);
if (exist != -1)
{
// found = true;
int temp = 1 + LCS(A, B, i + 1, exist + 1);
if (max < temp)
{
max = temp;
}
}
}
return max;
}
}
}
Upvotes: 8
Views: 11590
Reputation: 3631
Based on the comments seen under the accepted answer, looks like there's an interest in seeing how the sequence is created. That's why I decided to add this answer.
This is a complete example using C# 12
in .NET 8
. And tested using NUnit 3
.
Let's start with a simple example. Find Longest Common Subsequence between words: FISH and FOSH.
We only really care about how to fill cells from (1,1)
to (word1.Length,word2.Length)
and basic idea of filling those cells is:
(value of top-left neighbor) + 1
. And also store this character in this cell.MAX(value of top neighbor, value of left neighbor)
.word2.len+1 cols 👉 word1.len+1 rows 👇 |
col=0 | col=1 | col=2 | col=3 | col=4 word2 len=4 |
|
---|---|---|---|---|---|---|
Word2 👉 Word1👇 |
F | O | S | H | ||
row=0 | 0 (0,0) null |
0 (0,1) null |
0 (0,2) null |
0 (0,3) null |
0 (0,4) null |
|
row=1 | F | 0 (1,0) null |
1 (1,1) 'F' |
1 (1,2) null |
1 (1,3) null |
1 (1,4) null |
row=2 | I | 0 (2,0) null |
1 (2,1) null |
1 (2,2) null |
1 (2,3) null |
1 (2,4) null |
row=3 | S | 0 (3,0) null |
1 (3,1) null |
1 (3,2) null |
2 (3,3) 'S' |
2 (3,4) null |
row=4 word1 len=4 |
H | 0 (4,0) null |
1 (4,1) null |
1 (4,2) null |
2 (4,3) null |
3 (4,4) 'H' |
The basic idea is:
This gives the sequence. In this case, it's "FSH".
(4,4) 👉 (3,3) 👉 (2,2) 👉 (2,1) 👉 (1,1)
namespace Algorithms.DynamicProgramming;
public record Cell(int Value, char? Character);
public static class LongestCommonSubsequence
{
public static Cell[,] Find(string word1, string word2)
{
var grid = new Cell[word1.Length + 1, word2.Length + 1];
for (var row = 0; row <= word1.Length; row++)
{
// First row and first column are padded with zeroes, so we put nullable char in those places
char? currentRowLetter = row == 0 ? null : word1[row - 1];
for (var col = 0; col <= word2.Length; col++)
{
char? currentColumnLetter = col == 0 ? null : word2[col - 1];
// Create the padding, so something like: grid[row - 1, col - 1] won't get you into Index out of bounds exceptions
if (row == 0 || col == 0)
{
grid[row, col] = new Cell(0, null);
}
else if (currentRowLetter == currentColumnLetter)
{
var topLeftNeighbor = grid[row - 1, col - 1];
grid[row, col] = new Cell(topLeftNeighbor.Value + 1, currentRowLetter);
}
else
{
var topNeighbor = grid[row - 1, col];
var leftNeighbor = grid[row, col - 1];
grid[row, col] = new Cell(Math.Max(topNeighbor.Value, leftNeighbor.Value), null);
}
}
}
return grid;
}
// Backtracking the grid
public static string GetLcSubsequence(Cell[,] grid)
{
var lcs = new Stack<char>();
// Only go through the words' length portion of the original grid. For eg: (word1.Length,word2.Length) to (1,1)
// Don't go to the padded row/ column, that's why -1
var rows = grid.GetLength(0) - 1;
var cols = grid.GetLength(1) - 1;
while (rows > 0 && cols > 0)
{
// If you get a character, go towards diagonal
if (grid[rows, cols].Character.HasValue)
{
lcs.Push(grid[rows, cols].Character!.Value);
rows--;
cols--;
}
// Otherwise, just go towards the left or top neighbor (whichever is bigger)
else
{
if (grid[rows - 1, cols].Value > grid[rows, cols - 1].Value)
rows--; // If the top neighbor:grid[row - 1, col] is bigger, go towards it
else
cols--; // If the left neighbor:grid[row, col - 1] is bigger, go towards it
}
}
return new string(lcs.ToArray());
}
}
using NUnit.Framework;
using Algorithms.DynamicProgramming;
namespace Algorithms.UnitTests.DynamicProgramming;
public class LongestCommonSubsequenceTests
{
[Test]
public void LCS_Finds_Longest_Common_Subsequence()
{
// Arrange
const string word1 = "AAACCGTGAGTTATTCGTTCTAGAA";
const string word2 = "CACCCCTAAGGTACCTTTGGTTC";
// Act
var grid = LongestCommonSubsequence.Find(word1, word2);
var subsequence = LongestCommonSubsequence.GetLcSubsequence(grid);
var subsequenceLength = grid.Cast<Cell>().MaxBy(c => c.Value)!.Value;
// Assert
Assert.That(subsequence, Is.EqualTo("ACCTAGTATTGTTC"));
Assert.That(subsequenceLength, Is.EqualTo(14));
}
}
Upvotes: 0
Reputation: 172220
Why do you think your algorithm is broken? The longest common subsequence is ACCTAGTATTGTTC
, which is 14 characters long:
string B = "AAACCGTGAGTTATTCGTTCTAGAA";
^^^ ^ ^^ ^^^^ ^^^^
string A = "CACCCCTAAGGTACCTTTGGTTC";
^^^ ^ ^^ ^^ ^^ ^ ^^^
(I modified your algorithm to return the sequence instead of just the length.)
Upvotes: 9