Programmer
Programmer

Reputation: 6753

Longest Common Subsequence

I have written the below code for LCS. It works for many cases but breaks for the one below. I do not understand where my code is breaking. Please help. The code is in C#

namespace LongestCommonSubsequenceBF
{
class Program
{
    static void Main(string[] args)
    {

        string B = "AAACCGTGAGTTATTCGTTCTAGAA";
        string A = "CACCCCTAAGGTACCTTTGGTTC";
        //find LCS in A,B starting from index 0 of each
        int longestCommonSubsequence = LCS(A, B, 0, 0);
        Console.WriteLine(longestCommonSubsequence);
        Console.Read();

    }

    //Find the longest common subsequnce starting from index1 in A and index2 in B
    //Pass A as shorter string
    public static int LCS(String A, String B, int index1, int index2)
    {
        int max = 0;
        if (index1 == A.Length)
        {
            //You have reached beyond A and thus no subsequence
            return 0;
        }
        if (index2 == B.Length)
        {   //you may reach end of 2nd string. LCS from that end is 0
            return 0;
        }

        for (int i = index1; i < A.Length ; i++)
        {
            int exist = B.IndexOf(A[i],index2);
            if (exist != -1)
            {
             //   found = true;

                int temp = 1 + LCS(A, B, i + 1, exist + 1);
                if (max < temp)
                {
                    max = temp;
                }


            }


        }
        return max;

    }
  }
}

Upvotes: 8

Views: 11590

Answers (2)

Ash K
Ash K

Reputation: 3631

Based on the comments seen under the accepted answer, looks like there's an interest in seeing how the sequence is created. That's why I decided to add this answer.

This is a complete example using C# 12 in .NET 8. And tested using NUnit 3.

Visualizing the grid

Let's start with a simple example. Find Longest Common Subsequence between words: FISH and FOSH.

We only really care about how to fill cells from (1,1) to (word1.Length,word2.Length) and basic idea of filling those cells is:

  1. If the characters in row and col match, the value in this cell is (value of top-left neighbor) + 1. And also store this character in this cell.
  2. If the letters don't match, grab: MAX(value of top neighbor, value of left neighbor).
word2.len+1 cols 👉
word1.len+1 rows 👇
col=0 col=1 col=2 col=3 col=4
word2 len=4
Word2 👉
Word1👇
F O S H
row=0 0 (0,0)

null
0 (0,1)

null
0 (0,2)

null
0 (0,3)

null
0 (0,4)

null
row=1 F 0 (1,0)

null
1 (1,1)

'F'
1 (1,2)

null
1 (1,3)

null
1 (1,4)

null
row=2 I 0 (2,0)

null
1 (2,1)

null
1 (2,2)

null
1 (2,3)

null
1 (2,4)

null
row=3 S 0 (3,0)

null
1 (3,1)

null
1 (3,2)

null
2 (3,3)

'S'
2 (3,4)

null
row=4
word1 len=4
H 0 (4,0)

null
1 (4,1)

null
1 (4,2)

null
2 (4,3)

null
3 (4,4)

'H'

Backtracking the grid

The basic idea is:

  1. If the grid cell has a character, move towards diagonal.
  2. If the grid cell has no character, move towards the left or top neighbor (whichever is bigger).

This gives the sequence. In this case, it's "FSH".

(4,4) 👉 (3,3) 👉 (2,2) 👉 (2,1) 👉 (1,1)

Implementing the grid and backtracking

namespace Algorithms.DynamicProgramming;

public record Cell(int Value, char? Character);
public static class LongestCommonSubsequence
{
    public static Cell[,] Find(string word1, string word2)
    {
        var grid = new Cell[word1.Length + 1, word2.Length + 1];
        for (var row = 0; row <= word1.Length; row++)
        {
            // First row and first column are padded with zeroes, so we put nullable char in those places
            char? currentRowLetter = row == 0 ? null : word1[row - 1];
            for (var col = 0; col <= word2.Length; col++)
            {
                char? currentColumnLetter = col == 0 ? null : word2[col - 1];
                // Create the padding, so something like: grid[row - 1, col - 1] won't get you into Index out of bounds exceptions
                if (row == 0 || col == 0)
                {
                    grid[row, col] = new Cell(0, null);
                }
                else if (currentRowLetter == currentColumnLetter)
                {
                    var topLeftNeighbor = grid[row - 1, col - 1];
                    grid[row, col] = new Cell(topLeftNeighbor.Value + 1, currentRowLetter);
                }
                else
                {
                    var topNeighbor = grid[row - 1, col];
                    var leftNeighbor = grid[row, col - 1];
                    grid[row, col] = new Cell(Math.Max(topNeighbor.Value, leftNeighbor.Value), null);
                }
            }
        }

        return grid;
    }

    // Backtracking the grid
    public static string GetLcSubsequence(Cell[,] grid)
    {
        var lcs = new Stack<char>();

        // Only go through the words' length portion of the original grid.  For eg: (word1.Length,word2.Length) to (1,1)
        // Don't go to the padded row/ column, that's why -1
        var rows = grid.GetLength(0) - 1;
        var cols = grid.GetLength(1) - 1;
        
        while (rows > 0 && cols > 0)
        {
            // If you get a character, go towards diagonal
            if (grid[rows, cols].Character.HasValue)
            {
                lcs.Push(grid[rows, cols].Character!.Value);
                rows--;
                cols--;
            }
            // Otherwise, just go towards the left or top neighbor (whichever is bigger)
            else
            {
                if (grid[rows - 1, cols].Value > grid[rows, cols - 1].Value)
                    rows--; // If the top neighbor:grid[row - 1, col] is bigger, go towards it
                else
                    cols--; // If the left neighbor:grid[row, col - 1] is bigger, go towards it
            }
        }
        
        return new string(lcs.ToArray());
    }
}

Testing the code

using NUnit.Framework;
using Algorithms.DynamicProgramming;

namespace Algorithms.UnitTests.DynamicProgramming;

public class LongestCommonSubsequenceTests
{
    [Test]
    public void LCS_Finds_Longest_Common_Subsequence()
    {
        // Arrange
        const string word1 = "AAACCGTGAGTTATTCGTTCTAGAA";
        const string word2 = "CACCCCTAAGGTACCTTTGGTTC";

        // Act
        var grid = LongestCommonSubsequence.Find(word1, word2);
        var subsequence = LongestCommonSubsequence.GetLcSubsequence(grid);
        var subsequenceLength = grid.Cast<Cell>().MaxBy(c => c.Value)!.Value;

        // Assert
        Assert.That(subsequence, Is.EqualTo("ACCTAGTATTGTTC"));
        Assert.That(subsequenceLength, Is.EqualTo(14));
    }
}

Upvotes: 0

Heinzi
Heinzi

Reputation: 172220

Why do you think your algorithm is broken? The longest common subsequence is ACCTAGTATTGTTC, which is 14 characters long:

string B = "AAACCGTGAGTTATTCGTTCTAGAA";
              ^^^ ^ ^^ ^^^^ ^^^^

string A = "CACCCCTAAGGTACCTTTGGTTC";
             ^^^  ^ ^^ ^^  ^^ ^ ^^^

(I modified your algorithm to return the sequence instead of just the length.)

Upvotes: 9

Related Questions