superG
superG

Reputation: 13

Find all ordered sequences with maximum length in strings

I have following problem to solve: There are two strings of arbitrary length with arbitrary content. I need to find all ordered sequences with maximum length, which appears in both strings.

Example 1: input: "a1b2c3" "1a2b3c" output: "123" "12c" "1b3" "1bc" "a23" "a2c" "ab3" "abc"

Example 2: input: "cadb" "abcd" output: "ab" "ad" "cd"

I wrote it in straight way with two loops, recursion, then removing duplicates and results which are part of larger result (for instance "abc" sequence contains "ab" "ac" and "bc" sequences, so I am filtering those)

// "match" argument here used as temporary buffer
void match_recursive(set<string> &matches, string &match, const string &a_str1, const string &a_str2, size_t a_pos1, size_t a_pos2)
{
    bool added = false;

    for(size_t i = a_pos1; i < a_str1.length(); ++i)
    {
        for(size_t j = a_pos2; j < a_str2.length(); ++j)
        {
            if(a_str1[i] == a_str2[j])
            {
                match.push_back(a_str1[i]);

                if(i < a_str1.length() - 1 && j < a_str2.length() - 1)
                    match_recursive(matches, match, a_str1, a_str2, i + 1, j + 1);
                else
                    matches.emplace(match);
                added = true;

                match.pop_back();
            }
        }
    }

    if(!added)
        matches.emplace(match);
}

This function solves problem, but complexity is unacceptable. For instance solution for "0q0e0t0c0a0d0a0d0i0e0o0p0z0" "0w0r0y0d0s0a0b0w0k0f0.0k0x0" takes 28 seconds on my machine (debug target, but anyway this is extremely slow). I think there should be some simple algorithm for this problem, but somehow I can't find any on the net.

Can you guys point me to right direction?

Upvotes: 1

Views: 1497

Answers (3)

jfly
jfly

Reputation: 7990

Here is the code for the dynamic programming solution. I test it with the examples you give. I have solved the LCS problem, but this is the first time to print them all.

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <set>

using namespace std;

#define MAX_LENGTH 100

int lcs(const char* a, const char* b)
{
    int row = strlen(a)+ 1;
    int column = strlen(b) + 1;

    //Memoization lower the function's time cost in exchange for space cost.
    int **matrix = (int**)malloc(sizeof(int*) * row);
    int i, j;
    for(i = 0; i < row; ++i)
        matrix[i] = (int*)calloc(sizeof(int), column);
    typedef set<string> lcs_set;

    lcs_set s_matrix[MAX_LENGTH][MAX_LENGTH];

    //initiate
    for(i = 0; i < MAX_LENGTH ; ++i)
        s_matrix[0][i].insert("");
    for(i = 0; i < MAX_LENGTH ; ++i)
        s_matrix[i][0].insert("");

    //Bottom up calculation
    for(i = 1; i < row; ++i)
    {
        for(j = 1; j < column; ++j)
        {
            if(a[i - 1] == b[j - 1])
            {
                matrix[i][j] = matrix[i -1][j - 1] + 1;
                // if your compiler support c++ 11, you can simplify this code.
                for(lcs_set::iterator it = s_matrix[i - 1][j - 1].begin(); it != s_matrix[i - 1][j - 1].end(); ++it)
                    s_matrix[i][j].insert(*it + a[i - 1]);
            }
            else
            {
                if(matrix[i][j - 1] > matrix[i - 1][j])
                {
                    matrix[i][j] = matrix[i][j - 1];
                    for(lcs_set::iterator it = s_matrix[i][j - 1].begin(); it != s_matrix[i][j - 1].end(); ++it)
                        s_matrix[i][j].insert(*it);
                }
                else if(matrix[i][j - 1] == matrix[i - 1][j])
                {
                    matrix[i][j] = matrix[i][j - 1];
                    for(lcs_set::iterator it = s_matrix[i][j - 1].begin(); it != s_matrix[i][j - 1].end(); ++it)
                        s_matrix[i][j].insert(*it);
                    for(lcs_set::iterator it = s_matrix[i - 1][j].begin(); it != s_matrix[i - 1][j].end(); ++it)
                        s_matrix[i][j].insert(*it);
                }
                else
                {
                    matrix[i][j] = matrix[i - 1][j];
                    for(lcs_set::iterator it = s_matrix[i - 1][j].begin(); it != s_matrix[i - 1][j].end(); ++it)
                        s_matrix[i][j].insert(*it);
                }

            }
        }
    }
    int lcs_length = matrix[row - 1][column -1];
    // all ordered sequences with maximum length are here.
    lcs_set result_set;

    int m, n;
    for(m = 1; m < row; ++m)
    {
        for(n = 1; n < column; ++n)
        {
            if(matrix[m][n] == lcs_length)
            {
                for(lcs_set::iterator it = s_matrix[m][n].begin(); it != s_matrix[m][n].end(); ++it)
                    result_set.insert(*it);
            }
        }
    }

    //comment it
    for(lcs_set::iterator it = result_set.begin(); it != result_set.end(); ++it)
        printf("%s\t", it->c_str());
    printf("\n");

    for(i = 0; i < row; ++i)
        free(matrix[i]);
    free(matrix);

    return lcs_length;
}

int main()
{
    char buf1[MAX_LENGTH], buf2[MAX_LENGTH];
    while(scanf("%s %s", buf1, buf2) != EOF)
    {
        printf("length is: %d\n", lcs(buf1, buf2) );
    }
    return 0;
} 

Upvotes: 0

user2566092
user2566092

Reputation: 4661

Look up "longest common subsequence (LCS)" problem, e.g. http://en.wikipedia.org/wiki/Longest_common_subsequence_problem and see how the dynamic programming solution works to find a LCS of two sequences, based on building up the solution efficiently starting with trivially getting the the LCS for the first character of each sequence, and then building up the LCS solution for longer and longer pairs of prefixes of the two sequences. The only modification you need to make is that when you get the LCS for a current prefix pair from the previously computed LCS solutions for earlier prefix pairs, you need to have stored ALL previous LCS strings for the earlier prefix pairs, and then combine these sets of LCS strings together (possibly with an added character) into an overall set of LCS strings you store for the current prefix pair. This will solve your problem efficiently. You can solve even a bit more efficiently by first just getting a single LCS and getting the overall LCS length, and then finding all earlier prefix pairs that contribute to computational paths that obtain the LCS length, and then going back and repeating the dynamic programming iterations just for those prefix pairs, and this time keeping track of all possible LCS sequences like I described earlier.

Upvotes: 2

Steve
Steve

Reputation: 5545

Sounds like you are trying to find similarities between 2 string? I found this code, and modified slightly, somewhere on the web many years ago (sorry I cannot quote the source any longer) and use it often. It works very quick (for strings anyway). You may need to change for your purpose. Sorry it's in VB.

Private Shared piScore As Integer
''' <summary>
''' Compares two not-empty strings regardless of case. 
''' Returns a numeric indication of their similarity 
''' (0 = not at all similar, 100 = identical)
''' </summary>
''' <param name="psStr1">String to compare</param>
''' <param name="psStr2">String to compare</param>
''' <returns>0-100 (0 = not at all similar, 100 = identical)</returns>
''' <remarks></remarks>
Public Shared Function Similar(ByVal psStr1 As String, ByVal psStr2 As String) As Integer
    If psStr1 Is Nothing Or psStr2 Is Nothing Then Return 0

    ' Convert each string to simplest form (letters
    ' and digits only, all upper case)
    psStr1 = ReplaceSpecial(psStr1.ToUpper)
    psStr2 = ReplaceSpecial(psStr2.ToUpper)

    If psStr1.Trim = "" Or psStr2.Trim = "" Then
        ' One or both of the strings is now empty
        Return 0
    End If

    If psStr1 = psStr2 Then
        ' Strings are identical
        Return 100
    End If

    ' Initialize cumulative score (this will be the
    ' total length of all the common substrings)
    piScore = 0

    ' Find all common sub-strings
    FindCommon(psStr1, psStr2)

    ' We now have the cumulative score. Return this
    ' as a percent of the maximum score. The maximum
    ' score is the average length of the two strings.
    Return piScore * 200 / (Len(psStr1) + Len(psStr2))

End Function

''' <summary>USED BY SIMILAR FUNCTION</summary>
Private Shared Sub FindCommon(ByVal psS1 As String, ByVal psS2 As String)
    ' Finds longest common substring (other than single
    ' characters) in psS1 and psS2, then recursively
    ' finds longest common substring in left-hand
    ' portion and right-hand portion. Updates the
    ' cumulative score.

    Dim iLongest As Integer = 0, iStartPos1 As Integer = 0, iStartPos2 As Integer = 0, iJ As Integer = 0
    Dim sHoldStr As String = "", sTestStr As String = "", sLeftStr1 As String = "", sLeftStr2 As String = ""
    Dim sRightStr1 As String = "", sRightStr2 As String = ""

    sHoldStr = psS2
    Do While Len(sHoldStr) > iLongest

        sTestStr = sHoldStr
        Do While Len(sTestStr) > 1
            iJ = InStr(psS1, sTestStr)
            If iJ > 0 Then
                ' Test string is sub-set of the other string

                If Len(sTestStr) > iLongest Then
                    ' Test string is longer than previous
                    ' longest. Store its length and position.
                    iLongest = Len(sTestStr)
                    iStartPos1 = iJ
                    iStartPos2 = InStr(psS2, sTestStr)
                End If

                ' No point in going further with this string
                Exit Do

            Else
                ' Test string is not a sub-set of the other
                ' string. Discard final character of test
                ' string and try again.
                sTestStr = Left(sTestStr, Len(sTestStr) - 1)
            End If

        Loop

        ' Now discard first char of test string and
        ' repeat the process.
        sHoldStr = Right(sHoldStr, Len(sHoldStr) - 1)

    Loop

    ' Update the cumulative score with the length of
    ' the common sub-string.
    piScore = piScore + iLongest

    ' We now have the longest common sub-string, so we
    ' can isolate the sub-strings to the left and right
    ' of it.

    If iStartPos1 > 3 And iStartPos2 > 3 Then
        sLeftStr1 = Left(psS1, iStartPos1 - 1)
        sLeftStr2 = Left(psS2, iStartPos2 - 1)

        If sLeftStr1.Trim <> "" And sLeftStr2.Trim <> "" Then
            ' Get longest common substring from left strings
            FindCommon(sLeftStr1, sLeftStr2)
        End If
    Else
        sLeftStr1 = ""
        sLeftStr2 = ""
    End If
    If iLongest > 0 Then
        sRightStr1 = Mid(psS1, iStartPos1 + iLongest)
        sRightStr2 = Mid(psS2, iStartPos2 + iLongest)

        If sRightStr1.Trim <> "" And sRightStr2.Trim <> "" Then
            ' Get longest common substring from right strings
            FindCommon(sRightStr1, sRightStr2)
        End If
    Else
        sRightStr1 = ""
        sRightStr2 = ""
    End If
End Sub

''' <summary>USED BY SIMILAR FUNCTION</summary>
Private Shared Function ReplaceSpecial(ByVal sString As String) As String
    Dim iPos As Integer
    Dim sReturn As String = ""
    Dim iAsc As Integer
    For iPos = 1 To sString.Length
        iAsc = Asc(Mid(sString, iPos, 1))
        If (iAsc >= 48 And iAsc <= 57) Or (iAsc >= 65 And iAsc <= 90) Then
            sReturn &= Chr(iAsc)
        End If
    Next
    Return sReturn
End Function

Just call the Similar function and you get a result between 0 at 100.

Hope this helps

Upvotes: 0

Related Questions