Reputation: 13
I have the following problem to solve: there are two strings of arbitrary length with arbitrary content. I need to find all ordered sequences of maximum length that appear in both strings.
Example 1: input: "a1b2c3" "1a2b3c" output: "123" "12c" "1b3" "1bc" "a23" "a2c" "ab3" "abc"
Example 2: input: "cadb" "abcd" output: "ab" "ad" "cd"
I wrote it in a straightforward way with two loops and recursion, then removed duplicates and results that are part of a larger result (for instance, the "abc" sequence contains the "ab", "ac" and "bc" sequences, so I filter those out).
// Recursively enumerates common subsequences of a_str1[a_pos1..] and
// a_str2[a_pos2..] and stores them in "matches".
// "match" is a scratch buffer holding the characters chosen so far; every
// push_back is paired with a pop_back, so it is restored before returning.
void match_recursive(set<string> &matches, string &match, const string &a_str1, const string &a_str2, size_t a_pos1, size_t a_pos2)
{
    // Becomes true as soon as any character pair extends the current prefix.
    bool extended = false;

    for (size_t p = a_pos1; p < a_str1.size(); ++p)
    {
        for (size_t q = a_pos2; q < a_str2.size(); ++q)
        {
            if (a_str1[p] != a_str2[q])
                continue;

            match.push_back(a_str1[p]);
            extended = true;

            // A pair that sits on the last character of either string cannot
            // be extended any further, so the candidate is recorded here.
            const bool on_last_char = (p + 1 >= a_str1.size()) || (q + 1 >= a_str2.size());
            if (on_last_char)
                matches.insert(match);
            else
                match_recursive(matches, match, a_str1, a_str2, p + 1, q + 1);

            match.pop_back();
        }
    }

    // Nothing matched from this position on: the prefix itself is a result.
    if (!extended)
        matches.insert(match);
}
This function solves the problem, but its complexity is unacceptable. For instance, the solution for "0q0e0t0c0a0d0a0d0i0e0o0p0z0" "0w0r0y0d0s0a0b0w0k0f0.0k0x0" takes 28 seconds on my machine (debug target, but this is still extremely slow). I think there should be some simple algorithm for this problem, but somehow I can't find one on the net.
Can you guys point me in the right direction?
Upvotes: 1
Views: 1497
Reputation: 7990
Here is the code for the dynamic programming solution. I tested it with the examples you gave. I have solved the LCS problem before, but this is the first time I have printed all of the sequences.
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <set>
#include <string>
#include <vector>
using namespace std;
// Capacity, terminating NUL included, used for the fixed-size buffers in this
// program; input strings must therefore be shorter than this many characters.
#define MAX_LENGTH 100
// Computes the longest common subsequence (LCS) of two NUL-terminated strings.
//
// Prints every distinct LCS to stdout, tab-separated and followed by a
// newline, and returns the LCS length.
//
// Both tables are sized from the actual input lengths, so there is no fixed
// upper bound on the length of a or b. Every cell keeps a full set of strings,
// so memory use grows with the number of distinct subsequences.
int lcs(const char* a, const char* b)
{
    typedef std::set<std::string> lcs_set;

    const std::size_t row = std::strlen(a) + 1;
    const std::size_t column = std::strlen(b) + 1;

    // matrix[i][j]   : LCS length of the first i chars of a and first j chars of b.
    // s_matrix[i][j] : every distinct LCS string of those two prefixes.
    std::vector<std::vector<int> > matrix(row, std::vector<int>(column, 0));
    std::vector<std::vector<lcs_set> > s_matrix(row, std::vector<lcs_set>(column));

    // An empty prefix has exactly one common subsequence: the empty string.
    for (std::size_t j = 0; j < column; ++j)
        s_matrix[0][j].insert("");
    for (std::size_t i = 1; i < row; ++i)
        s_matrix[i][0].insert("");

    // Bottom up calculation
    for (std::size_t i = 1; i < row; ++i)
    {
        for (std::size_t j = 1; j < column; ++j)
        {
            lcs_set& cell = s_matrix[i][j];
            if (a[i - 1] == b[j - 1])
            {
                // Matching characters extend every LCS of the diagonal cell.
                matrix[i][j] = matrix[i - 1][j - 1] + 1;
                const lcs_set& diagonal = s_matrix[i - 1][j - 1];
                for (lcs_set::const_iterator it = diagonal.begin(); it != diagonal.end(); ++it)
                    cell.insert(*it + a[i - 1]);
            }
            else
            {
                // Inherit from whichever neighbour is longer; on a tie both
                // neighbours contribute their sequences.
                const int left = matrix[i][j - 1];
                const int up = matrix[i - 1][j];
                matrix[i][j] = (left > up) ? left : up;
                if (left >= up)
                    cell.insert(s_matrix[i][j - 1].begin(), s_matrix[i][j - 1].end());
                if (up >= left)
                    cell.insert(s_matrix[i - 1][j].begin(), s_matrix[i - 1][j].end());
            }
        }
    }

    const int lcs_length = matrix[row - 1][column - 1];

    // The last cell already holds every LCS of the complete strings: an LCS of
    // any prefix pair that reaches lcs_length is also an LCS of the full pair.
    // For an empty input this is the set containing only the empty string.
    const lcs_set& result_set = s_matrix[row - 1][column - 1];

    //comment it
    for (lcs_set::const_iterator it = result_set.begin(); it != result_set.end(); ++it)
        std::printf("%s\t", it->c_str());
    std::printf("\n");

    return lcs_length;
}
// Reads whitespace-separated pairs of strings from stdin and, for each pair,
// prints all longest common subsequences followed by the LCS length.
// Stops at end of input or as soon as a complete pair can no longer be read.
int main()
{
    char buf1[MAX_LENGTH], buf2[MAX_LENGTH];

    // Build "%<N>s %<N>s" with N = MAX_LENGTH - 1 so scanf can never write
    // past the buffers; a longer token is split at the limit and its remainder
    // is read as the next token.
    char format[32];
    sprintf(format, "%%%ds %%%ds", MAX_LENGTH - 1, MAX_LENGTH - 1);

    // Require both conversions: a lone trailing token makes scanf return 1,
    // which would otherwise pass a stale or uninitialized buf2 to lcs.
    while(scanf(format, buf1, buf2) == 2)
    {
        printf("length is: %d\n", lcs(buf1, buf2) );
    }
    return 0;
}
Upvotes: 0
Reputation: 4661
Look up the "longest common subsequence (LCS)" problem, e.g. http://en.wikipedia.org/wiki/Longest_common_subsequence_problem and see how the dynamic programming solution works to find an LCS of two sequences, based on building up the solution efficiently starting with trivially getting the LCS for the first character of each sequence, and then building up the LCS solution for longer and longer pairs of prefixes of the two sequences. The only modification you need to make is that when you get the LCS for a current prefix pair from the previously computed LCS solutions for earlier prefix pairs, you need to have stored ALL previous LCS strings for the earlier prefix pairs, and then combine these sets of LCS strings together (possibly with an added character) into an overall set of LCS strings you store for the current prefix pair. This will solve your problem efficiently. You can solve it even a bit more efficiently by first just getting a single LCS and the overall LCS length, then finding all earlier prefix pairs that contribute to computational paths that obtain the LCS length, and then going back and repeating the dynamic programming iterations just for those prefix pairs, this time keeping track of all possible LCS sequences as I described earlier.
Upvotes: 2
Reputation: 5545
Sounds like you are trying to find similarities between 2 strings? I found this code somewhere on the web many years ago (sorry, I can no longer cite the source), modified it slightly, and use it often. It works very quickly (for strings anyway). You may need to change it for your purpose. Sorry it's in VB.
' Running total of the lengths of the common substrings found so far.
' Similar resets it to 0 and FindCommon adds to it on every call.
Private Shared piScore As Integer
''' <summary>
''' Scores how alike two strings are. Both strings are upper-cased and
''' passed through ReplaceSpecial before they are compared.
''' </summary>
''' <param name="psStr1">First string to compare</param>
''' <param name="psStr2">Second string to compare</param>
''' <returns>
''' 0 when either argument is Nothing or is empty after normalization,
''' 100 when the normalized strings are equal, otherwise the accumulated
''' common-substring length as a percentage of the average string length.
''' </returns>
''' <remarks>Resets and reads the shared piScore field.</remarks>
Public Shared Function Similar(ByVal psStr1 As String, ByVal psStr2 As String) As Integer
    If psStr1 Is Nothing OrElse psStr2 Is Nothing Then
        Return 0
    End If

    ' Reduce both inputs to their simplest comparable form.
    psStr1 = ReplaceSpecial(psStr1.ToUpper)
    psStr2 = ReplaceSpecial(psStr2.ToUpper)

    ' Nothing left to compare in at least one of the strings.
    If psStr1.Trim = "" OrElse psStr2.Trim = "" Then Return 0

    ' Exact match after normalization.
    If psStr1 = psStr2 Then Return 100

    ' Start from a clean score; FindCommon accumulates into piScore.
    piScore = 0
    FindCommon(psStr1, psStr2)

    ' The maximum possible score is the average length of the two strings,
    ' hence the factor 200 over the combined length.
    Dim iCombinedLength As Integer = Len(psStr1) + Len(psStr2)
    Return piScore * 200 / iCombinedLength
End Function
''' <summary>
''' USED BY SIMILAR FUNCTION.
''' Finds the longest substring of psS2 (at least two characters long) that
''' also occurs in psS1, adds its length to piScore, then recurses into the
''' parts of both strings to the left and to the right of that substring.
''' </summary>
''' <param name="psS1">String that is searched for candidates</param>
''' <param name="psS2">String the candidate substrings are taken from</param>
Private Shared Sub FindCommon(ByVal psS1 As String, ByVal psS2 As String)
' Finds longest common substring (other than single
' characters) in psS1 and psS2, then recursively
' finds longest common substring in left-hand
' portion and right-hand portion. Updates the
' cumulative score.
Dim iLongest As Integer = 0, iStartPos1 As Integer = 0, iStartPos2 As Integer = 0, iJ As Integer = 0
Dim sHoldStr As String = "", sTestStr As String = "", sLeftStr1 As String = "", sLeftStr2 As String = ""
Dim sRightStr1 As String = "", sRightStr2 As String = ""
' sHoldStr walks through the suffixes of psS2; the loop stops once the
' remaining suffix is no longer than the best match found so far.
sHoldStr = psS2
Do While Len(sHoldStr) > iLongest
' sTestStr starts as the whole suffix and is shortened from the right
' until it is found in psS1 or only one character remains.
sTestStr = sHoldStr
Do While Len(sTestStr) > 1
' InStr yields a 1-based position, or 0 when there is no occurrence.
iJ = InStr(psS1, sTestStr)
If iJ > 0 Then
' Test string is sub-set of the other string
If Len(sTestStr) > iLongest Then
' Test string is longer than previous
' longest. Store its length and position.
iLongest = Len(sTestStr)
iStartPos1 = iJ
' First occurrence in psS2, which may lie before the suffix
' currently being examined.
iStartPos2 = InStr(psS2, sTestStr)
End If
' No point in going further with this string
Exit Do
Else
' Test string is not a sub-set of the other
' string. Discard final character of test
' string and try again.
sTestStr = Left(sTestStr, Len(sTestStr) - 1)
End If
Loop
' Now discard first char of test string and
' repeat the process.
sHoldStr = Right(sHoldStr, Len(sHoldStr) - 1)
Loop
' Update the cumulative score with the length of
' the common sub-string.
piScore = piScore + iLongest
' We now have the longest common sub-string, so we
' can isolate the sub-strings to the left and right
' of it.
' NOTE(review): the left parts are only examined when both matches start
' after position 3, so a two-character left part (start position 3) is
' skipped - confirm that this threshold is intended.
If iStartPos1 > 3 And iStartPos2 > 3 Then
sLeftStr1 = Left(psS1, iStartPos1 - 1)
sLeftStr2 = Left(psS2, iStartPos2 - 1)
If sLeftStr1.Trim <> "" And sLeftStr2.Trim <> "" Then
' Get longest common substring from left strings
FindCommon(sLeftStr1, sLeftStr2)
End If
Else
' Both variables are already empty from their declaration.
sLeftStr1 = ""
sLeftStr2 = ""
End If
' Only look to the right when a common substring was actually found;
' with iLongest = 0 the recursion ends here.
If iLongest > 0 Then
' Everything after the matched substring in each string.
sRightStr1 = Mid(psS1, iStartPos1 + iLongest)
sRightStr2 = Mid(psS2, iStartPos2 + iLongest)
If sRightStr1.Trim <> "" And sRightStr2.Trim <> "" Then
' Get longest common substring from right strings
FindCommon(sRightStr1, sRightStr2)
End If
Else
' Both variables are already empty from their declaration.
sRightStr1 = ""
sRightStr2 = ""
End If
End Sub
''' <summary>
''' USED BY SIMILAR FUNCTION.
''' Returns a copy of the input that keeps only the characters whose
''' Asc code is 48-57 (digits) or 65-90 (upper-case A-Z).
''' </summary>
''' <param name="sString">Text to filter</param>
''' <returns>The retained characters in their original order</returns>
Private Shared Function ReplaceSpecial(ByVal sString As String) As String
    Dim sKept As String = ""
    For iIndex As Integer = 0 To sString.Length - 1
        Dim iCode As Integer = Asc(sString.Chars(iIndex))
        Select Case iCode
            Case 48 To 57, 65 To 90
                ' Rebuild the character from its code, as the filter works on codes.
                sKept &= Chr(iCode)
        End Select
    Next
    Return sKept
End Function
Just call the Similar function and you get a result between 0 and 100.
Hope this helps
Upvotes: 0