Reputation: 1251

Most efficient way to search for unknown patterns in a string?

I am trying to find patterns that:

occur more than once
are more than 1 character long
are not substrings of any other known pattern

without knowing any of the patterns that might occur.

For example:

The string "the boy fell by the bell" would return 'ell', 'the b', 'y '.
The string "the boy fell by the bell, the boy fell by the bell" would return 'the boy fell by the bell'.

Using double for-loops, it can be brute forced very inefficiently:

ArrayList<String> patternsList = new ArrayList<>();
int length = string.length();
for (int i = 0; i < length; i++) {
    int limit = (length - i) / 2;
    for (int j = limit; j >= 1; j--) {
        int candidateEndIndex = i + j;
        String candidate = string.substring(i, candidateEndIndex);

        if(candidate.length() <= 1) {
            continue;
        }

        if (string.substring(candidateEndIndex).contains(candidate)) {
            boolean notASubpattern = true;
            for (String pattern : patternsList) {
                if (pattern.contains(candidate)) {
                    notASubpattern = false;
                    break;
                }
            }

            if (notASubpattern) {
                patternsList.add(candidate);
            }
        }
    }
}

However, this is incredibly slow when searching large strings with tons of patterns.

Upvotes: 18

Answers (5)

Koray

Reputation: 1806

I've written this just for fun. I hope I have understood the problem correctly, this is valid and fast enough; if not, please be easy on me :) I might optimize it a little more I guess, if someone finds it useful.

private static IEnumerable<string> getPatterns(string txt)
{
    char[] arr = txt.ToArray();
    BitArray ba = new BitArray(arr.Length);
    for (int shingle = getMaxShingleSize(arr); shingle >= 2; shingle--)
    {
        char[] arr1 = new char[shingle];
        int[] indexes = new int[shingle];
        HashSet<int> hs = new HashSet<int>();
        Dictionary<int, int[]> dic = new Dictionary<int, int[]>();
        for (int i = 0, count = arr.Length - shingle; i <= count; i++)
        {
            for (int j = 0; j < shingle; j++)
            {
                int index = i + j;
                arr1[j] = arr[index];
                indexes[j] = index;
            }
            int h = getHashCode(arr1);
            if (hs.Add(h))
            {
                int[] indexes1 = new int[indexes.Length];
                Buffer.BlockCopy(indexes, 0, indexes1, 0, indexes.Length * sizeof(int));
                dic.Add(h, indexes1);
            }
            else
            {
                bool exists = false;
                foreach (int index in indexes)
                    if (ba.Get(index))
                    {
                        exists = true;
                        break;
                    }
                if (!exists)
                {
                    int[] indexes1 = dic[h];
                    if (indexes1 != null)
                        foreach (int index in indexes1)
                            if (ba.Get(index))
                            {
                                exists = true;
                                break;
                            }
                }
                if (!exists)
                {
                    foreach (int index in indexes)
                        ba.Set(index, true);
                    int[] indexes1 = dic[h];
                    if (indexes1 != null)
                        foreach (int index in indexes1)
                            ba.Set(index, true);
                    dic[h] = null;
                    yield return new string(arr1);
                }
            }
        }
    }
}
private static int getMaxShingleSize(char[] arr)
{            
    for (int shingle = 2; shingle <= arr.Length / 2 + 1; shingle++)
    {
        char[] arr1 = new char[shingle];
        HashSet<int> hs = new HashSet<int>();
        bool noPattern = true;
        for (int i = 0, count = arr.Length - shingle; i <= count; i++)
        {
            for (int j = 0; j < shingle; j++)
                arr1[j] = arr[i + j];
            int h = getHashCode(arr1);
            if (!hs.Add(h))
            {
                noPattern = false;
                break;
            }
        }
        if (noPattern)
            return shingle - 1;
    }
    return -1;
}
private static int getHashCode(char[] arr)
{
    unchecked
    {
        int hash = (int)2166136261;
        foreach (char c in arr)
            hash = (hash * 16777619) ^ c.GetHashCode();
        return hash;
    }
}

Edit
My previous code has serious problems. This one is better:

private static IEnumerable<string> getPatterns(string txt)
{
    Dictionary<int, int> dicIndexSize = new Dictionary<int, int>();
    for (int shingle = 2, count0 = txt.Length / 2 + 1; shingle <= count0; shingle++)
    {   
        Dictionary<string, int> dic = new Dictionary<string, int>();
        bool patternExists = false;
        for (int i = 0, count = txt.Length - shingle; i <= count; i++)
        {
            string sub = txt.Substring(i, shingle);
            if (!dic.ContainsKey(sub))
                dic.Add(sub, i);
            else
            {   
                patternExists = true;
                int index0 = dic[sub];
                if (index0 >= 0)
                {
                    dicIndexSize[index0] = shingle;
                    dic[sub] = -1;
                }
            }
        }
        if (!patternExists)
            break;
    }
    List<int> lst = dicIndexSize.Keys.ToList();
    lst.Sort((a, b) => dicIndexSize[b].CompareTo(dicIndexSize[a]));
    BitArray ba = new BitArray(txt.Length);
    foreach (int i in lst)
    {
        bool ok = true;
        int len = dicIndexSize[i];
        for (int j = i, max = i + len; j < max; j++)
        {
            if (ok) ok = !ba.Get(j);
            ba.Set(j, true);
        }
        if (ok)
            yield return txt.Substring(i, len);
    }
}

Text in this book took 3.4sec in my computer.

Upvotes: 1

FortyTwo

Reputation: 2649

I would use Knuth–Morris–Pratt algorithm (linear time complexity O(n)) to find substrings. I would try to find the largest substring pattern, remove it from the input string and try to find the second largest and so on. I would do something like this:

string pattern = input.substring(0,lenght/2);
string toMatchString = input.substring(pattern.length, input.lenght - 1);

List<string> matches = new List<string>();

while(pattern.lenght > 0)
{
    int index = KMP(pattern, toMatchString);
    if(index > 0)
    {
        matches.Add(pattern);

        // remove the matched pattern occurences from the input string
        // I would do something like this:
        // 0 to pattern.lenght gets removed
        // check for all occurences of pattern in toMatchString and remove them
        // get the remaing shrinked input, reassign values for pattern & toMatchString
        // keep looking for the next largest substring
    }
    else
    {
        pattern = input.substring(0, pattern.lenght - 1);
        toMatchString = input.substring(pattern.length, input.lenght - 1);
    }
}

Where KMP implements Knuth–Morris–Pratt algorithm. You can find the Java implementations of it at Github or Princeton or write it yourself.

PS: I don't code in Java and it is quick try to my first bounty about to close soon. So please don't give me the stick if I missed something trivial or made a +/-1 error.

Upvotes: 0

David Eisenstat

Reputation: 65506

Suffix arrays are the right idea, but there's a non-trivial piece missing, namely, identifying what are known in the literature as "supermaximal repeats". Here's a GitHub repo with working code: https://github.com/eisenstatdavid/commonsub . Suffix array construction uses the SAIS library, vendored in as a submodule. The supermaximal repeats are found using a corrected version of the pseudocode from findsmaxr in Efficient repeat finding via suffix arrays (Becher–Deymonnaz–Heiber).

static void FindRepeatedStrings(void) {
  // findsmaxr from https://arxiv.org/pdf/1304.0528.pdf
  printf("[");
  bool needComma = false;
  int up = -1;
  for (int i = 1; i < Len; i++) {
    if (LongCommPre[i - 1] < LongCommPre[i]) {
      up = i;
      continue;
    }
    if (LongCommPre[i - 1] == LongCommPre[i] || up < 0) continue;
    for (int k = up - 1; k < i; k++) {
      if (SufArr[k] == 0) continue;
      unsigned char c = Buf[SufArr[k] - 1];
      if (Set[c] == i) goto skip;
      Set[c] = i;
    }
    if (needComma) {
      printf("\n,");
    }
    printf("\"");
    for (int j = 0; j < LongCommPre[up]; j++) {
      unsigned char c = Buf[SufArr[up] + j];
      if (iscntrl(c)) {
        printf("\\u%.4x", c);
      } else if (c == '\"' || c == '\\') {
        printf("\\%c", c);
      } else {
        printf("%c", c);
      }
    }
    printf("\"");
    needComma = true;
  skip:
    up = -1;
  }
  printf("\n]\n");
}

Here's a sample output on the text of the first paragraph:

Davids-MBP:commonsub eisen$ ./repsub input
["\u000a"
," S"
," as "
," co"
," ide"
," in "
," li"
," n"
," p"
," the "
," us"
," ve"
," w"
,"\""
,"&ndash;"
,"("
,")"
,". "
,"0"
,"He"
,"Suffix array"
,"`"
,"a su"
,"at "
,"code"
,"com"
,"ct"
,"do"
,"e f"
,"ec"
,"ed "
,"ei"
,"ent"
,"ere's a "
,"find"
,"her"
,"https://"
,"ib"
,"ie"
,"ing "
,"ion "
,"is"
,"ith"
,"iv"
,"k"
,"mon"
,"na"
,"no"
,"nst"
,"ons"
,"or"
,"pdf"
,"ri"
,"s are "
,"se"
,"sing"
,"sub"
,"supermaximal repeats"
,"te"
,"ti"
,"tr"
,"ub "
,"uffix arrays"
,"via"
,"y, "
]

Upvotes: 0

J. Michael Wuerth

Reputation: 292

You could use n-grams to find patterns in a string. It would take O(n) time to scan the string for n-grams. When you find a substring by using a n-gram, put it into a hash table with a count of how many times that substring was found in the string. When you're done searching for n-grams in the string, search the hash table for counts greater than 1 to find recurring patterns in the string.

For example, in the string "the boy fell by the bell, the boy fell by the bell" using a 6-gram will find the substring "the boy fell by the bell". A hash table entry with that substring will have a count of 2 because it occurred twice in the string. Varying the number of words in the n-gram will help you discover different patterns in the string.

Dictionary<string, int>dict = new Dictionary<string, int>();
int count = 0;
int ngramcount = 6;
string substring = "";

// Add entries to the hash table
while (count < str.length) {
    // copy the words into the substring
    int i = 0;
    substring = "";
    while (ngramcount > 0 && count < str.length) {
        substring[i] = str[count];
        if (str[i] == ' ')
            ngramcount--;
        i++;
        count++;
    }
    ngramcount = 6;
    substring.Trim();  // get rid of the last blank in the substring
    // Update the dictionary (hash table) with the substring
    if (dict.Contains(substring)) {  // substring is already in hash table so increment the count
        int hashCount = dict[substring];
        hashCount++;
        dict[substring] = hashCount;
    }
    else
        dict[substring] = 1;
}

// Find the most commonly occurrring pattern in the string
// by searching the hash table for the greatest count.
int maxCount = 0;
string mostCommonPattern = "";
foreach (KeyValuePair<string, int> pair in dict) {
    if (pair.Value > maxCount) {
        maxCount = pair.Value;
        mostCommonPattern = pair.Key;
    }
}

Upvotes: 8

Matt Timmermans

Reputation: 59368

You can build a suffix tree for your string in linear time: https://en.wikipedia.org/wiki/Suffix_tree

The patterns you are looking for are the strings corresponding to internal nodes that have only leaf children.

Upvotes: 24

Most efficient way to search for unknown patterns in a string?

Answers (5)

Related Questions