Reputation: 1313

C++ - search duplicate strings in large text

My aim is to find all duplicate strings of any length in the text (matched strings should not intersect). For this purpose I use the following code

   #include <string>
   using namespace std;

   int main()
   {
      string text = "j73vd6hdk9382haswm03hs84mmsg73flw94ncjd93k9dj3ndi5jf95j";
      int len =  text.length();

         for(int m=0;m<len-1;m++)
         {
           int h_len=(len-m)/2;

           for(int i=0;i<h_len;i++)
           {
              string a1 = text.substr(m,i+1);
              for(int k=0;k<len-2*i-1-m;k++)
              {
                  string a2 = text.substr(i+1+k+m,i+1);
                  if(a1==a2) { /* do something */ }
              }

           }
         }

     return 0;     
   }

the script works but when the text size increases, the time of execution extremely increases as well. The program is too slow. How can I speed up my program? Can you give me any suggestion to improve the code? Maybe there is a better algorithm to do that.

Upvotes: 3

Answers (5)

user2249683

Reputation:

You might combine the search with Boyer-Moore-Search or keep track of common prefix sequences.

[1] Boyer-Moore

#include <algorithm>
#include <iostream>
#include <limits>
#include <memory>
#include <tuple>

class SkipTable
{
    public:
    typedef std::size_t size_type;

    private:
    enum { TableSize = unsigned(std::numeric_limits<unsigned char>::max()) + 1 };

    public:
    SkipTable()
    {
        size_type size;
        std::tie(m_table, size) = std::get_temporary_buffer<size_type>(TableSize);
        if(TableSize <= size) std::fill_n(m_table, TableSize, 1);
        else {
            std::return_temporary_buffer<size_type>(m_table);
            m_table = 0;
        }
    }

    private:
    SkipTable(const SkipTable&); // No copy
    SkipTable& operator = (const SkipTable&); // No copy

    public:
    ~SkipTable() {
        std::return_temporary_buffer<size_type>(m_table);
    }

    void set(const char* s, size_type len) {
        if(len < 1) len = 1;
        if(m_table) {
            m_len = len;
            std::fill_n(m_table, TableSize, m_len);
            while(len)
                m_table[unsigned(*s++)] = len--;
        }
    }

    const size_type get(size_type pos) const {
        size_type n = (m_table && pos < TableSize) ? m_table[pos] : 1;
        return n;
    }

    operator bool() const { return m_table; }

    private:
    size_type* m_table;
    size_type m_len;
};


const char* find(
    SkipTable& skip,
    const char* str, const std::size_t strlen,
    const char* substr, const std::size_t substrlen)
{
    typedef std::char_traits<char> traits_type;
    typedef std::size_t size_type;

    if (substrlen == 0 || strlen < substrlen) return 0;

    const char* end = str + strlen - substrlen;

    if(skip && 4 < substrlen && substrlen < strlen - substrlen) {

        // Boyer-Moore-Search
        //===================


        skip.set(substr, substrlen);
        while(str <= end) {
            if(traits_type::compare(str, substr, substrlen) == 0)
                break;
            str += skip.get(*(str + substrlen));
        }
        return (str <= end) ? str : 0;
    }
    else {
        // Brute Search
        //=============

        while(str <= end) {
            if (traits_type::compare(str, substr, substrlen) == 0)
                return str;
            ++str;
        }
    }
    return 0;
}


#include <map>

void find_duplicates(const char* s, const std::size_t size) {
    std::map<std::string, unsigned> duplicates;
    SkipTable skip;
    for(std::size_t n = 1; n < size / 2; ++n) {
        for(std::size_t i = 0; i <= size - 2*n; ++i) {
            const char* p = find(skip, s + i + n, size - i - n, s + i, n);
            if(p) {
                ++duplicates[std::string(p, n)];
            }
        }
    }
    // Increment the counts
    for(auto& d : duplicates) {
        ++d.second;
        std::cout << '[' << d.second << "] \"" << d.first << "\"\n";
    }
}

int main() {
    std::string text = "Some text for matching sub-strings in the text.";
    find_duplicates(text.c_str(), text.size());
}

Note: The search might get improved by mapping results and performing a search if a prefix sequence exits (see code below).

[2] Common Prefix Sequences

A faster approach: Store the pointers with a common prefix sequence and match suffix characters following the prefix sequence:

void find_duplications(const char* str, const std::size_t size) {

    if(size <= 1) return;

    struct Pointer {
        const char* p;
        std::size_t n;

        Pointer(const char* p, std::size_t n)
        :   p(p), n(n)
        {}

        bool operator < (const Pointer& other) const {
            typedef std::char_traits<char> traits_type;
            int result = traits_type::compare(p, other.p, std::min(n, other.n));
            if(result == 0) return (n < other.n);
            else return result < 0;
        }
    };

    typedef std::map<Pointer, unsigned> duplicate_container;
    duplicate_container duplicates;

    // Prefix
    std::vector<const char*> prefix;
    for(std::size_t i = 0; i < size - 1; ++i) {
        duplicate_container::iterator pos = duplicates.find(Pointer(str+i, 1));
        if(pos == duplicates.end()) {
            for(std::size_t j = i + 1; j < size ; ++j) {
                if(str[i] == str[j]) {
                    prefix.push_back(str+i);
                    prefix.push_back(str+j);
                    pos = duplicates.insert(duplicate_container::value_type(
                        Pointer(str+i, 1), 2)).first;
                    for(std::size_t k = j + 1; k < size ; ++k) {
                        if(str[i] == str[k]) {
                            prefix.push_back(str+k);
                            ++pos->second;
                        }
                    }
                    // Delimiter
                    prefix.push_back(0);
                    break;
                }
            }
        }
    }

    // Suffix
    std::vector<const char*> suffix;
    const char* limit = str + size;
    std::size_t len = 1;
    while( ! prefix.empty()) {
        ++len;
        --limit;
        suffix.clear();
        for(std::size_t i = 0; i < prefix.size(); ++i) {
            const char* p = prefix[i];
            if( ! p) continue;
            if(limit <= p) break;
            duplicate_container::iterator pos = duplicates.find(Pointer(p, len));
            if(pos == duplicates.end()) {
                for(std::size_t j = i + 1; j < prefix.size(); ++j) {
                    const char* q = prefix[j];
                    if( ! q || limit <= q) break;
                    if(p + len <= q && p[len-1] == q[len-1]) {
                        suffix.push_back(p);
                        suffix.push_back(q);
                        pos = duplicates.insert(duplicate_container::value_type(
                            Pointer(p, len), 2)).first;
                        for(std::size_t k = j + 1; k < prefix.size(); ++k) {
                            q = prefix[k];
                            if( ! q || limit <= q) break;
                            if(p[len-1] == q[len-1]) {
                                suffix.push_back(q);
                                ++pos->second;
                            }
                        }
                        // Delimiter
                        suffix.push_back(0);
                        break;
                    }
                }
            }
        }
        prefix.swap(suffix);
    }

    for(duplicate_container::iterator pos = duplicates.begin(); pos != duplicates.end(); ++pos) {
        std::cout
            << '[' << pos->second << "] \""
            << std::string(pos->first.p, pos->first.n) << "\"\n";
    }
}

int main() {
    std::string text = "Some text for matching sub-strings in the text.";
    find_duplications(text.c_str(), text.size());
}

Results Both algorithms will produce the same result set, but the second will out perform the first by magnitudes.

[7] " "
[3] " t"
[2] " te"
[2] " tex"
[2] " text"
[4] "e"
[2] "e "
[2] "e t"
[2] "e te"
[2] "e tex"
[2] "e text"
[2] "ex"
[2] "ext"
[2] "g"
[2] "h"
[3] "i"
[3] "in"
[2] "ing"
[2] "m"
[3] "n"
[2] "ng"
[2] "o"
[2] "r"
[3] "s"
[7] "t"
[2] "te"
[2] "tex"
[2] "text"
[2] "x"
[2] "xt"

Upvotes: 2

jimifiki

Reputation: 5534

You can do something in order to speed up the algorithm by using binary search algorithms for insertion and retrival.

You can do it for instance by using a std::set or a std::map. You have either to store each substring into the set (if /*do something */ only needs the string) or to store the substring in a map as key.

The complexity then would be ln(N) N^2. (Instead of being N^3)

Consider this for instance:

#include <string>
#include <map>
int main(int argc, char **argv)
{
  std::string text = "j73vd6hdk9382haswm03hs84mmsg73vdflw94ncjd93k9dj3ndi5jf95j";
  size_t len =  text.length();

  std::map<std::string,size_t> table; 
  std::map<std::string,size_t>::const_iterator myEntry; 
  for(int m=0;m<len-1;m++)
  {
    int max_len=len-m;
    for(size_t i=0;i<max_len;i++)
    {
       std::string a1 = text.substr(m,i+1);
       myEntry = table.find(a1); 
       if (myEntry!= table.end()){
        if ((myEntry->second + i ) < m)         
         /*std::cout << a1 << " "; */
       }
       else 
          table.insert(std::pair<std::string,size_t>(a1,m));
    }
  }
  return 0;
}

Upvotes: 1

4pie0

Reputation: 29724

I fill a map of duplicates here and return max of them. I have added check for what has been already searched. This is very important as you don't want to search again for occurrences of "a" if you have already did it. For example:

fabfcdfefghijklmnopf
^  ^  ^ ^          ^

My algorithm will count all duplicates of "f" on first hit of "f"

fabfcdfefghijklmnopf
^ // count all "f"

and then skip it when it finds it again at index 4:

fabfcdfefghijklmnopf
   ^ // skip as "f" is prersent on map

but it will evaluate then "fc" as this is not the same as "fa". Algorithm will search for "fc" only if "f" was found to be duplicated, as there is no chance for "abc" be duplicated if "ab" is not.

It will not count overlapped strings, i.e: "altoal" in "paltoaltoalm" won't be matched.

#include <iostream>
#include <string>
#include <map>

int findDuplicates( std::string& in, std::map<std::string, int>& m) {
    size_t in_s = in.size();
    if ( in_s == 0) return (-1);

    for ( size_t i = 0; i < in_s; i++) {
        size_t pos_beg = i;
        size_t pos_end = pos_beg + 1;
        while ( pos_end < in_s) {
            std::string searched =  in.substr( pos_beg, pos_end - pos_beg);
            if( m.find( searched) != m.end()) {
                ++pos_end;
                continue;
            }
            bool present = false;
            size_t found;
            while( ( found = in.substr( pos_end).find( searched)) != std::string::npos) {
                present = true;
                m[ searched]++;
                pos_end = pos_end + found + searched.size();
            }
            if( !present) 
                break;
            else
            pos_end = pos_beg + searched.size() + 1;
        }
    }
    int max = 0;
    for (std::map<std::string, int>::const_iterator it = m.begin(); it != m.end(); ++it) {
        if (it ->second > max) {
            max = it->second;
        }
    }
    return max;
}

usage:

/*
 * 
 */
int main(int argc, char** argv) {

    std::string in( "j73vd6hdk9382haswm03hs84mmsg73flw94ncjd93k9dj3ndi5jf95j");
    std::map<std::string, int> duplicates;
    int rc = findDuplicates( in, duplicates);

    std::map<std::string, int>::iterator it = duplicates.begin();
    while( it != duplicates.end()) {
        std::cout << (*it).first << "," << it->second << std::endl;
        ++it;
    }
    return 0;
}

output:

3,5

4,1

5,1

5j,1

7,1

73,1

8,1

9,4

93,1

d,4

f,1

h,2

j,4

k,1

k9,1

m,2

Upvotes: 1

tenfour

Reputation: 36896

My feeling is there is no optimized way to do such a broad search type, unfortunately. Your search space is large, and the number of searches is equally huge.

You're essentially looking for duplicates of every permutation of pos/length. Existing search algorithms are great for doing a single search within a big space, so at best they can help with a part of your algorithm. In other words, you are doing many many string searches, so you could try optimizing each single string search.

You could still try optimizing your existing algorithm. For example you might find that working with char* instead of string could help, because you can keep finer control over state. That would eliminate the need for substr which creates an unnecessary string object.

*Edit: mention how to incorporate existing string search algorithms.

Upvotes: 2

simurg

Reputation: 1208

I agree with tenfour that there may not be an algorithm that will make you solve this problem a lot shorter. Below code is yours transformed to C, which, AFAICS with my crude measurement with linux' "time", is about 7 times faster than the C++ version:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main()
{
        int count = 0;
        char text[] = "j73vd66hmmsg73flw94nncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kcjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94nk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ncjd93k9dj3ndi5jf95j";
        int len = strlen(text);
        char* a1 = malloc(len);
        char* a2 = malloc(len);

        for(int m=0;m<len-1;m++)
        {
                int h_len=(len-m)/2;

                for(int i=0;i<h_len;i++)
                {
                        memcpy(a1, &text[m], i+1);
                        a1[i+1] = '\0';
                        for(int k=0;k<len-2*i-1-m;k++)
                        {
                                memcpy(a2, &text[i+1+k+m], i+1);
                                a2[i+1] = '\0';
                                if(a1[0] == a2[0]) { 
                                        count++;
                                }
                        }

                }
        }
        printf("Count is: %d\n", count);

        return 0;     
}

Please note that the text in example is longer to be able to get some meaningful run times, a variable count is added to be able to print the number of matches and no clean up is done before exit. On my Ubuntu PC, C code takes ~2 seconds to complete, where C++ takes ~14 seconds.

Upvotes: 1

C++ - search duplicate strings in large text

Answers (5)

Related Questions