te7
te7

Reputation: 609

How to hash very large substrings quickly without collisions?

I have an app which as part of it finds all palindrome substrings of the input string. The input string can be up to 100,000 in length so the substrings can be very large. For example one input to the app resulted in over 300,000 substring palindromes over 10,000 in length. The app later counts all palindromes for equality and counts the unique ones by a hash that uses the standard hash that is done in the function that finds the palindromes. The hashes are stored in a vector and later counted for uniqueness in the app. The problems with such input and outptut conditions is the hashing for the very large substrings takes too long plus gets collisions in the hashes. So I was wondering if there is an algorithm (hash) that can quickly and uniquely hash a very large substring (preferably by index range for the substring for speed, but with accuracy for uniqueness). The hashing is done at the end of the function get_palins. The code is below.

#include <iostream>
#include <string>
#include <cstdlib>
#include <time.h>
#include <vector>
#include <algorithm>
#include <unordered_map>
#include <map>
#include <cstdio>
#include <cmath>
#include <ctgmath>

using namespace std;

#define MAX 100000
#define mod 1000000007

vector<long long> palins[MAX+5];

//  Finds all palindromes for the string
void  get_palins(string &s)
{
     int N = s.length();
     int i, j, k,   // iterators
     rp,        // length of 'palindrome radius'
     R[2][N+1]; // table for storing results (2 rows for odd- and even-length palindromes

     s = "@" + s + "#"; // insert 'guards' to iterate easily over s

     for(j = 0; j <= 1; j++)
     {
         R[j][0] = rp = 0; i = 1;

         while(i <= N)
         {
             while(s[i - rp - 1] == s[i + j + rp]) {  rp++;  }
             R[j][i] = rp;
             k = 1;
             while((R[j][i - k] != rp - k) && (k < rp))
             {
                 R[j][i + k] = min(R[j][i - k],rp - k);
                 k++;
             }
             rp = max(rp - k,0);
             i += k;
         }
     }

     s = s.substr(1,N); // remove 'guards'

     for(i = 1; i <= N; i++)
     {
        for(j = 0; j <= 1; j++)
            for(rp = R[j][i]; rp > 0; rp--)
            {
                int begin = i - rp - 1;
                int end_count = 2 * rp + j;
                int end = begin + end_count - 1;
                if (!(begin == 0  && end == N -1 ))
                {
                   string ss = s.substr(begin, end_count);
                   long long hsh = hash<string>{}(ss);
                   palins[begin].push_back(hsh);

                }
          }
     }
}
unordered_map<long long, int> palin_counts;
unordered_map<char, int> end_matches;

// Solve when at least 1 character in string is different
void solve_all_not_same(string &s)
{
    int n = s.length();
    long long count = 0;

    get_palins(s);

    long long palin_count = 0;

    // Gets all palindromes into unordered map
    for (int i = 0; i <= n; i++)
    {
        for (auto& it : palins[i])
        {
            if (palin_counts.find(it)  == palin_counts.end())
            {
                palin_counts.insert({it,1});
            }
            else
            {
                palin_counts[it]++;
            }
        }
    }

    // From total palindromes, get proper border count
    // minus end characters of substrings
    for ( auto it = palin_counts.begin(); it != palin_counts.end(); ++it )
    {
        int top = it->second - 1;

        palin_count += (top * (top + 1)) / 2;
        palin_count %= mod;
    }

    // Store string character counts in unordered map
    for (int i = 0; i <= n; i++)
    {
        char c = s[i];

        //long long hsh = hash<char>{}(c);

        if (end_matches[c] == 0)
            end_matches[c] = 1;
        else
            end_matches[c]++;

    }

    // From substring end character matches, get proper border count
    // for end characters of substrings
    for ( auto it = end_matches.begin(); it != end_matches.end(); it++ )
    {
        int f = it->second - 1;
        count += (f * (f + 1)) / 2;
    }

    cout << (count + palin_count) % mod << endl;

    for (int i = 0; i < MAX+5; i++)
        palins[i].clear();
}

int main()
{

    string s; 
    cin >> s;

    solve_all_not_same(s);

    return 0;
}

Upvotes: 0

Views: 824

Answers (1)

greybeard
greybeard

Reputation: 2467

Faced with problem X (find all palindrome substrings), you ask how to solve Y (hash substrings quickly): The XY Problem.
For palindrome detection, consider suffix arrays (one for the reverse of the input, or that appended to the input).
For fast hashes of overlapping strings, look into rolling hashes.

Upvotes: 2

Related Questions