c++ : Is there a faster way to get the intersection of map/unordered_map?

Question

Is there a faster way to the following in c++, so that i can outperform the python's implementation?

Get intersection of two map/unordered_map keys
For these intersected keys, compute the pairwise difference between elements of their respective set/unordered_set Some info that might be useful:
hash_DICT1 has about O(10000)keys, and about O(10) elements in the each set.
hash_DICT2 has about O(1000)keys, and about O(1) elements in the each set.

For example:

    map > hash_DICT1;
        hash_DICT1[1] = {1,2,3};
        hash_DICT1[2] = {4,5,6};
    map > hash_DICT2;
        hash_DICT2[1] = {11,12,13};
        hash_DICT2[3] = {4,5,6};

    vector output_vector
        = GetPairDiff(hash_DICT1, hash_DICT2)
        = [11-1,12-1,13-1, 
           11-2,12-2,13-2, 
           11-3,12-3,13-3] // only hashkey=1 is intersect, so only compute pairwise difference of the respective set elements.
        = [10, 11, 12, 
            9, 10, 11, 
            8,  9, 10] // Note that i do want to keep duplicates, if any. Order does not matter.

GetPairDiff function.

    vector GetPairDiff(
    unordered_map > &hash_DICT1,
    unordered_map > &hash_DICT2) {
      // Init
        vector output_vector;
        int curr_key;
        set curr_set1, curr_set2;

      // Get intersection
        for (const auto &KEY_SET:hash_DICT2) {
          curr_key = KEY_SET.first;
          // Find pairwise difference
          if (hash_DICT1.count(curr_key) > 0){
            curr_set1 = hash_DICT1[curr_key];
            curr_set2 = hash_DICT2[curr_key];
            for (auto it1=curr_set1.begin(); it1 != curr_set1.end(); ++it1) {
              for (auto it2=curr_set2.begin(); it2 != curr_set2.end(); ++it2) {
                output_vector.push_back(*it2 - *it1);
              }
            }
          }
        }
    }

main run

    int main (int argc, char ** argv) {
        // Using unordered_map
        unordered_map > hash_DICT_1;
            hash_DICT_1[1] = {1,2,3};
            hash_DICT_1[2] = {4,5,6};
        unordered > hash_DICT_2;
            hash_DICT_2[1] = {11,12,13};
            hash_DICT_2[3] = {4,5,6};
        GetPairDiff(hash_DICT_1, hash_DICT_1);
    }

Compiled like this

g++ -o ./CompareRunTime.out -Ofast -Wall -Wextra -std=c++11

Other data structures are welcomed, such as map or unordered_set. However i did try all 4 permutations, and found the one given by GetPairDiff runs the fastest, but nowhere near as fast as the python's implementation:

hash_DICT1 = { 1 : {1,2,3},      2 : {4,5,6} }
hash_DICT2 = { 1 : {11,12,13},   3 : {4,5,6} }

def GetPairDiff(hash_DICT1, hash_DICT2):
    vector = []
    for element in hash_DICT1.keys() & hash_DICT2.keys():
        vector.extend(
            [db_t-qry_t 
            for qry_t in hash_DICT2[element] 
            for db_t in hash_DICT1[element] ])
    return vector

output_vector = GetPairDiff(hash_DICT1, hash_DICT2)

Performance comparison:

python  : 0.00824 s
c++     : 0.04286 s

The implementation by c++ takes about 5 times the time taken !!!

Ted Lyngmo · Accepted Answer

You do a lot of copying where you should be using const&.
You don't save search results. You should use find instead of count and then use the result.
push_back to a vector may be made faster by reserve()ing the number of elements you need to store if you know the number in advance.

Fixing these issues could result in something like this (requires C++17):

#include 
#include 
#include 
#include 
#include 

using container = std::unordered_map>;

std::vector GetPairDiff(const container& hash_DICT1,
                             const container& hash_DICT2) {
    // Init
    std::vector output_vector;

    // Get intersection
    for(auto& [curr_key2, curr_set2] : hash_DICT2) {
        // use find() instead of count()
        if(auto it1 = hash_DICT1.find(curr_key2); it1 != hash_DICT1.end()) {
            auto& curr_set1 = it1->second;

            // Reserve the space you know you'll need for this iteration. Note:
            // This might be a pessimizing optimization so try with and without it.
            output_vector.reserve(curr_set1.size() * curr_set2.size() +
                                  output_vector.size());

            // Calculate pairwise difference
            for(auto& s1v : curr_set1) {
                for(auto& s2v : curr_set2) {
                    output_vector.emplace_back(s2v - s1v);
                }
            }
        }
    }
    return output_vector;
}

int main() {
    container hash_DICT1{{1, {1, 2, 3}}, 
                         {2, {4, 5, 6}}};
    container hash_DICT2{{1, {11, 12, 13}},
                         {3, {4, 5, 6}}};

    auto result = GetPairDiff(hash_DICT1, hash_DICT2);

    for(int v : result) {
        std::cout << v << '
';
    }
}

This is more than 8 times as fast as the python version for these containers on my computer compiled with g++ -std=c++17 -O3.

Here's a C++11 version of the same program:

#include 
#include 
#include 
#include 
#include 

using container = std::unordered_map>;

std::vector GetPairDiff(const container& hash_DICT1,
                             const container& hash_DICT2) {
    // Init
    std::vector output_vector;

    // Get intersection
    for(auto& curr_pair2 : hash_DICT2) {
        auto& curr_key2 = curr_pair2.first;
        auto& curr_set2 = curr_pair2.second;
        // use find() instead of count()
        auto it1 = hash_DICT1.find(curr_key2);
        if(it1 != hash_DICT1.end()) {
            auto& curr_set1 = it1->second;

            // Reserve the space you know you'll need for this iteration. Note:
            // This might be a pessimizing optimization so try with and without it.
            output_vector.reserve(curr_set1.size() * curr_set2.size() +
                                  output_vector.size());

            // Calculate pairwise difference
            for(auto& s1v : curr_set1) {
                for(auto& s2v : curr_set2) {
                    output_vector.emplace_back(s2v - s1v);
                }
            }
        }
    }
    return output_vector;
}

int main() {
    container hash_DICT1{{1, {1, 2, 3}}, 
                         {2, {4, 5, 6}}};
    container hash_DICT2{{1, {11, 12, 13}},
                         {3, {4, 5, 6}}};

    auto result = GetPairDiff(hash_DICT1, hash_DICT2);

    for(int v : result) {
        std::cout << v << '
';
    }
}

c++ : Is there a faster way to get the intersection of map/unordered_map?

Answers (1)

Related Questions