Subhamoy S.
Subhamoy S.

Reputation: 6764

How to find most commonly occurring non-unique keys in Boost MultiIndex?

Boost MultiIndex Container, when defined to have hashed_non_unique keys, can group equivalent keys together and return them all against an equal_range query, as mentioned here. But I see no way of querying the largest range (or n largest ranges) in a set. Without comparing between the range sizes of distinct hashes, which can become computationally very expensive, is there a way to query the largest equal ranges?

If we consider a simple example, such as this one, I would like to query by frequency and get Tom as the first result, and then Jack and Leo in no particular order.

Upvotes: 1

Views: 552

Answers (2)

Ok, if you're using non-unique hashed indices, turns out equal_range does not invoke equality comparison for all the elements in the returned range (unlike common implementations of std::unordered_multimap, BTW), so the following can be very efficient:

template<typename HashIndex>
std::multimap<
  std::size_t,
  std::reference_wrapper<const typename HashIndex::value_type>,
  std::greater<std::size_t>
> group_sizes(const HashIndex& i)
{
  decltype(group_sizes(i)) res;
  for(auto it=i.begin(),end=i.end();it!=end;){
    auto next=i.equal_range(*it).second;
    res.emplace((std::size_t)std::distance(it,next),*it);
    it=next;
  }
  return res;
}

To check how efficient this actually is, let's try instrumenting the element type:

Live Coliru Demo

#include <boost/multi_index_container.hpp>
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index/identity.hpp>
#include <cstring>
#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <map>

template<typename HashIndex>
std::multimap<
  std::size_t,
  std::reference_wrapper<const typename HashIndex::value_type>,
  std::greater<std::size_t>
> group_sizes(const HashIndex& i)
{
  decltype(group_sizes(i)) res;
  for(auto it=i.begin(),end=i.end();it!=end;){
    auto next=i.equal_range(*it).second;
    res.emplace((std::size_t)std::distance(it,next),*it);
    it=next;
  }
  return res;
}

struct instrumented_string:std::string
{
  using std::string::string;
  
  static void reset_nums()
  {
    num_hashes=0;
    num_eqs=0;
  }
  
  static std::size_t num_hashes,num_eqs;
};

std::size_t instrumented_string::num_hashes=0;
std::size_t instrumented_string::num_eqs=0;

bool operator==(const instrumented_string& x,const instrumented_string& y)
{
  ++instrumented_string::num_eqs;
  return static_cast<std::string>(x)==y;
}

std::size_t hash_value(const instrumented_string& x)
{
  ++instrumented_string::num_hashes;
  return boost::hash<std::string>{}(x);
}

using namespace boost::multi_index;
using container=multi_index_container<
  instrumented_string,
  indexed_by<
    hashed_non_unique<identity<instrumented_string>>
  >
>;

int main()
{
  auto values={"Tom","Jack","Leo","Bjarne","Subhamoy"};
  container c;
  for(auto& v:values){
    for(auto i=100*std::strlen(v);i--;)c.insert(v);
  }
  
  instrumented_string::reset_nums();
  
  auto gs=group_sizes(c);
  for(const auto& g:gs){
    std::cout<<g.first<<": "<<g.second.get()<<"\n";
  }
  
  std::cout<<"# hashes: "<<instrumented_string::num_hashes<<"\n";
  std::cout<<"# eqs: "<<instrumented_string::num_eqs<<"\n";
}

Output

800: Subhamoy
600: Bjarne
400: Jack
300: Tom
300: Leo
# hashes: 5
# eqs: 5

So, for a container with 2,400 elements, invoking group_sizes has resulted in just 5 hash calculations and 5 equality comparisons (plus ~2,400 iterator increments, of course).

If you really want to get rid of hashes, the following can do:

Live Coliru Demo

#include <boost/multi_index_container.hpp>
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index/identity.hpp>
#include <cstring>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <map>

template<typename HashIndex>
struct internal_reference
{
  const HashIndex&                      i;
  const typename HashIndex::value_type& r;
  std::size_t                           buc;
};

template<typename HashIndex>
struct internal_reference_equal_to
{
  bool operator()(
    const typename HashIndex::value_type& x,
    const internal_reference<HashIndex>& y)const
  {
    return
      std::addressof(x)==std::addressof(y.r)||
      y.i.key_eq()(y.i.key_extractor()(x),y.i.key_extractor()(y.r));
  }

  bool operator()(
    const internal_reference<HashIndex>& x,
    const typename HashIndex::value_type& y)const
  {
    return (*this)(y,x);
  }
};

template<typename HashIndex>
struct internal_reference_hash
{
  std::size_t operator()(const internal_reference<HashIndex>& x)const
  {
    return x.buc;
  }
};

template<typename HashIndex>
std::multimap<
  std::size_t,
  std::reference_wrapper<const typename HashIndex::value_type>,
  std::greater<std::size_t>
> group_sizes(const HashIndex& i)
{
  decltype(group_sizes(i)) res;
  for(std::size_t buc=0,buc_count=i.bucket_count();buc<buc_count;++buc){
    for(auto it=i.begin(buc),end=i.end(buc);it!=end;){
      auto        p=i.equal_range(
                      internal_reference<HashIndex>{i,*it,buc},
                      internal_reference_hash<HashIndex>{},
                      internal_reference_equal_to<HashIndex>{});
      std::size_t dist=0;
      auto        next=it;
      while(p.first!=p.second){
        ++p.first;
        ++dist;
        ++next;
      }
      res.emplace(dist,*it);
      it=next;
    }
  }
  return res;
}

struct instrumented_string:std::string
{
  using std::string::string;
  
  static void reset_nums()
  {
    num_hashes=0;
    num_eqs=0;
  }
  
  static std::size_t num_hashes,num_eqs;
};

std::size_t instrumented_string::num_hashes=0;
std::size_t instrumented_string::num_eqs=0;

bool operator==(const instrumented_string& x,const instrumented_string& y)
{
  ++instrumented_string::num_eqs;
  return static_cast<std::string>(x)==y;
}

std::size_t hash_value(const instrumented_string& x)
{
  ++instrumented_string::num_hashes;
  return boost::hash<std::string>{}(x);
}

using namespace boost::multi_index;
using container=multi_index_container<
  instrumented_string,
  indexed_by<
    hashed_non_unique<identity<instrumented_string>>
  >
>;

int main()
{
  auto values={"Tom","Jack","Leo","Bjarne","Subhamoy"};
  container c;
  for(auto& v:values){
    for(auto i=100*std::strlen(v);i--;)c.insert(v);
  }
  
  instrumented_string::reset_nums();
  
  auto gs=group_sizes(c);
  for(const auto& g:gs){
    std::cout<<g.first<<": "<<g.second.get()<<"\n";
  }
  
  std::cout<<"# hashes: "<<instrumented_string::num_hashes<<"\n";
  std::cout<<"# eqs: "<<instrumented_string::num_eqs<<"\n";
}

Output

800: Subhamoy
600: Bjarne
400: Jack
300: Tom
300: Leo
# hashes: 0
# eqs: 0

But please bear in mind this version of group_sizes exploits the undocumented fact that elements with hash value h get placed in the bucket h%bucket_count() (or, put another way, internal_reference<HashIndex> hashing is technically not a conformant compatible extension of the index hash function).

Upvotes: 2

sehe
sehe

Reputation: 392911

It seems like you might be metter served with a std::map<K, std::vector<V> > like interface here.

You would still always have to do the counting.

To have the counting done "magically" you might consider making the "bucket key" a refcounting type.

This would be more magical than I'd be comfortable with for my code-bases. In particular, copied elements could easily cause overcounting.

Approach 1: BMI + RangeV3 for syntactic sugar

Warning: I consider this "advanced", as in the learning curve might be steepish. However, when you wield Ranges with ease, this can become a great productivity boost.

Note also, this does not in any way promise to increase performance. But you should note that no elements are copied, the vector (groups) merely contains subranges, which are iterator ranges into the multi-index container.

Live On Compiler Explorer

#include <boost/multi_index/composite_key.hpp>
#include <boost/multi_index/member.hpp>
#include <boost/multi_index/ordered_index.hpp>
#include <boost/multi_index_container.hpp>
#include <iostream>
#include <iomanip>
#include <range/v3/all.hpp>
#include <fmt/ranges.h>
#include <fmt/ostream.h>

namespace bmi = boost::multi_index;
namespace vw  = ranges::views;
namespace act = ranges::actions;

struct Person {
    int m_id;
    std::string m_name;

    friend std::ostream& operator<<(std::ostream& os, Person const& p) {
        return os << "[" << p.m_id << ", " << std::quoted(p.m_name) << "]";
    }
};

typedef bmi::multi_index_container<
    Person,
    bmi::indexed_by<
        bmi::ordered_unique<bmi::member<Person, int, &Person::m_id>>,
        bmi::ordered_unique<
            bmi::tag<struct by_name_id>,
            bmi::composite_key<Person,
                bmi::member<Person, std::string, &Person::m_name>,
                bmi::member<Person, int, &Person::m_id>>
        >
    > >
    Roster;

template <typename Index, typename KeyExtractor>
std::size_t distinct(const Index& i, KeyExtractor key) {
    std::size_t res = 0;
    for (auto it = i.begin(), it_end = i.end(); it != it_end;) {
        ++res;
        it = i.upper_bound(key(*it));
    }
    return res;
}

int main() {
    Roster const r {
        {1, "Tom"},
        {2, "Jack"},
        {3, "Tom"},
        {4, "Leo"}
    };
    fmt::print("Roster: {}\n", r);

    static constexpr auto eq_   = std::equal_to<>{};
    static constexpr auto name_ = std::mem_fn(&Person::m_name);
    static constexpr auto size_ = [](auto const& r) constexpr { return std::distance(begin(r), end(r)); };
    auto& idx = r.get<by_name_id>();
    fmt::print("Distinct: {}, Index: {}\n", distinct(idx, name_), idx);

    auto by_name_ = vw::group_by([](auto const&... arg) { return eq_(name_(arg)...); });
    auto by_size_ = [](auto const&... subrange) { return (size_(subrange) > ...); };

    auto groups = idx | by_name_ | ranges::to_vector;

    for (auto&& x : groups |= act::sort(by_size_)) {
        fmt::print("#{} persons in group {}: {}\n",
                size_(x),
                name_(ranges::front(x)),
                x);
    }
}

Prints:

Roster: {[1, "Tom"], [2, "Jack"], [3, "Tom"], [4, "Leo"]}
Distinct: 3, Index: {[2, "Jack"], [4, "Leo"], [1, "Tom"], [3, "Tom"]}
#2 persons in group Tom: {[1, "Tom"], [3, "Tom"]}
#1 persons in group Jack: {[2, "Jack"]}
#1 persons in group Leo: {[4, "Leo"]}

Note, I merely kept the distinct() function from the original link. You could drop it to remove some noise.

Approach 2: The same, but w/o Boost

Multi-index seems to be supplying nothing more than the ordered container now, so let's simplify:

Live On Compiler Explorer

#include <set>
#include <iostream>
#include <iomanip>
#include <range/v3/all.hpp>
#include <fmt/ranges.h>
#include <fmt/ostream.h>

namespace vw  = ranges::views;
namespace act = ranges::actions;

struct Person {
    int m_id;
    std::string m_name;

    friend std::ostream& operator<<(std::ostream& os, Person const& p) {
        return os << "[" << p.m_id << ", " << std::quoted(p.m_name) << "]";
    }

    bool operator<(Person const& o) const { return m_name < o.m_name; }
};

int main() {
    std::multiset<Person> const r {
        {1, "Tom"},
        {2, "Jack"},
        {3, "Tom"},
        {4, "Leo"}
    };
    fmt::print("Roster: {}\n", r);

    static constexpr auto eq_   = std::equal_to<>{};
    static constexpr auto name_ = std::mem_fn(&Person::m_name);
    static constexpr auto size_ = [](auto const& r) constexpr { return std::distance(begin(r), end(r)); };
    auto by_name_ = vw::group_by([](auto const&... arg) { return eq_(name_(arg)...); });
    auto by_size_ = [](auto const&... subrange) { return (size_(subrange) > ...); };

    auto groups = r | by_name_ | ranges::to_vector;

    for (auto&& x : groups |= act::sort(by_size_)) {
        fmt::print("#{} persons in group {}: {}\n",
                size_(x),
                name_(ranges::front(x)),
                x);
    }
}

Prints

Roster: {[2, "Jack"], [4, "Leo"], [1, "Tom"], [3, "Tom"]}
#2 persons in group Tom: {[1, "Tom"], [3, "Tom"]}
#1 persons in group Jack: {[2, "Jack"]}
#1 persons in group Leo: {[4, "Leo"]}

Bonus: Slightly more simplified assuming equality operator on Person suffices: https://godbolt.org/z/58xsTK

Upvotes: 2

Related Questions