How to find most commonly occurring non-unique keys in Boost MultiIndex?

Question

Boost MultiIndex Container, when defined to have hashed_non_unique keys, can group equivalent keys together and return them all against an equal_range query, as mentioned here. But I see no way of querying the largest range (or n largest ranges) in a set. Without comparing between the range sizes of distinct hashes, which can become computationally very expensive, is there a way to query the largest equal ranges?

If we consider a simple example, such as this one, I would like to query by frequency and get Tom as the first result, and then Jack and Leo in no particular order.

Joaqu&#237;n M L&#243;pez Mu&#241;oz · Accepted Answer

Ok, if you're using non-unique hashed indices, turns out equal_range does not invoke equality comparison for all the elements in the returned range (unlike common implementations of std::unordered_multimap, BTW), so the following can be very efficient:

template
std::multimap<
  std::size_t,
  std::reference_wrapper,
  std::greater
> group_sizes(const HashIndex& i)
{
  decltype(group_sizes(i)) res;
  for(auto it=i.begin(),end=i.end();it!=end;){
    auto next=i.equal_range(*it).second;
    res.emplace((std::size_t)std::distance(it,next),*it);
    it=next;
  }
  return res;
}

To check how efficient this actually is, let's try instrumenting the element type:

Live Coliru Demo

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

template
std::multimap<
  std::size_t,
  std::reference_wrapper,
  std::greater
> group_sizes(const HashIndex& i)
{
  decltype(group_sizes(i)) res;
  for(auto it=i.begin(),end=i.end();it!=end;){
    auto next=i.equal_range(*it).second;
    res.emplace((std::size_t)std::distance(it,next),*it);
    it=next;
  }
  return res;
}

struct instrumented_string:std::string
{
  using std::string::string;
  
  static void reset_nums()
  {
    num_hashes=0;
    num_eqs=0;
  }
  
  static std::size_t num_hashes,num_eqs;
};

std::size_t instrumented_string::num_hashes=0;
std::size_t instrumented_string::num_eqs=0;

bool operator==(const instrumented_string& x,const instrumented_string& y)
{
  ++instrumented_string::num_eqs;
  return static_cast(x)==y;
}

std::size_t hash_value(const instrumented_string& x)
{
  ++instrumented_string::num_hashes;
  return boost::hash{}(x);
}

using namespace boost::multi_index;
using container=multi_index_container<
  instrumented_string,
  indexed_by<
    hashed_non_unique>
  >
>;

int main()
{
  auto values={"Tom","Jack","Leo","Bjarne","Subhamoy"};
  container c;
  for(auto& v:values){
    for(auto i=100*std::strlen(v);i--;)c.insert(v);
  }
  
  instrumented_string::reset_nums();
  
  auto gs=group_sizes(c);
  for(const auto& g:gs){
    std::cout<


Output
800: Subhamoy
600: Bjarne
400: Jack
300: Tom
300: Leo
# hashes: 5
# eqs: 5

So, for a container with 2,400 elements, invoking group_sizes has resulted in just 5 hash calculations and 5 equality comparisons (plus ~2,400 iterator increments, of course).
If you really want to get rid of hashes, the following can do:
Live Coliru Demo
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

template
struct internal_reference
{
  const HashIndex&                      i;
  const typename HashIndex::value_type& r;
  std::size_t                           buc;
};

template
struct internal_reference_equal_to
{
  bool operator()(
    const typename HashIndex::value_type& x,
    const internal_reference& y)const
  {
    return
      std::addressof(x)==std::addressof(y.r)||
      y.i.key_eq()(y.i.key_extractor()(x),y.i.key_extractor()(y.r));
  }

  bool operator()(
    const internal_reference& x,
    const typename HashIndex::value_type& y)const
  {
    return (*this)(y,x);
  }
};

template
struct internal_reference_hash
{
  std::size_t operator()(const internal_reference& x)const
  {
    return x.buc;
  }
};

template
std::multimap<
  std::size_t,
  std::reference_wrapper,
  std::greater
> group_sizes(const HashIndex& i)
{
  decltype(group_sizes(i)) res;
  for(std::size_t buc=0,buc_count=i.bucket_count();buc{i,*it,buc},
                      internal_reference_hash{},
                      internal_reference_equal_to{});
      std::size_t dist=0;
      auto        next=it;
      while(p.first!=p.second){
        ++p.first;
        ++dist;
        ++next;
      }
      res.emplace(dist,*it);
      it=next;
    }
  }
  return res;
}

struct instrumented_string:std::string
{
  using std::string::string;
  
  static void reset_nums()
  {
    num_hashes=0;
    num_eqs=0;
  }
  
  static std::size_t num_hashes,num_eqs;
};

std::size_t instrumented_string::num_hashes=0;
std::size_t instrumented_string::num_eqs=0;

bool operator==(const instrumented_string& x,const instrumented_string& y)
{
  ++instrumented_string::num_eqs;
  return static_cast(x)==y;
}

std::size_t hash_value(const instrumented_string& x)
{
  ++instrumented_string::num_hashes;
  return boost::hash{}(x);
}

using namespace boost::multi_index;
using container=multi_index_container<
  instrumented_string,
  indexed_by<
    hashed_non_unique>
  >
>;

int main()
{
  auto values={"Tom","Jack","Leo","Bjarne","Subhamoy"};
  container c;
  for(auto& v:values){
    for(auto i=100*std::strlen(v);i--;)c.insert(v);
  }
  
  instrumented_string::reset_nums();
  
  auto gs=group_sizes(c);
  for(const auto& g:gs){
    std::cout<

Output
800: Subhamoy
600: Bjarne
400: Jack
300: Tom
300: Leo
# hashes: 0
# eqs: 0

But please bear in mind this version of group_sizes exploits the undocumented fact that elements with hash value h get placed in the bucket h%bucket_count() (or, put another way, internal_reference hashing is technically not a conformant compatible extension of the index hash function).

How to find most commonly occurring non-unique keys in Boost MultiIndex?

Answers (2)

Approach 1: BMI + RangeV3 for syntactic sugar

Approach 2: The same, but w/o Boost

Related Questions