NoBackingDown
NoBackingDown

Reputation: 2194

Fastest way to determine whether elements of a vector y occur in a vector x

I have the following problem: I have two vectors x and y of type double that are increasingly sorted and I would like to obtain a vector z indicating whether an element of y is present in x. Up to now, I have used std::binary_search in a for-loop as illustrated below, but I think there should be a faster way making use of the fact that also x is sorted? The issue is that this needs to be super fast as it turns out to be the bottleneck in my code.

For those familiar with R, I need an equivalent to match(y, x, nomatch = 0L) > 0L.

#include <iostream>
#include <algorithm>
#include <vector>

int main() {

    using namespace std;

    vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    vector<bool> z(y.size());
    for (int i = 0; i != y.size(); ++i)
        z[i] = binary_search(x.begin(), x.end(), y[i]);

    for (vector<bool>::const_iterator i = z.begin(); i != z.end(); ++i)
        cout << *i << " ";

    return 0;
}

EDIT

Here are representative sample data for my problem:

#include <iostream>
#include <algorithm>
#include <vector>
#include <cstdlib>
#include <ctime>

// function generator:
double RandomNumber () { return (std::rand() / 10e+7); }

int main() {

    using namespace std;
    std::srand ( unsigned ( std::time(0) ) );

    // 5000 is representative
    int n = 5000;

    std::vector<double> x (n);
    std::generate (x.begin(), x.end(), RandomNumber);

    std::vector<double> y (n);
    std::generate (y.begin(), y.end(), RandomNumber);

    for(std::vector<double>::const_iterator i = x.begin(); i != x.end(); i++) {
    y.push_back(*i);
}

    std::sort(x.begin(), x.end());
    std::sort(y.begin(), y.end());

    return 0;
}

Upvotes: 4

Views: 789

Answers (8)

FrankS101
FrankS101

Reputation: 2135

I picked up all your codes, Dieter timing sample and the sample data of 5000 random doubles of the OP to perform a more complete timing of all the alternatives. This is the code:

#include <chrono>
#include <iostream>
#include <algorithm>
#include <vector>
#include <iterator>
#include <cstdlib>
#include <ctime>
#include <assert.h>
#include <deque>
#include <functional>
#include <memory>

using namespace std;

double RandomNumber () { return (std::rand() / 10e+7); }

template <typename T>
std::deque<bool> match(const std::vector<T>& y, const std::vector<T>& x)
{
    std::vector<std::reference_wrapper<const T>> z {};
    z.reserve(std::min(y.size(), x.size()));

    std::set_intersection(y.cbegin(), y.cend(),
                          x.cbegin(), x.cend(),
                          std::back_inserter(z));

    std::deque<bool> result(y.size(), false);

    for (const auto& e : z) {
        result[std::distance(std::addressof(y.front()), std::addressof(e.get()))] = true;
    }

    return result;
}

int main() {

    const int NTESTS = 10;

    long long time1 = 0;
    long long time2 = 0;
    long long time3 = 0;
    long long time3_prime = 0;
    long long time4 = 0;
    long long time5 = 0;
    long long time6 = 0;

    for (int i = 0; i < NTESTS; ++i){

        std::srand ( unsigned ( std::time(0) ) );

        // 5000 is representative
        int n = 5000;

        std::vector<double> x (n);
        std::generate (x.begin(), x.end(), RandomNumber);

        std::vector<double> y (n);
        std::generate (y.begin(), y.end(), RandomNumber);

        for(std::vector<double>::const_iterator i = x.begin(); i != x.end(); i++) {
            y.push_back(*i);
        }

        std::sort(x.begin(), x.end());
        std::sort(y.begin(), y.end());

        vector<bool> z1(y.size());
        vector<unsigned char> z2(y.size());
        vector<unsigned char> z3(y.size());
        std::deque<bool> z3_prime;
        vector<bool> z4(y.size());
        std::vector<bool> z5(y.size());
        std::vector<bool> z6(y.size());

        // Original
        {
            auto start = std::chrono::high_resolution_clock::now();

            for (size_t i = 0; i != y.size(); ++i) {
                z1[i] = binary_search(x.begin(), x.end(), y[i]);
            }
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time1 += duration.count();
        }

        // Original (replacing vector<bool> by vector<unsigned char>)
        {
            auto start = std::chrono::high_resolution_clock::now();

            for (size_t i = 0; i != y.size(); ++i) {
                z2[i] = binary_search(x.begin(), x.end(), y[i]);
            }
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time2 += duration.count();
        }

        {  // Dieter Lücking set_intersection
            auto start = std::chrono::high_resolution_clock::now();

            size_t ix = 0;
            size_t iy = 0;
            while(ix < x.size() && iy < y.size())
            {
                if(x[ix] < y[iy]) ++ix;
                else if(y[iy] < x[ix]) ++iy;
                else {
                    z3[iy] = 1;
                    // ++ix; Not this if one vector is not uniquely sorted
                    ++iy;
                }
            }
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time3 += duration.count();
        }

        // Std::set_intersection
        {
            auto start = std::chrono::high_resolution_clock::now();

            z3_prime = match(y, x);

            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time3_prime += duration.count();
        }

        { // Ed Heal
            auto start = std::chrono::high_resolution_clock::now();

            int i_x = 0, i_y = 0;
            while (i_x < x.size() && i_y < y.size())
            {
                if (x[i_x] == y[i_y]) {
                    //cout << "In both" << x[i_x] << endl;
                    z4[i_y] = true;
                    ++i_x;
                    ++i_y;
                } else if (x[i_x] < y[i_y]) {
                    ++i_x;
                } else {
                    z4[i_y] = false;
                    ++i_y;
                }
            }

           /* for (; i_y < y.size(); ++i_y) {
                //Empty
            } */
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time4 += duration.count();
        }

        { //  JacquesdeHooge
            auto start = std::chrono::high_resolution_clock::now();
            auto it_x = x.begin();
            int i = 0;
            for (; i < (int)y.size(); ++i) {
                it_x = std::lower_bound(it_x, x.end(), y[i]);
                if (it_x == x.end()) break;
                z5[i] = *it_x == y[i];
            }
            std::fill(z5.begin() + i, z5.end(), false);
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time5 += duration.count();
        }

        { // Skizz
            auto start = std::chrono::high_resolution_clock::now();
            vector<double>::iterator a = x.begin(), b = y.begin();
            int i = 0;
            while (a != x.end () && b != y.end ())
            {
                if (*a == *b) {
                    z6[i] = true;
                    ++a;
                    ++b;
                }
                else
                {
                    z6[i] = false;
                    if (*a < *b)
                    {
                        ++a;
                    }
                    else
                    {
                        ++b;
                    }
                }
                i++;
            }
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time6 += duration.count();
        }

        assert (std::equal(z1.begin(), z1.begin() + 5000, z2.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z3.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z3_prime.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z4.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z5.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z6.begin()));
    }

    cout << "Original - vector<bool>: \t\t" << time1 << " ns\n";
    cout << "Original - vector<unsigned char>: \t" << time2 << " ns\n";
    cout << "Set intersection (Daniel): \t\t" << time3_prime << " ns\n";
    cout << "Set intersection (Dieter Lücking): \t" << time3 << " ns\n";
    cout << "Ed Heal: \t\t\t\t" << time4 << " ns\n";
    cout << "JackesdeHooge: \t\t\t\t" << time5 << " ns\n";
    cout << "Skizz: \t\t\t\t\t" << time6 << " ns\n";
    cout << endl;

    return 0;
}

My results with g++ 5.2.1 -std::c++11 and -O3:

Original - vector: 10152069 ns

Original - vector: 8686619 ns

Set intersection (Daniel): 1768855 ns

Set intersection (Dieter Lücking): 1617106 ns

Ed Heal: 1446596 ns

JackesdeHooge: 3998958 ns

Skizz: 1385193 ns

*Please note Ed Heal and Skizz solutions are essentially the same.

Upvotes: 3

Daniel
Daniel

Reputation: 8441

You can use std::set_itersection:

#include <vector>
#include <algorithm>
#include <iterator>
#include <iostream>

int main()
{
    std::vector<double> x {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    std::vector<double> y {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    std::vector<double> z {};

    std::set_intersection(std::cbegin(x), std::cend(x), 
                          std::cbegin(y), std::cend(y), 
                          std::back_inserter(z));

   std::copy(std::cbegin(z), std::cend(z),
             std::ostream_iterator<double> {std::cout, " "});
}

Edit

To address Dieter Lücking point in the comments, here is a version that more closely matches R's match function:

#include <vector>
#include <deque>
#include <algorithm>
#include <iterator>
#include <functional>
#include <memory>
#include <iostream>

template <typename T>
std::deque<bool> match(const std::vector<T>& y, const std::vector<T>& x)
{
    std::vector<std::reference_wrapper<const T>> z {};
    z.reserve(std::min(y.size(), x.size()));

    std::set_intersection(std::cbegin(y), std::cend(y),
                          std::cbegin(x), std::cend(x),
                          std::back_inserter(z));

    std::deque<bool> result(y.size(), false);

    for (const auto& e : z) {
        result[std::distance(std::addressof(y.front()), std::addressof(e.get()))] = true;
    }

    return result;
}

int main()
{
    std::vector<double> x {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    std::vector<double> y {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    const auto matches = match(y, x);

    std::copy(std::cbegin(matches), std::cend(matches),
              std::ostream_iterator<bool> {std::cout});
}

Upvotes: 6

user2249683
user2249683

Reputation:

Just a timing of binary search and set intersection with the improvement of using std::vector:

#include <chrono>
#include <iostream>
#include <algorithm>
#include <vector>

int main() {
    using namespace std;

    // Original
    {
        vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
        vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

        auto start = std::chrono::high_resolution_clock::now();
        vector<bool> z(y.size());
        for (size_t i = 0; i != y.size(); ++i)
            z[i] = binary_search(x.begin(), x.end(), y[i]);
        auto stop = std::chrono::high_resolution_clock::now();
        auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
        cout << "vector<bool>: " << duration.count() << "ns\n";
        for (auto i = z.begin(); i != z.end(); ++i)
            cout << unsigned(*i) << " ";
        cout << '\n';
    }

    // Original (replacing vector<bool> by vector<unsigned char>)
    {
        vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
        vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

        auto start = std::chrono::high_resolution_clock::now();
        vector<unsigned char> z(y.size());
        for (size_t i = 0; i != y.size(); ++i)
            z[i] = binary_search(x.begin(), x.end(), y[i]);
        auto stop = std::chrono::high_resolution_clock::now();
        auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
        cout << "vector<unsigned char>: " << duration.count() << "ns\n";
        for (auto i = z.begin(); i != z.end(); ++i)
            cout << unsigned(*i) << " ";
        cout << '\n';
    }

    // Similar to std::set_intersection
    {
        vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
        vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

        auto start = std::chrono::high_resolution_clock::now();
        vector<unsigned char> z(y.size());
        size_t ix = 0;
        size_t iy = 0;
        while(ix < x.size() && iy < y.size())
        {
            if(x[ix] < y[iy]) ++ix;
            else if(y[iy] < x[ix]) ++iy;
            else {
                z[iy] = 1;
                // ++ix; Not this if one vector is not uniquely sorted
                ++iy;
            }
        }
        auto stop = std::chrono::high_resolution_clock::now();
        auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
        cout << "set intersection: " << duration.count() << "ns\n";
        for (auto i = z.begin(); i != z.end(); ++i)
            cout << unsigned(*i) << " ";
        cout << '\n';
    }
    return 0;
}

Compiled with g++ -std=c++11 -O3 (g++ 4.84) gives:

vector<bool>: 3622ns
0 0 1 0 1 0 1 1 0 
vector<unsigned char>: 1635ns
0 0 1 0 1 0 1 1 0 
set intersection: 1299ns
0 0 1 0 1 0 1 1 0 

Upvotes: 1

n. m. could be an AI
n. m. could be an AI

Reputation: 119857

When you have found an element (or a place in the array the element would have been), you don't need to consider elements that occur before that any more. So use the result of the previous find instead of x.begin().

Since std::binary_search does not return an iterator, use std::lower_bound instead. Also consider std::find (yes linear search, it might be actually faster, depending on your data).

If this doesn't bring enough improvement, try std::unordered_set instead of an array.

Upvotes: 1

Lingxi
Lingxi

Reputation: 14967

An implementation of @JacquesdeHooge's answer:

std::vector<bool> ComputeMatchFlags(const std::vector<double>& x,
                                    const std::vector<double>& y) {
  std::vector<bool> found(y.size());
  auto it_x = x.begin();
  int i = 0;
  for (; i < (int)y.size(); ++i) {
    it_x = std::lower_bound(it_x, x.end(), y[i]);
    if (it_x == x.end()) break;
    found[i] = *it_x == y[i];
  }
  std::fill(found.begin() + i, found.end(), false);
  return found;
}

Upvotes: 1

Ed Heal
Ed Heal

Reputation: 59997

Perhaps this algorithm will be better as the two vectors are sorted. The time complexity is linear.

#include <iostream>
#include <algorithm>
#include <vector>

int main() {

    using namespace std;

    vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    vector<bool> z(y.size());
    int i_x = 0, i_y = 0;
    while (i_x < x.size() && i_y < y.size())
    {
        if (x[i_x] == y[i_y]) {
            cout << "In both" << x[i_x] << endl;
            z[i_y] = true;
            ++i_x;
            ++i_y;
        } else if (x[i_x] < y[i_y]) {
            ++i_x;
        } else {
            z[i_y] = false;
            ++i_y;
        }
    }

    for (; i_y < y.size(); ++i_y) {
        //Empty
    }    
    for (vector<bool>::const_iterator i = z.begin(); i != z.end(); ++i)
        cout << *i << " ";

    return 0;
}

Upvotes: 1

Skizz
Skizz

Reputation: 71060

Off the top of my head, I can only think of this:-

vector<double>::iterator a = x.begin(), b = y.begin();

while (a != x.end () && b != y.end ())
{
  if (*a == *b)
  {
     // value is in both containers
     ++a;
  }
  else
  {
    if (*a < *b)
    {
      ++a;
    }
    else
    {
      ++b;
    }
  }
}

Upvotes: 2

Jacques de Hooge
Jacques de Hooge

Reputation: 6990

Since both vectors are sorted, you have to apply bin search only on the remainder part of the second vector.

So if you e.g. don't find x [i] in before y [j], you're certain you also won't find x [i + 1] before y [j]. In finding a match for x [i + 1] it therefore suffices to apply bin search starting with y [j].

Upvotes: 2

Related Questions