Reputation: 21
I am trying to parallelize a piece of code that multiplies two vectors of complex floats and sums the result. To do this I am trying to use std::async with futures. My idea was to split the vector into 8 parts and perform the multiplication on each of these 8 parts in parallel before summing them for my final result. To do this I create 8 futures each containing a lambda that multiplies two vectors and sums the result. Each future is passed pointers to different positions of the vector which represents the section of the vector this particular future should act on.
However, it does not seem to give me the speed-up I expected: it has sped this section of the code up by maybe 20-30%, but that is all. In addition, the load doesn't seem to spread across my cores (4, or 8 with hyperthreading); instead it all seems to land on one core, which sits at 100%.
I have included the code below. Any suggestions would be greatly appreciated.
// Splits the element-wise product-and-sum of Input1 and Input2 into 8 tasks
// and adds up the partial sums.
// Elements handled per task. Integer division: if the size is not a multiple
// of 8, the trailing remainder elements are not included in any task.
size_t size = Input1.size()/8;
std::vector<std::future<complex<float> > > futures;
futures.reserve(8);
for(int i = 0; i<8; ++i)
{
// No launch policy is passed, so std::async uses its default policy
// (std::launch::async | std::launch::deferred): the implementation is allowed
// to defer the task and run it on the calling thread when get() is called.
futures.push_back(std::async( [](complex<float>* pos, complex<float>*pos2, size_t siz)
{
// Partial sum over siz consecutive elements starting at pos / pos2.
complex<float> resum(0,0);
for(int i = 0; i < siz; ++i)
resum += pos[i]*pos2[i];
return resum;
}, &Input1[i*size], &Input2[i*size], size));
}
// Collect the partial sums in task order; get() blocks until each is ready.
complex<float> ResSum(0,0);
for(int i = 0; i < futures.size(); ++i)
ResSum += futures.at(i).get();
Upvotes: 2
Views: 1733
Reputation: 11002
It depends on how much data you throw at it.
In the following example 4096 entries will be faster with a simple loop. But with 1000*4096 entries the parallel version is faster.
So your results of 20-30% improvement probably just fell in between that range with the hardware in question.
Here is the test program I used.
The first run is the simple loop, the second is the code from the question, and the third uses `std::launch::async`.
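Timings in microseconds (first three rows: 4096 entries; last three rows: 1000*4096 entries):

| Plain loop (First) | From question (Second) | With launch::async (Third) |
|---|---|---|
| 166 | 1067 | 607 |
| 166 | 614 | 434 |
| 166 | 523 | 509 |
| 265993 | 94633 | 66231 |
| 182981 | 60594 | 69537 |
| 237767 | 65731 | 57256 |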
Here is the live result.
#include <vector>
#include <thread>
#include <future>
#include <complex>
#include <string>
#include <iostream>
#include <chrono>
#include <random>
#include <ratio>
/// Returns the next pseudo-random value drawn from a uniform distribution
/// over the range 0 - 1, narrowed to float.
///
/// The engine is default-constructed (never reseeded) and kept in a
/// function-local static, so successive calls walk one fixed sequence.
float get_random()
{
    static std::default_random_engine engine;
    static std::uniform_real_distribution<double> unit_range(0, 1); // range 0 - 1
    const double sample = unit_range(engine);
    return static_cast<float>(sample);
}
/// Sums a[i] * b[i] over `count` consecutive elements starting at `a` / `b`.
/// Uses an unsigned index so the loop bound is compared without sign mixing.
static std::complex<float> dot_range(const std::complex<float>* a,
                                     const std::complex<float>* b,
                                     std::size_t count)
{
    std::complex<float> sum(0, 0);
    for (std::size_t i = 0; i < count; ++i)
        sum += a[i] * b[i];
    return sum;
}

/// Splits the dot product of `a` and `b` into 8 equally sized chunks, starts
/// one task per chunk through `launch(ptr_a, ptr_b, count)` (which must return
/// a std::future<std::complex<float>>), and adds up the partial sums.
///
/// The chunk length is a.size() / 8, so a remainder (size not a multiple of 8)
/// would be left out; do_tests only passes sizes that are multiples of 4096.
template <typename Launcher>
static std::complex<float> chunked_dot(const std::vector<std::complex<float>>& a,
                                       const std::vector<std::complex<float>>& b,
                                       Launcher launch)
{
    const std::size_t task_count = 8;
    const std::size_t chunk = a.size() / task_count;

    std::vector<std::future<std::complex<float>>> futures;
    futures.reserve(task_count);
    for (std::size_t task = 0; task < task_count; ++task) {
        const std::size_t offset = task * chunk;
        // data() + offset instead of &v[offset]: stays valid for empty vectors.
        futures.push_back(launch(a.data() + offset, b.data() + offset, chunk));
    }

    std::complex<float> total(0, 0);
    for (auto& pending : futures)
        total += pending.get();  // blocks until that chunk's result is ready
    return total;
}

/// Builds two fresh input vectors of `count` elements (filled with `a` and `b`),
/// times `strategy(input1, input2)` and prints the elapsed microseconds
/// followed by two tabs. Vector construction is outside the timed region.
template <typename Strategy>
static void run_timed_case(std::size_t count,
                           std::complex<float> a,
                           std::complex<float> b,
                           Strategy strategy)
{
    const std::vector<std::complex<float>> input1(count, a);
    const std::vector<std::complex<float>> input2(count, b);

    const auto start = std::chrono::high_resolution_clock::now();
    const std::complex<float> result = strategy(input1, input2);
    // Store the result through a volatile so the optimizer cannot discard the
    // measured computation as dead code.
    volatile float sink = result.real() + result.imag();
    (void)sink;
    const auto end = std::chrono::high_resolution_clock::now();

    const auto elapsed = end - start;
    std::cout << std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count() << "\t\t";
}

/// Times three ways of computing the sum of element-wise products of two
/// complex vectors and prints one line: three microsecond timings, each
/// followed by two tabs, then a newline.
///
///  1. a plain sequential loop,
///  2. 8 std::async tasks with the default launch policy (as in the question),
///  3. 8 std::async tasks with std::launch::async.
///
/// @param val1,val2   real/imaginary part used for every element of the first vector
/// @param val3,val4   real/imaginary part used for every element of the second vector
/// @param multiplier  the vectors hold 4096 * multiplier elements; expected to be
///                    non-negative (0 yields empty vectors and near-zero timings)
void do_tests(float val1, float val2, float val3, float val4, int multiplier)
{
    using Complex = std::complex<float>;
    using Samples = std::vector<Complex>;

    // Multiply in size_t so large multipliers cannot overflow int arithmetic.
    const std::size_t count =
        static_cast<std::size_t>(4096) * static_cast<std::size_t>(multiplier);
    const Complex a{val1, val2};
    const Complex b{val3, val4};

    // First: plain loop.
    run_timed_case(count, a, b, [](const Samples& x, const Samples& y) {
        return dot_range(x.data(), y.data(), x.size());
    });

    // Second: default launch policy; the implementation may defer the tasks.
    run_timed_case(count, a, b, [](const Samples& x, const Samples& y) {
        return chunked_dot(x, y, [](const Complex* pa, const Complex* pb, std::size_t n) {
            return std::async(dot_range, pa, pb, n);
        });
    });

    // Third: std::launch::async requests asynchronous execution for every task.
    run_timed_case(count, a, b, [](const Samples& x, const Samples& y) {
        return chunked_dot(x, y, [](const Complex* pa, const Complex* pb, std::size_t n) {
            return std::async(std::launch::async, dot_range, pa, pb, n);
        });
    });

    std::cout << '\n';
}
/// Benchmark driver: draws four random component values once, prints the
/// column header, then runs the three-way comparison three times at
/// multiplier 1 and three times at multiplier 1000.
int main()
{
    // Drawn once, in this order, so every run multiplies the same values.
    const float val1 = get_random();
    const float val2 = get_random();
    const float val3 = get_random();
    const float val4 = get_random();

    std::cout << "First\t\tSecond\t\tThird\n";

    const int multipliers[] = {1, 1000};
    const int repeats_per_size = 3;
    for (const int multiplier : multipliers) {
        for (int run = 0; run < repeats_per_size; ++run) {
            do_tests(val1, val2, val3, val4, multiplier);
        }
    }
}
Upvotes: 2
Reputation: 76305
As written, the call to `std::async` gets the default launch policy (`std::launch::async | std::launch::deferred`, named `launch::any` in pre-standard drafts), which allows running all the asyncs on a single thread. To insist on separate threads, pass `std::launch::async` as the first argument in the call to `std::async`.
Upvotes: 1