Reputation: 278
I was not satisfied with the performance of the below thrust::reduce_by_key
, so I rewrote it in a variety of ways with little gained benefit (including removing the permutation iterator). However, it wasn't until after replacing it with a thrust::for_each()
(see below) that capitalizes on atomicAdd()
, that I gained almost a 75x speedup! The two versions produce the exact same results. What could be the biggest cause for the dramatic performance differences?
Complete code for comparison between the two approaches:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <ctime>
#include <iostream>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/sort.h>
constexpr int NumberOfOscillators = 100;
int SeedRange = 500;
struct GetProduct
{
template<typename Tuple>
__host__ __device__
int operator()(const Tuple & t)
{
return thrust::get<0>(t) * thrust::get<1>(t);
}
};
int main()
{
using namespace std;
using namespace thrust::placeholders;
/* BEGIN INITIALIZATION */
thrust::device_vector<int> dv_OscillatorsVelocity(NumberOfOscillators);
thrust::device_vector<int> dv_outputCompare(NumberOfOscillators);
thrust::device_vector<int> dv_Connections_Strength((NumberOfOscillators - 1) * NumberOfOscillators);
thrust::device_vector<int> dv_Connections_Active((NumberOfOscillators - 1) * NumberOfOscillators);
thrust::device_vector<int> dv_Connections_TerminalOscillatorID_Map(0);
thrust::device_vector<int> dv_Permutation_Connections_To_TerminalOscillators((NumberOfOscillators - 1) * NumberOfOscillators);
thrust::device_vector<int> dv_Connection_Keys((NumberOfOscillators - 1) * NumberOfOscillators);
srand((unsigned int)time(NULL));
thrust::fill(dv_OscillatorsVelocity.begin(), dv_OscillatorsVelocity.end(), 0);
for (int c = 0; c < NumberOfOscillators * (NumberOfOscillators - 1); c++)
{
dv_Connections_Strength[c] = (rand() % SeedRange) - (SeedRange / 2);
dv_Connections_Active[c] = 0;
}
int curOscillatorIndx = -1;
for (int c = 0; c < NumberOfOscillators * NumberOfOscillators; c++)
{
if (c % NumberOfOscillators == 0)
{
curOscillatorIndx++;
}
if (c % NumberOfOscillators != curOscillatorIndx)
{
dv_Connections_TerminalOscillatorID_Map.push_back(c % NumberOfOscillators);
}
}
for (int n = 0; n < NumberOfOscillators; n++)
{
for (int p = 0; p < NumberOfOscillators - 1; p++)
{
thrust::copy_if(
thrust::device,
thrust::make_counting_iterator<int>(0),
thrust::make_counting_iterator<int>(dv_Connections_TerminalOscillatorID_Map.size()), // indices from 0 to N
dv_Connections_TerminalOscillatorID_Map.begin(), // array data
dv_Permutation_Connections_To_TerminalOscillators.begin() + (n * (NumberOfOscillators - 1)), // result will be written here
_1 == n);
}
}
for (int c = 0; c < NumberOfOscillators * (NumberOfOscillators - 1); c++)
{
dv_Connection_Keys[c] = c / (NumberOfOscillators - 1);
}
/* END INITIALIZATION */
/* BEGIN COMPARISON */
auto t = clock();
for (int x = 0; x < 5000; ++x) //Set x maximum to a reasonable number while testing performance.
{
thrust::reduce_by_key(
thrust::device,
//dv_Connection_Keys = 0,0,0,...1,1,1,...2,2,2,...3,3,3...
dv_Connection_Keys.begin(), //keys_first The beginning of the input key range.
dv_Connection_Keys.end(), //keys_last The end of the input key range.
thrust::make_permutation_iterator(
thrust::make_transform_iterator(
thrust::make_zip_iterator(
thrust::make_tuple(
dv_Connections_Strength.begin(),
dv_Connections_Active.begin()
)
),
GetProduct()
),
dv_Permutation_Connections_To_TerminalOscillators.begin()
), //values_first The beginning of the input value range.
thrust::make_discard_iterator(), //keys_output The beginning of the output key range.
dv_OscillatorsVelocity.begin() //values_output The beginning of the output value range.
);
}
std::cout << "iterations time for original: " << (clock() - t) * (1000.0 / CLOCKS_PER_SEC) << "ms\n" << endl << endl;
thrust::copy(dv_OscillatorsVelocity.begin(), dv_OscillatorsVelocity.end(), dv_outputCompare.begin());
t = clock();
for (int x = 0; x < 5000; ++x) //Set x maximum to a reasonable number while testing performance.
{
thrust::for_each(
thrust::device,
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(0) + dv_Connections_Active.size(),
[
s = dv_OscillatorsVelocity.size() - 1,
dv_b = thrust::raw_pointer_cast(dv_OscillatorsVelocity.data()),
dv_c = thrust::raw_pointer_cast(dv_Permutation_Connections_To_TerminalOscillators.data()), //3,6,9,0,7,10,1,4,11,2,5,8
dv_ppa = thrust::raw_pointer_cast(dv_Connections_Active.data()),
dv_pps = thrust::raw_pointer_cast(dv_Connections_Strength.data())
] __device__(int i) {
const int readIndex = i / s;
atomicAdd(
dv_b + readIndex,
(dv_ppa[dv_c[i]] * dv_pps[dv_c[i]])
);
}
);
}
std::cout << "iterations time for new: " << (clock() - t) * (1000.0 / CLOCKS_PER_SEC) << "ms\n" << endl << endl;
std::cout << "***" << (dv_OscillatorsVelocity == dv_outputCompare ? "success" : "fail") << "***\n";
/* END COMPARISON */
return 0;
}
Extra info.:
My results are using a single GTX 980 TI.
There are 100 * (100 - 1) = 9,900 elements in all of the "Connection" vectors.
Each of the 100 unique keys found in dv_Connection_Keys has 99 elements each.
Use this compiler option: --expt-extended-lambda
Upvotes: 1
Views: 556
Reputation: 3095
The following answer tries to explain or at least motivate the remaining difference in performance after going from a debug build to a release build as explained in Robert Crovella's answer.
As the accesses in both kernels are not coalesced due to the permutation_iterator
/indirection through dv_c
, going by the the plain number of accesses will overestimate the performance in this case. thrust::reduce_by_key
(or pretty much all Thrust algorithms) is not and can not be optimized for general permutations of the input as the performance of these bandwidth-bound kernels depends strongly on coalesced memory access. Naturally the algorithms are written such that accesses are coalesced for normal continuous input. So if you need to access the permuted state order of the data more than once (which might happen in a single reduction algorithm), it could be faster to actually permute the data in memory using thrust::gather
or thrust::scatter
once so at least all following accesses are efficient. I would not expect the for_each
solution to beat reduce_by_key
without that permutation.
Newer versions of nvcc
will try to use automatically use warp-aggregated-atomics to reduce the number of actual atomic instructions on the same address. As neighboring threads (same warp) tend to atomically write to the same address, this optimization is crucial for the performance of your custom reduction. Another important detail is that s = NumberOfOscillators
is relatively small (100
) in your code compared to typical thread-block sizes (256
, 512
, 1024
; locality of atomic writes) and the amount of parallelism in the for_each
(~NumberOfOscillators
^2). So for smaller NumberOfOscillators
I expect your custom reduction to get worse than reduce_by_key
due to the vanishing amount of parallelism, while for bigger NumberOfOscillators
you get both much more parallelism and more thread blocks/warps writing to the same location, so it is not quite clear which one will win without benchmarking it for given hardware and compiler.
Upvotes: 2
Reputation: 152164
What could be the biggest cause for the dramatic performance differences?
You are evidently building a debug project, that is your compilation settings include the -G
switch. Although you were asked for your compilation settings in the comments, you didn't mention this.
It's important.
CUDA device code can have dramatically different performance characteristics when compiled with -G
.
Don't evaluate performance of a debug project, or code compiled with -G
.
When I compile and run your code without -G
, I get:
iterations time for original: 210ms
iterations time for new: 70ms
***success***
When I compile your code with the debug switch -G
, and run, I get:
iterations time for original: 12330ms
iterations time for new: 320ms
***success***
returning to your question, that accounts for the biggest factor of the difference.
Upvotes: 3