Reputation: 21
I am trying to take a sum of numbers using thrust on a GK107 [GeForce GTX 650]. I am confused to observe that the execution time of thrust::reduce increases significantly right after initializing a device_vector<curandState> in memory. Here is the sample code:
#include <iostream>
#include <stack>
#include <ctime>

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>

#include <curand.h>
#include <curand_kernel.h>

// Simple host-side timer; clock() measures time on the host, not on the device.
struct tic_toc{
    std::stack<clock_t> tictoc_stack;
    inline void tic() { tictoc_stack.push(clock()); }
    inline void toc() {
        std::cout << "Time elapsed: "
                  << ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC << "s"
                  << std::endl;
        tictoc_stack.pop();
    }
};

// Initializes one curandState per element, using the element's index as the sequence number.
struct curand_setup{
    using init_tuple = thrust::tuple<int, curandState &>;
    const unsigned long long seed;
    curand_setup(unsigned long long _seed) : seed(_seed) {}
    __device__ void operator()(init_tuple t){
        curandState s;
        int id = thrust::get<0>(t);
        curand_init(seed, id, 0, &s);
        thrust::get<1>(t) = s;
    }
};

int main(int argc, char** argv){
    int N = 1 << 18;
    std::cout << "N " << N << std::endl;
    tic_toc tt;

    thrust::device_vector<float> val(N, 1);

    // First reduction.
    tt.tic();
    float mean = thrust::reduce(val.begin(), val.end(), 0.f, thrust::plus<float>()) / N;
    tt.toc();

    // Initialize the RNG states.
    thrust::device_vector<curandState> rand_state(N);
    auto rand_init_it = thrust::make_zip_iterator(
        thrust::make_tuple(thrust::counting_iterator<int>(0), rand_state.begin()));
    thrust::for_each_n(rand_init_it, N, curand_setup(0));

    // Second reduction: this is the one that appears to be slow.
    tt.tic();
    mean = thrust::reduce(val.begin(), val.end(), 0.f, thrust::plus<float>()) / N;
    tt.toc();

    // Third reduction: fast again.
    tt.tic();
    mean = thrust::reduce(val.begin(), val.end(), 0.f, thrust::plus<float>()) / N;
    tt.toc();

    return 0;
}
and the output is:
Time elapsed: 0.000594s
Time elapsed: 5.60026s
Time elapsed: 0.001098s
The situation did not change when I wrote my own kernel for the summation, or when I copied the data to a thrust::host_vector and reduced it there.
Why is thrust::reduce so slow just after initializing thrust::device_vector<curandState>, and is there any way to avoid this problem? Any help would be appreciated.
My system is Linux Mint 18.3 with kernel 4.15.0-23-generic.
Output of nvcc --version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17
Upvotes: 1
Views: 601
Reputation: 72349
Why is thrust::reduce so slow just after initializing thrust::device_vector<curandState>?
It isn't. The source of your confusion is your time measurement, which is incorrect.
In general, thrust API calls which operate on the device are asynchronous on the host. The only exceptions are calls which return a value to the host (and thrust::reduce is one of those, since it must hand back the reduction result). As a result, the middle timed call in your code is measuring not only the execution time of thrust::reduce, but also that of the prior thrust::for_each_n call, and it is that prior call which is much slower.
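In other words, host-side timing brackets around an asynchronous launch measure almost nothing unless you synchronize first. A minimal illustrative sketch, reusing the tic_toc timer and the rand_init_it iterator from the question:
tt.tic();
thrust::for_each_n(rand_init_it, N, curand_setup(0));
tt.toc();                 // returns almost immediately: only the kernel launch is timed

tt.tic();
cudaDeviceSynchronize();  // blocks until the kernel actually finishes
tt.toc();                 // the real cost of the curand_init kernel shows up here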
You can confirm this for yourself in two ways. If you modify your thrust code like this:
tt.tic();
float mean = thrust::reduce(val.begin(), val.end(), 0.f, thrust::plus<float>()) / N;
tt.toc();

thrust::device_vector<curandState> rand_state(N);
auto rand_init_it = thrust::make_zip_iterator(
    thrust::make_tuple(thrust::counting_iterator<int>(0), rand_state.begin()));
thrust::for_each_n(rand_init_it, N, curand_setup(0));
cudaDeviceSynchronize(); // wait until the for_each_n is complete

tt.tic();
mean = thrust::reduce(val.begin(), val.end(), 0.f, thrust::plus<float>()) / N;
tt.toc();

tt.tic();
mean = thrust::reduce(val.begin(), val.end(), 0.f, thrust::plus<float>()) / N;
tt.toc();
You should get something like this:
$ nvcc -arch=sm_52 -std=c++11 -o slow_thrust slow_thrust.cu
$ ./slow_thrust
N 262144
Time elapsed: 0.000471s
Time elapsed: 0.000621s
Time elapsed: 0.000448s
i.e. when you use cudaDeviceSynchronize() to capture the runtime of the prior call, all the reduce calls have about the same runtime. Alternatively, you can run a profiling tool on your original code, something like:
$ nvprof --print-gpu-trace ./slow_thrust
N 262144
==7870== NVPROF is profiling process 7870, command: ./slow_thrust
Time elapsed: 0.000521s
Time elapsed: 0.06983s
Time elapsed: 0.000538s
==7870== Profiling application: ./slow_thrust
==7870== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
214.30ms 7.6800us (512 1 1) (256 1 1) 8 0B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>(thrust::device_ptr<float>, float) [109]
214.56ms 5.8550us (52 1 1) (256 1 1) 29 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [128]
214.58ms 2.7200us (1 1 1) (256 1 1) 27 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [136]
214.60ms 1.1840us - - - - - 4B 3.2219MB/s Device Pageable GeForce GTX 970 1 7 [CUDA memcpy DtoH]
214.98ms 221.27us (512 1 1) (256 1 1) 20 0B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>(thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW) [151]
219.51ms 69.492ms (512 1 1) (256 1 1) 108 0B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>, thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>(thrust::use_default, thrust::use_default) [160]
289.00ms 9.5360us (52 1 1) (256 1 1) 29 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [179]
289.01ms 3.4880us (1 1 1) (256 1 1) 27 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [187]
289.07ms 1.3120us - - - - - 4B 2.9075MB/s Device Pageable GeForce GTX 970 1 7 [CUDA memcpy DtoH]
289.66ms 9.9520us (52 1 1) (256 1 1) 29 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [211]
289.68ms 3.3280us (1 1 1) (256 1 1) 27 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [219]
289.69ms 1.3120us - - - - - 4B 2.9075MB/s Device Pageable GeForce GTX 970 1 7 [CUDA memcpy DtoH]
There you can see that the three calls which make up a reduce operation are taking cumulatively 8-13 microseconds each, whereas the for_each_n requires 69 milliseconds to complete.
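Side note: if you want host-side timing that accounts for asynchronous launches without scattering cudaDeviceSynchronize() calls everywhere, CUDA events measure elapsed device time directly. A minimal sketch, assuming the same val and N as in the question:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
float mean = thrust::reduce(val.begin(), val.end(), 0.f, thrust::plus<float>()) / N;
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // wait only for the stop event

float ms = 0.f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
std::cout << "Time elapsed: " << ms / 1000.0 << "s" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);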
Upvotes: 5