Reputation: 29
I have the following CUDA kernel:
__global__ void combine_kernel(const uint64_t* __restrict__ d_D1,unsigned int hashedFrameNumber)
{
//1D GRID OF 1D BLOCKS
int tid = threadIdx.x + blockDim.x * blockIdx.x;
// SIZE = 2118760
if (tid < 2118760)
{
__shared__ uint64_t d_D11[10];
for(int i = threadIdx.x; i < 10; i++)
{
d_D11[i] = d_D1[i];
}
__syncthreads();
//POINT A
//check if d_D1 is unshuffled
if(tid == SIZE-1)
{
printf("%llx \n",d_D1[0]);
printf("%llx \n",d_D1[1]);
printf("%llx \n",d_D1[2]);
printf("%llx \n",d_D1[3]);
printf("%llx \n",d_D1[4]);
printf("%llx \n",d_D1[5]);
printf("%llx \n",d_D1[6]);
printf("%llx \n",d_D1[7]);
printf("%llx \n",d_D1[8]);
printf("%llx \n",d_D1[9]);
printf("\n\n");
}
//POINT B
//check if shared d_D11 is unshuffled
if(tid == SIZE-1)
{
printf("%llx \n",d_D11[0]);
printf("%llx \n",d_D11[1]);
printf("%llx \n",d_D11[2]);
printf("%llx \n",d_D11[3]);
printf("%llx \n",d_D11[4]);
printf("%llx \n",d_D11[5]);
printf("%llx \n",d_D11[6]);
printf("%llx \n",d_D11[7]);
printf("%llx \n",d_D11[8]);
printf("%llx \n",d_D11[9]);
printf("\n\n");
}
curandState randState;
curand_init(hashedFrameNumber, 0, 0, &randState);
if(threadIdx.x == 0)
{
for (unsigned int i = 9; i > 0; i--)
{
size_t j = (unsigned int) (((curand(&randState) / 32768)*(i+1)) % 10);
uint64_t t0 = d_D11[j];
d_D11[j] = d_D11[i];
d_D11[i] = t0;
}
}
__syncthreads();
//POINT C
//check if d_D1 is shuffled
if(tid == SIZE-1)
{
printf("%llx \n",d_D1[0]);
printf("%llx \n",d_D1[1]);
printf("%llx \n",d_D1[2]);
printf("%llx \n",d_D1[3]);
printf("%llx \n",d_D1[4]);
printf("%llx \n",d_D1[5]);
printf("%llx \n",d_D1[6]);
printf("%llx \n",d_D1[7]);
printf("%llx \n",d_D1[8]);
printf("%llx \n",d_D1[9]);
printf("\n\n");
}
__syncthreads();
}
}
What happens is that, when I check if d_D11
is unshuffled at POINT B
, it is unshuffled if tid
is between 0 and 31, otherwise is shuffled, so what am I doing wrong? Is that the correct way of using the shared memory?
d_D1
contains 10 elements. I just want to pass the 10 elements of array d_D1
to the shared array d_D11
, then shuffle the shared array and use it.
Upvotes: 0
Views: 123
Reputation: 7265
You need a __syncthreads()
barrier before the shuffling code to prevent the previous printf()
s to print shuffled or partially shuffled values, and again before POINT C
to ensure the shuffling has finished by the time the result gets printed.
Aside from this correctness issues, your code unnecessarily initializes some elements multiple times.
Initialize your shared memory like this, assuming your blocks are one-dimensional and have at least 10 threads each:
__shared__ uint64_t d_D11[10];
unsigned int tidx = threadIdx.x;
if (tidx < 10) {
d_D11[tidx] = d_D1[tidx];
}
__syncthreads();
If your shared array may have more elements than there are threads, you can use the following idiom:
const int N = ...
__shared__ uint64_t d_D11[N];
for(int i = threadIdx.x; i < N; i += blockDim.x)
d_D11[i] = d_D1[i];
}
__syncthreads();
Upvotes: 2