CUDA: Is this the correct use of shared memory?

Question

I have the following CUDA kernel:

__global__ void combine_kernel(const uint64_t* __restrict__ d_D1,unsigned int hashedFrameNumber)
{
    //1D GRID OF 1D BLOCKS
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    // SIZE = 2118760
    if (tid < 2118760) 
    {
     
      __shared__ uint64_t d_D11[10];
 
      for(int i = threadIdx.x; i < 10; i++)
      {
       d_D11[i] = d_D1[i];    
      }
      __syncthreads();
      
      //POINT A
      //check if d_D1 is unshuffled
      if(tid == SIZE-1)
      {
        printf("%llx 
",d_D1[0]);
        printf("%llx 
",d_D1[1]);
        printf("%llx 
",d_D1[2]);
        printf("%llx 
",d_D1[3]);
        printf("%llx 
",d_D1[4]);
        printf("%llx 
",d_D1[5]);
        printf("%llx 
",d_D1[6]);
        printf("%llx 
",d_D1[7]);
        printf("%llx 
",d_D1[8]);
        printf("%llx 
",d_D1[9]);
        printf("

");    
      }
      
      //POINT B
      //check if shared d_D11 is unshuffled
      if(tid == SIZE-1)
      {
        printf("%llx 
",d_D11[0]);
        printf("%llx 
",d_D11[1]);
        printf("%llx 
",d_D11[2]);
        printf("%llx 
",d_D11[3]);
        printf("%llx 
",d_D11[4]);
        printf("%llx 
",d_D11[5]);
        printf("%llx 
",d_D11[6]);
        printf("%llx 
",d_D11[7]);
        printf("%llx 
",d_D11[8]);
        printf("%llx 
",d_D11[9]);
        printf("

");    
      }
      
     curandState randState;
     curand_init(hashedFrameNumber, 0, 0, &randState);
        
     if(threadIdx.x == 0)
     {       
      for (unsigned int i = 9; i > 0; i--)
      {
       size_t j = (unsigned int) (((curand(&randState) / 32768)*(i+1)) % 10);
       uint64_t t0 = d_D11[j];
       d_D11[j] = d_D11[i];
       d_D11[i] = t0;     
      }
     }
     __syncthreads();

     //POINT C
     //check if d_D1 is shuffled
      if(tid == SIZE-1)
      {
        printf("%llx 
",d_D1[0]);
        printf("%llx 
",d_D1[1]);
        printf("%llx 
",d_D1[2]);
        printf("%llx 
",d_D1[3]);
        printf("%llx 
",d_D1[4]);
        printf("%llx 
",d_D1[5]);
        printf("%llx 
",d_D1[6]);
        printf("%llx 
",d_D1[7]);
        printf("%llx 
",d_D1[8]);
        printf("%llx 
",d_D1[9]);
        printf("

");    
      }


     __syncthreads();
     
    }
}

What happens is that, when I check if d_D11 is unshuffled at POINT B, it is unshuffled if tid is between 0 and 31, otherwise is shuffled, so what am I doing wrong? Is that the correct way of using the shared memory?

d_D1 contains 10 elements. I just want to pass the 10 elements of array d_D1 to the shared array d_D11, then shuffle the shared array and use it.

CUDA: Is this the correct use of shared memory?

Answers (1)

Related Questions