Malacu

Reputation: 191

Use of shared memory not showing the desired result

I am trying to learn how to use shared memory to increase performance. Here I am trying to copy data from global memory to shared memory. With a single block (256 threads) it gives the right result, but with more than one block it gives random results.

#include <cuda.h>
#include <stdio.h>

__global__ void staticReverse(int *d, int n)
{
  __shared__ int s[400];

  int t = blockIdx.x * blockDim.x + threadIdx.x;
  d[t] = d[t]*d[t];
  s[t] = d[t];

  __syncthreads();

  d[t] = s[t];  
}


__global__ void dynamicReverse(int *d, int n)
{
  extern __shared__ int s[];
  int t = threadIdx.x;
  
  s[t] = d[t]*d[t];
  __syncthreads();
  d[t] = s[t];
}

int main(void)
{
  const int n = 400;
  int a[n], d[n];

  for (int i = 0; i < n; i++)
  {
    a[i] = i; 
  }

  int *d_d;
  cudaMalloc(&d_d, n * sizeof(int)); 

  // run version with static shared memory
  int block_size = 256;
  int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
  cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
  staticReverse<<<n_blocks,block_size>>>(d_d, n);
  cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; i++) 
  {
    printf("%d\n",d[i]);
  }
}
  1. What does the third argument in the

     dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n);
    

     kernel launch do? Does it allocate shared memory for the entire block or per thread?

  2. If I require more than 64kb of shared memory per multiprocessor with compute capability 5.0, what do I need to do?

Upvotes: 1

Views: 343

Answers (1)

Vitality

Reputation: 21475

In your static shared memory allocation code, you had three issues:

  1. The size of the statically allocated shared memory array should match the block size, not the size of the input array;
  2. You should use the local thread index for indexing shared memory, not the global one;
  3. You had no out-of-bounds check on the array accesses.

The dynamic shared memory allocation code had the same issues #2 and #3 as above, plus the fact that you were indexing global memory with the local thread index instead of the global one. The third kernel-launch argument specifies how many bytes of dynamic shared memory to allocate per block (not per thread). In particular, you should allocate room for 256 ints, i.e., one per thread of the block, just as in the static shared memory allocation case.
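Concretely, the launch line and the kernel-side declaration in the code below pair up like this (shown only as a fragment, to make the mapping explicit):

// Third argument: bytes of dynamic shared memory allocated per block
dynamicReverse<<<n_blocks, block_size, 256*sizeof(int)>>>(d_d, n);

// Inside the kernel, the unsized extern array refers to that per-block buffer
extern __shared__ int s[];   // here: 256 ints, one per thread of the block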

Here is the complete working code:

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/

__global__ void staticReverse(int *d, int n)
{
    // One shared element per thread of the block (block size is 256 below).
    __shared__ int s[256];

    int t = blockIdx.x * blockDim.x + threadIdx.x;

    // Square the value and stage it in shared memory via the local thread index.
    if (t < n) {
        d[t] = d[t]*d[t];
        s[threadIdx.x] = d[t];
    }

    // Every thread of the block must reach the barrier, so it sits outside the guard.
    __syncthreads();

    if (t < n) d[t] = s[threadIdx.x];
}


/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void dynamicReverse(int *d, int n)
{
    // The size of s[] is set by the third kernel-launch argument (bytes per block).
    extern __shared__ int s[];

    int t = blockIdx.x * blockDim.x + threadIdx.x;

    if (t < n) s[threadIdx.x] = d[t]*d[t];

    // Barrier outside the guard so all threads of the block reach it.
    __syncthreads();

    if (t < n) d[t] = s[threadIdx.x];
}

int main(void)
{
    const int n = 400;

    int* a = (int*) malloc(n*sizeof(int));
    int* d = (int*) malloc(n*sizeof(int));

    for (int i = 0; i < n; i++) { a[i] = i; }

    int *d_d; gpuErrchk(cudaMalloc(&d_d, n * sizeof(int))); 

    // run version with static shared memory
    int block_size = 256;
    int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);

    gpuErrchk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
    //staticReverse<<<n_blocks,block_size>>>(d_d, n);
    dynamicReverse<<<n_blocks,block_size,256*sizeof(int)>>>(d_d, n);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < n; i++) { printf("%d\n",d[i]); }

    gpuErrchk(cudaFree(d_d));
    free(a); free(d);

    return 0;
}

Upvotes: 4
