Reputation: 23114
Here is the code:
#include "common/book.h"
#define N 36

__global__ void add(int *a, int *b, int *c) {
    int tid = blockIdx.x * gridDim.y * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    if(tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

int main() {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    cudaMalloc( (void**) &dev_a, N * sizeof(int));
    cudaMalloc( (void**) &dev_b, N * sizeof(int));
    cudaMalloc( (void**) &dev_c, N * sizeof(int));
    for (int i = 0; i < N; i++) {
        a[i] = -1;
        b[i] = i * i;
    }
    cudaMemcpy(
        dev_a,
        a,
        N * sizeof(int),
        cudaMemcpyHostToDevice
    );
    cudaMemcpy(
        dev_b,
        b,
        N * sizeof(int),
        cudaMemcpyHostToDevice
    );
    dim3 grid_dim(3, 2);
    dim3 block_dim(3, 2);
    add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);
    cudaMemcpy(
        c,
        dev_c,
        N * sizeof(int),
        cudaMemcpyDeviceToHost
    );
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
Basically, I am trying to add two vectors element-wise on a 3x2 grid of blocks, with each block laid out as 3x2 threads.
Here is the result when I run the compiled binary:
-1 + 0 = -1
-1 + 1 = 0
-1 + 4 = 3
-1 + 9 = 8
-1 + 16 = 15
-1 + 25 = 24
-1 + 36 = 0
-1 + 49 = 0
-1 + 64 = 0
-1 + 81 = 0
-1 + 100 = 0
-1 + 121 = 0
-1 + 144 = 143
-1 + 169 = 168
-1 + 196 = 195
-1 + 225 = 224
-1 + 256 = 255
-1 + 289 = 288
-1 + 324 = 0
-1 + 361 = 0
-1 + 400 = 0
-1 + 441 = 0
-1 + 484 = 0
-1 + 529 = 0
-1 + 576 = 575
-1 + 625 = 624
-1 + 676 = 675
-1 + 729 = 728
-1 + 784 = 783
-1 + 841 = 840
-1 + 900 = 0
-1 + 961 = 0
-1 + 1024 = 0
-1 + 1089 = 0
-1 + 1156 = 0
-1 + 1225 = 0
Apparently some blocks are just ignored. I've also tried to play around with how the tid is calculated in the kernel function add, but there are always some blocks missing.
Any suggestions?
Upvotes: 2
Views: 46
Reputation: 151963
The only problem is with your tid calculation, as you have already surmised. Your formula uses blockIdx.x but never blockIdx.y, so the two rows of blocks in your 3x2 grid compute the same set of indices, and half of the output array is never written at all, which is where the zeros in your printout come from.
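To see exactly which indices your formula touches, here's a quick host-side sketch (just looping over the same block/thread coordinates on the CPU, purely illustrative) that counts how many times each index gets written with grid_dim(3,2) and block_dim(3,2):
#include <stdio.h>

int main() {
    // CPU-side enumeration of the original tid formula for a 3x2 grid of 3x2 blocks
    const int gridX = 3, gridY = 2, blockX = 3, blockY = 2;
    int hits[36] = {0};
    for (int by = 0; by < gridY; by++)
        for (int bx = 0; bx < gridX; bx++)
            for (int ty = 0; ty < blockY; ty++)
                for (int tx = 0; tx < blockX; tx++) {
                    // original formula: note that by (blockIdx.y) never appears
                    int tid = bx * gridY * blockX * blockY + ty * blockX + tx;
                    hits[tid]++;
                }
    for (int i = 0; i < 36; i++)
        printf("index %2d written %d time(s)\n", i, hits[i]);
    return 0;
}
Indices 0-5, 12-17 and 24-29 are each written twice (once per row of blocks), while indices 6-11, 18-23 and 30-35 are never written, which matches the pattern of zeros in your output.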
There are many ways to perform the mapping and to set up the arithmetic. For general-purpose 2D grids, I find it convenient (i.e. an easy-to-remember methodology) to create 2D indices in x and y, and then use the grid width (in x) multiplied by the y index, plus the x index, to create a thread-unique 1D index:
int idy = threadIdx.y+blockDim.y*blockIdx.y; // y-index
int idx = threadIdx.x+blockDim.x*blockIdx.x; // x-index
int tid = gridDim.x*blockDim.x*idy + idx; // thread-unique 1D index
gridDim.x*blockDim.x is the grid width in x, expressed in units of threads.
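To make the arithmetic concrete with the launch configuration from the question: with grid_dim(3,2) and block_dim(3,2), the grid width is gridDim.x*blockDim.x = 3*3 = 9 threads, so the thread at blockIdx = (1,1), threadIdx = (2,1) gets idy = 1 + 2*1 = 3, idx = 2 + 3*1 = 5, and tid = 9*3 + 5 = 32. Working through all 36 thread coordinates this way, every value from 0 to 35 is produced by exactly one thread.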
When we use this general-purpose 2D indexing scheme in your code, it seems to work correctly for me:
$ cat t10.cu
#include <stdio.h>
#define N 36

__global__ void add(int *a, int *b, int *c) {
    int idy = threadIdx.y+blockDim.y*blockIdx.y;
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    int tid = gridDim.x*blockDim.x*idy + idx;
    if(tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

int main() {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    cudaMalloc( (void**) &dev_a, N * sizeof(int));
    cudaMalloc( (void**) &dev_b, N * sizeof(int));
    cudaMalloc( (void**) &dev_c, N * sizeof(int));
    for (int i = 0; i < N; i++) {
        a[i] = -1;
        b[i] = i * i;
    }
    cudaMemcpy(
        dev_a,
        a,
        N * sizeof(int),
        cudaMemcpyHostToDevice
    );
    cudaMemcpy(
        dev_b,
        b,
        N * sizeof(int),
        cudaMemcpyHostToDevice
    );
    dim3 grid_dim(3, 2);
    dim3 block_dim(3, 2);
    add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);
    cudaMemcpy(
        c,
        dev_c,
        N * sizeof(int),
        cudaMemcpyDeviceToHost
    );
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
$ nvcc -arch=sm_35 -o t10 t10.cu
$ cuda-memcheck ./t10
========= CUDA-MEMCHECK
-1 + 0 = -1
-1 + 1 = 0
-1 + 4 = 3
-1 + 9 = 8
-1 + 16 = 15
-1 + 25 = 24
-1 + 36 = 35
-1 + 49 = 48
-1 + 64 = 63
-1 + 81 = 80
-1 + 100 = 99
-1 + 121 = 120
-1 + 144 = 143
-1 + 169 = 168
-1 + 196 = 195
-1 + 225 = 224
-1 + 256 = 255
-1 + 289 = 288
-1 + 324 = 323
-1 + 361 = 360
-1 + 400 = 399
-1 + 441 = 440
-1 + 484 = 483
-1 + 529 = 528
-1 + 576 = 575
-1 + 625 = 624
-1 + 676 = 675
-1 + 729 = 728
-1 + 784 = 783
-1 + 841 = 840
-1 + 900 = 899
-1 + 961 = 960
-1 + 1024 = 1023
-1 + 1089 = 1088
-1 + 1156 = 1155
-1 + 1225 = 1224
========= ERROR SUMMARY: 0 errors
$
The above should provide the correct result. With respect to performance, this may not be the most efficient mapping for this toy problem: the threadblock size here is not a multiple of 32, which is generally not recommended for efficient CUDA programming. Rather than try to come up with the optimal mapping (in terms of performance/efficiency) for this case, my suggestion would be to reorganize your threadblocks to provide at least a multiple of 32 threads per block, and to use at least 16 or 32 threads in the x-dimension of the block, to make the indexing easy to understand and to get approximately optimal memory access performance.
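As a rough sketch of what that reorganization could look like (one possible choice; the 64x64 data size and 32x8 block shape below are just example numbers, not anything required), you can keep exactly the same indexing scheme and simply choose friendlier dimensions:
#include <stdio.h>
#define WIDTH  64                 // example x-extent of the data (multiple of 32)
#define HEIGHT 64                 // example y-extent of the data
#define N (WIDTH*HEIGHT)

__global__ void add(int *a, int *b, int *c) {
    int idy = threadIdx.y + blockDim.y * blockIdx.y;
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = gridDim.x * blockDim.x * idy + idx;   // same thread-unique 1D index
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

int main() {
    int *a = new int[N], *b = new int[N], *c = new int[N];
    for (int i = 0; i < N; i++) { a[i] = -1; b[i] = i; }
    int *dev_a, *dev_b, *dev_c;
    cudaMalloc( (void**) &dev_a, N * sizeof(int));
    cudaMalloc( (void**) &dev_b, N * sizeof(int));
    cudaMalloc( (void**) &dev_c, N * sizeof(int));
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
    dim3 block_dim(32, 8);        // 256 threads per block, 32 of them in x
    dim3 grid_dim((WIDTH  + block_dim.x - 1) / block_dim.x,
                  (HEIGHT + block_dim.y - 1) / block_dim.y);
    add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    printf("c[N-1] = %d (expect %d)\n", c[N-1], N-2);
    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
    delete[] a; delete[] b; delete[] c;
    return 0;
}
Each block then contains 256 threads (a multiple of 32), the 32 threads in x give adjacent threads adjacent memory locations to read and write, and the bounds check still protects against any rounding of the grid dimensions past N.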
Upvotes: 2