My first 2D arrays CUDA

Question

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include 

#define BLOCK_SIZE 6
#define GRID_SIZE 1

__global__ void test(int A[BLOCK_SIZE][BLOCK_SIZE], int B[BLOCK_SIZE][BLOCK_SIZE], int C[BLOCK_SIZE][BLOCK_SIZE]) {

    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    C[i][j] = A[i][j] + B[i][j];

}

int main(){

    int A[BLOCK_SIZE][BLOCK_SIZE];
    int B[BLOCK_SIZE][BLOCK_SIZE];
    int C[BLOCK_SIZE][BLOCK_SIZE];

    for (int i = 0; i>> (dev_A, dev_B, dev_C);
    cudaDeviceSynchronize();

    cudaMemcpy(C, dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
}

I tried to copy this code How to use 2D Arrays in CUDA?.

Some website tell me to use something like

result[row*WIDTH + col] = array1[row*WIDTH + col] + array2[row*WIDTH + col];

but I don't know how to use it. My solution is always -858993460

talonmies · Accepted Answer

There is a lot wrong with the code you posted, and most of it probably related to the ambiguous way that C and related languages deal with statically declared multidimensional arrays and the [][] style indexing scheme it supports.

Rather than describe all the required fixes I will just leave this here:

#include 

#define BLOCK_SIZE 6
#define GRID_SIZE 1

template
struct array2D
{

    T* p;
    int lda;

    __device__ __host__ 
    array2D(T* _p, int cols) : p(_p), lda(cols) {}

    __device__ __host__
    T& operator()(int i, int j) { return p[i * lda + j]; }

    __device__ __host__
    T& operator()(int i, int j) const { return p[i * lda + j]; }
};


__global__ void test(array2D A, array2D B, array2D C) {

    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    C(i,j) = A(i,j) + B(i,j);
}

int main(){

    int A[BLOCK_SIZE][BLOCK_SIZE];
    int B[BLOCK_SIZE][BLOCK_SIZE];
    int C[BLOCK_SIZE][BLOCK_SIZE];

    for (int i = 0; i>> (array2D(dev_A, BLOCK_SIZE),
                                   array2D(dev_B, BLOCK_SIZE), 
                                   array2D(dev_C, BLOCK_SIZE));
    cudaDeviceSynchronize();

    cudaMemcpy(C, dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i



The most important feature of the code is the use of a tiny wrapper class which provides you with the (i,j) style indexing you apparently want without any complexity in the kernel code. At this point you don't even need to understand how it works, just accept that it provides you with the necessary indexing mechanism you want within the kernel and use it.

It you compile and run the code like so:

$ nvcc --std=c++11 myfirstpony.cu -o myfirstpony

$ ./myfirstpony
(0,0) = 0 {0}
(0,1) = 2 {2}
(0,2) = 4 {4}
(0,3) = 6 {6}
(0,4) = 8 {8}
(0,5) = 10 {10}
(1,0) = 2 {2}
(1,1) = 4 {4}
(1,2) = 6 {6}
(1,3) = 8 {8}
(1,4) = 10 {10}
(1,5) = 12 {12}
(2,0) = 4 {4}
(2,1) = 6 {6}
(2,2) = 8 {8}
(2,3) = 10 {10}
(2,4) = 12 {12}
(2,5) = 14 {14}
(3,0) = 6 {6}
(3,1) = 8 {8}
(3,2) = 10 {10}
(3,3) = 12 {12}
(3,4) = 14 {14}
(3,5) = 16 {16}
(4,0) = 8 {8}
(4,1) = 10 {10}
(4,2) = 12 {12}
(4,3) = 14 {14}
(4,4) = 16 {16}
(4,5) = 18 {18}
(5,0) = 10 {10}
(5,1) = 12 {12}
(5,2) = 14 {14}
(5,3) = 16 {16}
(5,4) = 18 {18}
(5,5) = 20 {20}


You can see for yourself the correctness of the result.

My first 2D arrays CUDA

Answers (2)

Related Questions