Reputation: 1
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define BLOCK_SIZE 6
#define GRID_SIZE 1
__global__ void test(int A[BLOCK_SIZE][BLOCK_SIZE], int B[BLOCK_SIZE][BLOCK_SIZE], int C[BLOCK_SIZE][BLOCK_SIZE]) {
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
C[i][j] = A[i][j] + B[i][j];
}
int main(){
int A[BLOCK_SIZE][BLOCK_SIZE];
int B[BLOCK_SIZE][BLOCK_SIZE];
int C[BLOCK_SIZE][BLOCK_SIZE];
for (int i = 0; i<BLOCK_SIZE; i++)
for (int j = 0; j<BLOCK_SIZE; j++){
A[i][j] = i + j;
B[i][j] = i + j;
}
int dev_A[BLOCK_SIZE][BLOCK_SIZE];
int dev_B[BLOCK_SIZE][BLOCK_SIZE];
int dev_C[BLOCK_SIZE][BLOCK_SIZE];
cudaMalloc((void**)&dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
cudaMalloc((void**)&dev_A, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
cudaMalloc((void**)&dev_B, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
cudaMemcpy(dev_A, A, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_B, B, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // so your threads are BLOCK_SIZE*BLOCK_SIZE, 36 in this case
dim3 dimGrid(GRID_SIZE, GRID_SIZE); // 1*1 blocks in a grid
test <<<dimGrid, dimBlock >>> (dev_A, dev_B, dev_C);
cudaDeviceSynchronize();
cudaMemcpy(C, dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
}
I tried to copy the code from How to use 2D Arrays in CUDA?. Some websites tell me to use something like
result[row*WIDTH + col] = array1[row*WIDTH + col] + array2[row*WIDTH + col];
but I don't know how to use it. My result is always -858993460.
Upvotes: 0
Views: 181
Reputation: 2795
There are two main issues with your code:
Firstly, when you define an array within function scope like this:
int dev_A[BLOCK_SIZE][BLOCK_SIZE];
This creates an array of arrays in host memory which is stored contiguously on the stack. This array can be used straight away from host code without allocating any further memory for it. It is a real C array and not a pointer. While this is fine and correct for A, B and C, it will not suffice for your declarations of dev_A, dev_B and dev_C, as you require memory allocated on the device for these. (Incidentally, the -858993460 you are seeing is 0xCCCCCCCC, the pattern MSVC's debug runtime uses to fill uninitialized stack memory; your final cudaMemcpy fails, so C is never written.)
There are a couple of ways to correct this. One way is to instead use a pointer to an array of arrays of ints. The syntax for such a declaration is as follows:
int (*dev_A)[BLOCK_SIZE][BLOCK_SIZE];
If you go by this approach, I would recommend changing your cudaMalloc and cudaMemcpy calls as follows:
cudaMalloc((void **) &dev_A, sizeof *dev_A);
// ...
cudaMemcpy(dev_A, &A, sizeof *dev_A, cudaMemcpyHostToDevice);
The difference here is that sizeof *dev_A is the same as writing sizeof(int [BLOCK_SIZE][BLOCK_SIZE]), which gives the number of bytes taken up by the entire host array, and that &A is used instead of A, since &A gives a pointer to an array of arrays, while A decays to a pointer to an array. Technically, what you already have should evaluate to the exact same values: the size of an array is equal to the size of its elements multiplied by its length, and a pointer to an array points to the same address as the first element of that array. However, this form is more correct, is consistent with how you would use cudaMalloc and cudaMemcpy with any other non-array type, and rightly treats the array of arrays as one single value:
int A, *dev_A;
cudaMalloc((void **) &dev_A, sizeof *dev_A);
cudaMemcpy(dev_A, &A, sizeof *dev_A, cudaMemcpyHostToDevice);
The other approach would be to dynamically allocate memory for multiple contiguous int [BLOCK_SIZE]s rather than a single int [BLOCK_SIZE][BLOCK_SIZE], which could be done as follows:
int (*dev_A)[BLOCK_SIZE];
// ...
cudaMalloc((void **) &dev_A, sizeof *dev_A * BLOCK_SIZE);
// ...
cudaMemcpy(dev_A, A, sizeof *dev_A * BLOCK_SIZE, cudaMemcpyHostToDevice);
This means dev_A now represents a pointer to an array of BLOCK_SIZE ints, which is the first element of a sequence of BLOCK_SIZE contiguous arrays in memory. Notice how this time A is used for cudaMemcpy rather than &A, as A's int [BLOCK_SIZE][BLOCK_SIZE] type decays to int (*)[BLOCK_SIZE], which matches the type of dev_A. Technically speaking, all the approaches mentioned so far do exactly the same thing and pass the same numerical values to the cudaMalloc and cudaMemcpy functions; however, the type of dev_A, dev_B and dev_C is important for how the arrays are used later.
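For reference, the host-side sequence for this second approach could be sketched like so (same names as above; the copy back of a result array is my addition):
    int A[BLOCK_SIZE][BLOCK_SIZE], C[BLOCK_SIZE][BLOCK_SIZE];
    int (*dev_A)[BLOCK_SIZE], (*dev_C)[BLOCK_SIZE];
    cudaMalloc((void **) &dev_A, sizeof *dev_A * BLOCK_SIZE);   // BLOCK_SIZE rows of int [BLOCK_SIZE]
    cudaMalloc((void **) &dev_C, sizeof *dev_C * BLOCK_SIZE);
    cudaMemcpy(dev_A, A, sizeof *dev_A * BLOCK_SIZE, cudaMemcpyHostToDevice);
    // ... kernel launch, see below ...
    cudaMemcpy(C, dev_C, sizeof *dev_C * BLOCK_SIZE, cudaMemcpyDeviceToHost);
    cudaFree(dev_A);
    cudaFree(dev_C);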
The second issue with your code is in the signature of the test kernel function itself. This function has parameters declared like int A[BLOCK_SIZE][BLOCK_SIZE]; however, in C (and C++), when you declare an array parameter in a function, it is adjusted to actually be a pointer to the array's element type. So int A[N] as a function parameter actually declares int *A, and the size is ignored. In the case of arrays of arrays, such as int A[N][M], this is adjusted to int (*A)[M], which means your parameters are of type int (*)[BLOCK_SIZE] (pointer to an array of BLOCK_SIZE ints) and your function currently has the following effective signature:
__global__
void test(int (*A)[BLOCK_SIZE],
          int (*B)[BLOCK_SIZE],
          int (*C)[BLOCK_SIZE])
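If you want to convince yourself of this adjustment, a quick check (a hypothetical snippet of my own, requiring C++11, not part of the original code) is to compare the parameter's type with a static_assert:
#include <type_traits>

void f(int A[BLOCK_SIZE][BLOCK_SIZE])
{
    // Inside the function, A's declared array type has been adjusted
    // to a pointer to its element type, int (*)[BLOCK_SIZE].
    static_assert(std::is_same<decltype(A), int (*)[BLOCK_SIZE]>::value,
                  "array parameter is adjusted to pointer-to-array");
}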
If you stick with this function signature, and you follow the approach of making dev_A and friends of type int (*)[BLOCK_SIZE], then your code should work as is: the expression A[i][j] in your function first locates and dereferences the ith array after the address A, then this array value decays into an int * pointer, and the jth int after this address is accessed. However, if you take the approach of declaring your device pointers as int (*dev_A)[BLOCK_SIZE][BLOCK_SIZE], then you will either have to dereference these pointers when calling your kernel like so (which should be fine, as the dereferenced array immediately decays into a pointer, so device memory is not actually accessed from host code):
test<<<dimGrid, dimBlock>>>(*dev_A, *dev_B, *dev_C);
Or alternatively, the signature of the test function can be changed as follows:
__global__
void test(int (*A)[BLOCK_SIZE][BLOCK_SIZE],
          int (*B)[BLOCK_SIZE][BLOCK_SIZE],
          int (*C)[BLOCK_SIZE][BLOCK_SIZE])
When doing so, however, these pointers to arrays must first be dereferenced before accessing their data, so the code within your function will have to be changed as follows:
(*C)[i][j] = (*A)[i][j] + (*B)[i][j];
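Put together, a rough end-to-end sketch of this pointer-to-whole-array variant (my own assembly of the pieces above, assuming the BLOCK_SIZE and GRID_SIZE defines from the question) could look like:
__global__ void test(int (*A)[BLOCK_SIZE][BLOCK_SIZE],
                     int (*B)[BLOCK_SIZE][BLOCK_SIZE],
                     int (*C)[BLOCK_SIZE][BLOCK_SIZE])
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    (*C)[i][j] = (*A)[i][j] + (*B)[i][j];
}

int main()
{
    int A[BLOCK_SIZE][BLOCK_SIZE], B[BLOCK_SIZE][BLOCK_SIZE], C[BLOCK_SIZE][BLOCK_SIZE];
    for (int i = 0; i < BLOCK_SIZE; i++)
        for (int j = 0; j < BLOCK_SIZE; j++) {
            A[i][j] = i + j;
            B[i][j] = i + j;
        }
    int (*dev_A)[BLOCK_SIZE][BLOCK_SIZE], (*dev_B)[BLOCK_SIZE][BLOCK_SIZE], (*dev_C)[BLOCK_SIZE][BLOCK_SIZE];
    cudaMalloc((void **) &dev_A, sizeof *dev_A);
    cudaMalloc((void **) &dev_B, sizeof *dev_B);
    cudaMalloc((void **) &dev_C, sizeof *dev_C);
    cudaMemcpy(dev_A, &A, sizeof *dev_A, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, &B, sizeof *dev_B, cudaMemcpyHostToDevice);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(GRID_SIZE, GRID_SIZE);
    // No dereference needed here, since the kernel now takes the pointer type directly
    test<<<dimGrid, dimBlock>>>(dev_A, dev_B, dev_C);
    cudaMemcpy(&C, dev_C, sizeof *dev_C, cudaMemcpyDeviceToHost);
    return 0;
}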
Using plain C arrays, arrays of arrays, pointers to arrays, and pointers to arrays of arrays can have quite confusing semantics, and all of these approaches require your array's size to be fixed at compile time. So instead of any of them, you may prefer to use a single linear sequence of ints and index the elements yourself, for example:
__global__
void test(int *A)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    A[row * BLOCK_SIZE + col] = 123;
}
Device memory for this can easily be allocated as follows:
int *dev_A;
cudaMalloc((void **) &dev_A, sizeof *dev_A * BLOCK_SIZE * BLOCK_SIZE);
An important note is that CUDA code is not C but actually C++; however, your code and the code discussed in this answer are valid as both C and C++ (ignoring the CUDA extensions). This may create some additional obstacles when writing C-like code, for example having to explicitly cast void * values to other pointer types, but it also allows you to make use of useful C++ features such as operator overloading, as featured in talonmies's answer, to encapsulate addressing a 2D grid of values within a single linear buffer of data (so you can write A(row, col) instead of A[row * BLOCK_SIZE + col]).
Upvotes: 1
Reputation: 72342
There is a lot wrong with the code you posted, and most of it is probably related to the ambiguous way that C and related languages deal with statically declared multidimensional arrays and the [][] style indexing scheme they support. Rather than describe all the required fixes I will just leave this here:
#include <stdio.h>
#define BLOCK_SIZE 6
#define GRID_SIZE 1
template<typename T>
struct array2D
{
T* p;
int lda;
__device__ __host__
array2D(T* _p, int cols) : p(_p), lda(cols) {}
__device__ __host__
T& operator()(int i, int j) { return p[i * lda + j]; }
__device__ __host__
T& operator()(int i, int j) const { return p[i * lda + j]; }
};
__global__ void test(array2D<int> A, array2D<int> B, array2D<int> C) {
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
C(i,j) = A(i,j) + B(i,j);
}
int main(){
int A[BLOCK_SIZE][BLOCK_SIZE];
int B[BLOCK_SIZE][BLOCK_SIZE];
int C[BLOCK_SIZE][BLOCK_SIZE];
for (int i = 0; i<BLOCK_SIZE; i++) {
for (int j = 0; j<BLOCK_SIZE; j++){
A[i][j] = i + j;
B[i][j] = i + j;
}
}
int* dev_A; cudaMalloc((void**)&dev_A, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
int* dev_B; cudaMalloc((void**)&dev_B, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
int* dev_C; cudaMalloc((void**)&dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
cudaMemcpy(dev_A, A, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_B, B, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // so your threads are BLOCK_SIZE*BLOCK_SIZE, 36 in this case
dim3 dimGrid(GRID_SIZE, GRID_SIZE); // 1*1 blocks in a grid
test <<<dimGrid, dimBlock >>> (array2D<int>(dev_A, BLOCK_SIZE),
array2D<int>(dev_B, BLOCK_SIZE),
array2D<int>(dev_C, BLOCK_SIZE));
cudaDeviceSynchronize();
cudaMemcpy(C, dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i<BLOCK_SIZE; i++) {
for (int j = 0; j<BLOCK_SIZE; j++){
printf("(%d,%d) = %d {%d}\n", i, j, C[i][j], A[i][j] + B[i][j]);
}
}
return 0;
}
The most important feature of the code is the use of a tiny wrapper class which provides the (i,j) style indexing you apparently want, without any complexity in the kernel code. At this point you don't even need to understand how it works; just accept that it provides the necessary indexing mechanism within the kernel, and use it.
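Because the wrapper's operator() is marked __device__ __host__, the same class also works on host data; for instance (a hypothetical snippet of my own, not part of the original answer):
    int buf[BLOCK_SIZE * BLOCK_SIZE] = {0};
    array2D<int> H(buf, BLOCK_SIZE);  // view the flat buffer as a BLOCK_SIZE x BLOCK_SIZE grid
    H(2, 3) = 42;                     // writes buf[2 * BLOCK_SIZE + 3]
    printf("%d\n", H(2, 3));          // prints 42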
If you compile and run the code like so:
$ nvcc --std=c++11 myfirstpony.cu -o myfirstpony
$ ./myfirstpony
(0,0) = 0 {0}
(0,1) = 2 {2}
(0,2) = 4 {4}
(0,3) = 6 {6}
(0,4) = 8 {8}
(0,5) = 10 {10}
(1,0) = 2 {2}
(1,1) = 4 {4}
(1,2) = 6 {6}
(1,3) = 8 {8}
(1,4) = 10 {10}
(1,5) = 12 {12}
(2,0) = 4 {4}
(2,1) = 6 {6}
(2,2) = 8 {8}
(2,3) = 10 {10}
(2,4) = 12 {12}
(2,5) = 14 {14}
(3,0) = 6 {6}
(3,1) = 8 {8}
(3,2) = 10 {10}
(3,3) = 12 {12}
(3,4) = 14 {14}
(3,5) = 16 {16}
(4,0) = 8 {8}
(4,1) = 10 {10}
(4,2) = 12 {12}
(4,3) = 14 {14}
(4,4) = 16 {16}
(4,5) = 18 {18}
(5,0) = 10 {10}
(5,1) = 12 {12}
(5,2) = 14 {14}
(5,3) = 16 {16}
(5,4) = 18 {18}
(5,5) = 20 {20}
You can see for yourself the correctness of the result.
Upvotes: 0