Nofal

Reputation: 370

Array of structs of arrays CUDA C

I'm fairly new to CUDA, and I've been looking around for a way to create an array of structs of arrays. I found a couple of solutions, but none gives me a clear idea.

Here Harrism explained passing a struct by value, which works fine, but when I try to add this approach to it I get an illegal memory access.

What I'm trying to achieve is an array of structs, where each struct holds a pointer to a dynamically allocated array populated on the host, so that my kernel can read values from the array at a desired index of the AoS and use them in calculations inside the kernel.

What have I not understood from these two codes, and how can I join these ideas together?
What I tried (an attempt with an array of 2 structs, each holding 1 array):

#include <stdio.h>
#include <stdlib.h>
#define N 10
__inline __host__ void gpuAssert(cudaError_t code, char *file, int line, 
    bool abort=true)
{
if (code != cudaSuccess) 
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),
file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef struct StructA {
    int* arr;
} StructA;

__global__ void kernel2(StructA *in)
{
    in[0].arr[threadIdx.x] = 0;
    in[1].arr[threadIdx.x] = 1;
    printf("d_arr = %d , d_arr2 = %d \n",in[0].arr[threadIdx.x],in[1].arr[threadIdx.x]);
}



int main(){
int* h_arr;
int* h_arr2;
h_arr = (int*)malloc(N*sizeof(int));
h_arr2 = (int*)malloc(N*sizeof(int));
StructA *h_a;
h_a = (StructA*)malloc(sizeof(StructA) * 2);
int *d_arr;
int *d_arr2;
h_arr[0]=1;h_arr[1]=2;h_arr[2]=3,h_arr[3]=4,h_arr[4]=5;h_arr[5]=6;h_arr[6]=7;h_arr[7]=8;h_arr[8]=9;h_arr[9]=10;
h_arr2[0]=1;h_arr2[1]=2;h_arr2[2]=3,h_arr2[3]=4,h_arr2[4]=5;h_arr2[5]=6;h_arr2[6]=7;h_arr2[7]=8;h_arr2[8]=9;h_arr2[9]=10;
// 1. Allocate device array.
gpuErrchk(cudaMalloc((void**) &(d_arr), sizeof(int)*N));
gpuErrchk(cudaMalloc((void**) &(d_arr2), sizeof(int)*N));

// 2. Copy array contents from host to device.
gpuErrchk(cudaMemcpy(d_arr, h_arr, sizeof(int)*N, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_arr2, h_arr2, sizeof(int)*N, cudaMemcpyHostToDevice));

// 3. Point to device pointer in host struct.
h_a[0].arr = d_arr;
h_a[1].arr = d_arr2;

// 4. Call kernel with host struct as argument
kernel2<<<1,N>>>(h_a);
gpuErrchk(cudaPeekAtLastError());
//gpuErrchk(cudaDeviceSynchronize());
// 5. Copy pointer from device to host.
gpuErrchk(cudaMemcpy(h_arr, d_arr, sizeof(int)*N, cudaMemcpyDeviceToHost));

// 6. Point to host pointer in host struct 
//    (or do something else with it if this is not needed)
//h_a.arr = h_arr;
printf("\n%d %d %d %d %d %d %d %d %d %d \n",h_arr[0],h_arr[1],h_arr[2],h_arr[3],h_arr[4],h_arr[5],h_arr[6],h_arr[7],h_arr[8],h_arr[9]);
printf("\n%d %d %d %d %d %d %d %d %d %d \n",h_arr2[0],h_arr2[1],h_arr2[2],h_arr2[3],h_arr2[4],h_arr2[5],h_arr2[6],h_arr2[7],h_arr2[8],h_arr2[9]);
return 0;
}

Upvotes: 1

Views: 1676

Answers (2)

Robert Crovella

Reputation: 151799

Your code is mostly correct.

A basic CUDA principle is that you cannot (should not) dereference a host pointer in device code, or a device pointer in host code.

This is a host pointer:

StructA *h_a;
h_a = (StructA*)malloc(sizeof(StructA) * 2);

This is passing it to device code (where it will be dereferenced):

kernel2<<<1,N>>>(h_a);

We can fix this with some additional code that copies the structs pointed to by h_a into a new set of structs in device memory pointed to by d_a, along with a corresponding change to the kernel invocation:

// 3a. Copy host structs to device
StructA *d_a;
cudaMalloc(&d_a, sizeof(StructA)*2);
cudaMemcpy(d_a, h_a, sizeof(StructA)*2, cudaMemcpyHostToDevice);


// 4. Call kernel with device struct as argument
kernel2<<<1,N>>>(d_a);

Here is a full example:

$ cat t4.cu
#include <stdio.h>
#include <stdlib.h>
#define N 10
__inline __host__ void gpuAssert(cudaError_t code, const char *file, int line,
    bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),
file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef struct StructA {
    int* arr;
} StructA;

__global__ void kernel2(StructA *in)
{
    in[0].arr[threadIdx.x] = 0;
    in[1].arr[threadIdx.x] = 1;
    printf("d_arr = %d , d_arr2 = %d \n",in[0].arr[threadIdx.x],in[1].arr[threadIdx.x]);
}



int main(){
int* h_arr;
int* h_arr2;
h_arr = (int*)malloc(N*sizeof(int));
h_arr2 = (int*)malloc(N*sizeof(int));
StructA *h_a;
h_a = (StructA*)malloc(sizeof(StructA) * 2);
int *d_arr;
int *d_arr2;
h_arr[0]=1;h_arr[1]=2;h_arr[2]=3,h_arr[3]=4,h_arr[4]=5;h_arr[5]=6;h_arr[6]=7;h_arr[7]=8;h_arr[8]=9;h_arr[9]=10;
h_arr2[0]=1;h_arr2[1]=2;h_arr2[2]=3,h_arr2[3]=4,h_arr2[4]=5;h_arr2[5]=6;h_arr2[6]=7;h_arr2[7]=8;h_arr2[8]=9;h_arr2[9]=10;
// 1. Allocate device array.
gpuErrchk(cudaMalloc((void**) &(d_arr), sizeof(int)*N));
gpuErrchk(cudaMalloc((void**) &(d_arr2), sizeof(int)*N));

// 2. Copy array contents from host to device.
gpuErrchk(cudaMemcpy(d_arr, h_arr, sizeof(int)*N, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_arr2, h_arr2, sizeof(int)*N, cudaMemcpyHostToDevice));

// 3. Point to device pointer in host struct.
h_a[0].arr = d_arr;
h_a[1].arr = d_arr2;

// 3a. Copy host structs to device
StructA *d_a;
cudaMalloc(&d_a, sizeof(StructA)*2);
cudaMemcpy(d_a, h_a, sizeof(StructA)*2, cudaMemcpyHostToDevice);


// 4. Call kernel with device struct as argument
kernel2<<<1,N>>>(d_a);
gpuErrchk(cudaPeekAtLastError());
//gpuErrchk(cudaDeviceSynchronize());
// 5. Copy pointer from device to host.
gpuErrchk(cudaMemcpy(h_arr, d_arr, sizeof(int)*N, cudaMemcpyDeviceToHost));

// 6. Point to host pointer in host struct
//    (or do something else with it if this is not needed)
//h_a.arr = h_arr;
printf("\n%d %d %d %d %d %d %d %d %d %d \n",h_arr[0],h_arr[1],h_arr[2],h_arr[3],h_arr[4],h_arr[5],h_arr[6],h_arr[7],h_arr[8],h_arr[9]);
printf("\n%d %d %d %d %d %d %d %d %d %d \n",h_arr2[0],h_arr2[1],h_arr2[2],h_arr2[3],h_arr2[4],h_arr2[5],h_arr2[6],h_arr2[7],h_arr2[8],h_arr2[9]);
return 0;
}
$ nvcc -o t4 t4.cu
$ ./t4
d_arr = 0 , d_arr2 = 1
d_arr = 0 , d_arr2 = 1
d_arr = 0 , d_arr2 = 1
d_arr = 0 , d_arr2 = 1
d_arr = 0 , d_arr2 = 1
d_arr = 0 , d_arr2 = 1
d_arr = 0 , d_arr2 = 1
d_arr = 0 , d_arr2 = 1
d_arr = 0 , d_arr2 = 1
d_arr = 0 , d_arr2 = 1

0 0 0 0 0 0 0 0 0 0

1 2 3 4 5 6 7 8 9 10
$

Note that the last lines of printout don't show the second array updated on the host, because you have not copied that array back from device memory to host memory (there is only one cudaMemcpy statement after your kernel code). You can fix that with another cudaMemcpy statement. I've also added const to your gpuAssert to get rid of annoying compiler warning spew.
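For example, mirroring the existing copy of the first array, something like:

gpuErrchk(cudaMemcpy(h_arr2, d_arr2, sizeof(int)*N, cudaMemcpyDeviceToHost));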

This answer may give you some other ideas about how to handle arrays of pointers.

Upvotes: 2

einpoklum

Reputation: 131405

In your code, you're passing h_a to the kernel. h_a is a host-side C array. Such arrays decay into a pointer to their first element when passed as parameters to functions; see:

What is array decaying?

So what your kernel gets is the address of a host-side StructA - and it can't use that. You could:

  • Copy h_a to the device side (say, into a d_a) and use that - the decaying would be fine since it's a device-side address you'll be subscripting.
  • Use a fixed-size std::array, which doesn't decay.
  • Allocate h_a to be accessible from the device as well - using cudaMallocManaged(). See this presentation for more information; a minimal sketch of this option follows the list.
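
If it helps, here's a minimal, untested sketch of the cudaMallocManaged() option from the last bullet, reusing StructA, kernel2, N and the gpuErrchk macro from the question. With managed memory the same pointers are valid on both host and device, so no explicit cudaMemcpy is needed:

// allocate the struct array and both int arrays as managed memory
StructA *a;
gpuErrchk(cudaMallocManaged(&a, 2 * sizeof(StructA)));
gpuErrchk(cudaMallocManaged(&a[0].arr, N * sizeof(int)));
gpuErrchk(cudaMallocManaged(&a[1].arr, N * sizeof(int)));
// ... fill a[0].arr and a[1].arr on the host here ...
kernel2<<<1,N>>>(a);
gpuErrchk(cudaDeviceSynchronize());  // wait before touching the arrays on the host again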

Having said that - I have a feeling you shouldn't be using that data structure at all. Why go through so much pointer dereferencing, with different, independent, arbitrary pointers in each element of the outer array? That seems rather inefficient. I'd just try to arrange my data differently.
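
For instance, here's a sketch of one possible rearrangement (the names kernel_flat and d_data are made up for illustration, and it assumes the N, h_arr and h_arr2 from the question): keep all the values in a single contiguous device allocation and index it as structIdx * N + i, which removes the inner pointers entirely.

__global__ void kernel_flat(int *data)   // data holds 2*N ints, one "row" per former struct
{
    data[0 * N + threadIdx.x] = 0;       // what was in[0].arr[threadIdx.x]
    data[1 * N + threadIdx.x] = 1;       // what was in[1].arr[threadIdx.x]
}

// host side:
int *d_data;
gpuErrchk(cudaMalloc(&d_data, 2 * N * sizeof(int)));
gpuErrchk(cudaMemcpy(d_data,     h_arr,  N * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_data + N, h_arr2, N * sizeof(int), cudaMemcpyHostToDevice));
kernel_flat<<<1,N>>>(d_data);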

Upvotes: 0
