ratin_sat

Reputation: 33

CUDA, how to implement dynamic array of struct in CUDA kernel

I am trying to implement a structure that holds arrays of data, and I want those arrays to be allocated dynamically, something like:

struct myStruct {
  float3 *data0, *data1;
};

__global__ void kernel(myStruct input) {
  unsigned int N = 2;
  while(someStatements) {
    input.data0 = new float3[N];
    // do something with the data
    N *= 2;
  }
}

How can I do something like this in a CUDA kernel?

Upvotes: 3

Views: 3975

Answers (1)

talonmies

Reputation: 72348

If you are going to run this code on a compute capability 2.x or 3.x device with a recent version of CUDA, your kernel code is very nearly correct. The C++ new operator is supported in CUDA 4.x and 5.0 on Fermi and Kepler hardware. Note that memory allocated using new or malloc comes from the runtime heap on the device. It has the lifespan of the context in which it was created, but you currently cannot access it directly through the CUDA host API (for example, via cudaMemcpy or similar).
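
One practical point follows from this: in-kernel new and malloc draw from a fixed-size device runtime heap, which is fairly small by default, so large or numerous allocations can start failing unless the heap limit is raised from the host before the first kernel that allocates is launched. A minimal sketch of how to do that (the 64 MB figure here is arbitrary):

#include <cstdio>

int main(void)
{
    // Grow the device runtime heap to 64 MB so that in-kernel new/malloc
    // has more room to work with. This must happen before the heap is
    // first used, i.e. before launching any kernel which allocates.
    size_t heapsize = size_t(64) * 1024 * 1024;
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapsize);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSetLimit: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Read the limit back to confirm what was actually granted
    size_t actual = 0;
    cudaDeviceGetLimit(&actual, cudaLimitMallocHeapSize);
    fprintf(stdout, "runtime heap size: %zu bytes\n", actual);

    return 0;
}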

I turned your structure and kernel into a simple example code which you can try for yourself to see how it works:

#include <cstdio>

struct myStruct {
    float *data;
};

// Fill an array with ascending values 0, 1, 2, ...
__device__
void fill(float * x, unsigned int n)
{
    for(unsigned int i=0; i<n; i++) x[i] = (float)i;
}

// Allocate a sequence of arrays of doubling length on the device
// runtime heap and store the pointers in the structure array
__global__
void kernel(myStruct *input, const unsigned int imax)
{
    for(unsigned int i=0,N=1; i<imax; i++, N*=2) {
        float * p = new float[N];
        fill(p, N);
        input[i].data = p;
    }
}

// Gather the last element of each runtime-heap array into an ordinary
// device allocation which the host can copy from with cudaMemcpy
__global__
void kernel2(myStruct *input, float *output, const unsigned int imax)
{
    for(unsigned int i=0,N=1; i<imax; i++, N*=2) {
        output[i] = input[i].data[N-1];
    }
}

inline void gpuAssert(cudaError_t code, const char * file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
        if (Abort) exit(code);
    }
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

int main(void)
{

    const unsigned int nvals = 16;
    struct myStruct * _s;
    float * _f, * f;

    gpuErrchk( cudaMalloc((void **)&_s, sizeof(struct myStruct) * size_t(nvals)) );
    size_t sz = sizeof(float) * size_t(nvals);
    gpuErrchk( cudaMalloc((void **)&_f, sz) );
    f = new float[nvals];

    // Populate the structures with runtime-heap allocations
    kernel<<<1,1>>>(_s, nvals);
    gpuErrchk( cudaPeekAtLastError() );

    // Gather the last element of each array and copy the results back
    kernel2<<<1,1>>>(_s, _f, nvals);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaMemcpy(f, _f, sz, cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaDeviceReset() );

    for(int i=0; i<nvals; i++) {
        fprintf(stdout, "%d %f\n", i, f[i]);
    }

    delete[] f;
    return 0;
}

A few points to note:

  1. This code will only compile and run with CUDA 4.x or 5.0 on a Fermi or Kepler GPU
  2. You must pass the correct architecture for your GPU to nvcc to compile it (for example, I used nvcc -arch=sm_30 -Xptxas="-v" -o dynstruct dynstruct.cu to compile for a GTX 670 on Linux)
  3. The example code uses a "gather" kernel to copy data from the runtime-heap arrays into an allocation which the host API can access, so that the results can be printed out. This is a workaround for the limitation I mentioned earlier regarding cudaMemcpy not being able to copy directly from addresses in runtime heap memory. I was hoping this might be fixed in CUDA 5.0, but the most recent release candidate still has this restriction.
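
One further point which is not in the list above: the example never frees the arrays it allocates with new, so they occupy runtime heap until the context is destroyed (here, by cudaDeviceReset). If you need to reclaim that memory earlier, it has to be released from device code as well, because the pointers refer to the runtime heap rather than to memory from cudaMalloc. A minimal sketch (the kernel name freeStructs is purely illustrative), which could be launched after kernel2 and before the cudaDeviceReset call:

__global__
void freeStructs(myStruct *input, const unsigned int imax)
{
    // delete[] must run on the device, because these pointers came
    // from in-kernel new rather than from cudaMalloc
    for(unsigned int i=0; i<imax; i++) {
        delete[] input[i].data;
        input[i].data = 0;
    }
}

It would be launched in the same way as the other kernels, for example freeStructs<<<1,1>>>(_s, nvals);.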

Upvotes: 1
