Reputation: 33
I am trying to implement a structure that holds arrays of data and I want to implement dynamic array, something like:
struct myStruct {
float3 *data0, *data1;
};
__global__ void kernel(myStruct input) {
unsigned int N = 2;
while(someStatements) {
data0 = new float3[N];
// do somethings
N *= 2;
}
}
How can I do something like this in a CUDA kernel?
Upvotes: 3
Views: 3975
Reputation: 72348
If you are going to run this code on either a compute capability 2.x or 3,x device, with a recent version of CUDA, your kernel code is very nearly correct. The C++ new
operator is supported in CUDA 4.x and 5.0 on Fermi and Kepler hardware. Note that memory which is allocated using new
or malloc
is allocated on runtime heap on the device. It has the lifespan of the context in which is was created, but you currently cannot directly access it from the CUDA host API (so via cudaMemcpy
or similar).
I turned your structure and kernel into a simple example code which you can try for yourself to see how it works:
#include <cstdio>
struct myStruct {
float *data;
};
__device__
void fill(float * x, unsigned int n)
{
for(int i=0; i<n; i++) x[i] = (float)i;
}
__global__
void kernel(myStruct *input, const unsigned int imax)
{
for(unsigned int i=0,N=1; i<imax; i++, N*=2) {
float * p = new float[N];
fill(p, N);
input[i].data = p;
}
}
__global__
void kernel2(myStruct *input, float *output, const unsigned int imax)
{
for(unsigned int i=0,N=1; i<imax; i++, N*=2) {
output[i] = input[i].data[N-1];
}
}
inline void gpuAssert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
int main(void)
{
const unsigned int nvals = 16;
struct myStruct * _s;
float * _f, * f;
gpuErrchk( cudaMalloc((void **)&_s, sizeof(struct myStruct) * size_t(nvals)) );
size_t sz = sizeof(float) * size_t(nvals);
gpuErrchk( cudaMalloc((void **)&_f, sz) );
f = new float[nvals];
kernel<<<1,1>>>(_s, nvals);
gpuErrchk( cudaPeekAtLastError() );
kernel2<<<1,1>>>(_s, _f, nvals);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaMemcpy(f, _f, sz, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaDeviceReset() );
for(int i=0; i<nvals; i++) {
fprintf(stdout, "%d %f\n", i, f[i]);
}
return 0;
}
A few points to note:
nvcc -arch=sm_30 -Xptxas="-v" -o dynstruct dynstruct.cu
to compile for a GTX 670 on linux)cudaMemcpy
not being able to copy directly from addresses in runtime heap memory. I was hoping this might be fixed in CUDA 5.0, but the most recent release candidate still has this restriction.Upvotes: 1