dadson
dadson

Reputation: 57

CUDA: passing a user-defined structure to a kernel fails

Here is my problem. I have the following structure in my kernel.h.

// Vector record copied between host and device.
// NOTE(review): Data is a separately allocated array; a flat cudaMemcpy of
// this struct copies only the pointer value, not the array it points to —
// this is the root cause of the failure described below.
struct   __Q_VECTOR__{
    double* Data;       // points to Dimension doubles, malloc'd separately
    int     Dimension;  // number of doubles in Data
    int     Cluster;    // tag field (the sample code always sets 1)
};

// Convenience alias so the struct can be used without the `struct` keyword.
typedef struct __Q_VECTOR__     VQ_VECTOR;

In the kernel.cu I have the following code

// NOTE(review): this is the broken version under discussion — kept as-is,
// with review comments marking the defects.
int main(void){
 int L = 3, //.Data length
    N = 100;

VQ_VECTOR   *A,
            *device_VQ_VECTOR;

cudaError_t cudaStatus;

// Build host data: each A[i].Data is a separately malloc'd array.
// NOTE(review): malloc results are not checked.
A =   (VQ_VECTOR*)malloc(N*sizeof(VQ_VECTOR));
for(int i=0; i<N; i++){
    VQ_VECTOR a;
    a.Data = (double*)malloc(L*sizeof(double));;
    a.Cluster   =   1;
    a.Dimension =   L;
    for(int j=0; j<L; j++)
        a.Data[j]=i*j;

    A[i] = a;
}

// Print the first two elements of A.
for(int i=0; i<2; i++){
    // BUG(review): "%d" has no matching argument — undefined behavior;
    // should be printf("\nA[%d]={", i);
    printf("\nA[%d]={");
    for(int j=0; j<L; j++)
        printf("%.3f",A[i].Data[j]);
    printf("}");
}

printf("\n\n");
// Allocate device_VQ_VECTOR and copy A into it on the GPU.

cudaDeviceReset();
cudaStatus = cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR));
// BUG(review): this copies the structs SHALLOWLY — each Data field still
// holds a HOST pointer, so the kernel dereferences host memory and faults.
// Every A[i].Data array must be cudaMalloc'd and copied separately.
cudaStatus = cudaMemcpy(device_VQ_VECTOR, A, N*sizeof(VQ_VECTOR), cudaMemcpyHostToDevice);
cudaPrintfInit();
testKernel<<<N,1>>>(device_VQ_VECTOR, N);// to test and see on a single thread
cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "\n testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
}
// NOTE(review): this copy-back also only moves the struct fields, and the
// cudaStatus values assigned above are never checked before being overwritten.
cudaStatus = cudaMemcpy(A, device_VQ_VECTOR, N*sizeof(VQ_VECTOR), cudaMemcpyDeviceToHost);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "\n testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
}
for(int i=0; i<2; i++){
    // BUG(review): same missing "%d" argument as above.
    printf("\nA[%d]={");
    for(int j=0; j<L; j++)
        printf("%.3f",A[i].Data[j]);
    printf("}");
}
// NOTE(review): A and every A[i].Data leak here.
cudaFree(device_VQ_VECTOR);
 return 0;

}

When I build and run it, sometimes it prints nothing and sometimes it works.
What's wrong with my code? Could it be caused by

cudaStatus = cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR));
cudaStatus = cudaMemcpy(device_VQ_VECTOR, A, N* sizeof(VQ_VECTOR), cudaMemcpyHostToDevice);

Help please!

Upvotes: 1

Views: 970

Answers (1)

keltar
keltar

Reputation: 18409

This cannot work because the `Data` arrays are allocated separately and are not copied into device memory — only the pointer values inside the structs are. You need to allocate the arrays on the device too and make a full (deep) copy. To make it worse, you cannot access device memory directly from the host side (other than via cudaMemcpy), so you can't write e.g. cudaMalloc(&device_VQ_VECTOR[i].Data, ...) (it will crash).

Here is an example. For simplicity's sake, it drops the host-side A[i].Data arrays and then re-creates them. It isn't ideal, but it will do.

/* Vector record shared between host and device.
 * Data points to a separately allocated array of Dimension doubles, so a
 * flat copy of this struct transfers only the pointer, not the array. */
typedef struct __Q_VECTOR__ {
    double *Data;   /* array of Dimension doubles (allocated separately) */
    int Dimension;  /* element count of Data */
    int Cluster;    /* tag field (the sample code sets 1) */
} VQ_VECTOR;

/* Prints each vector's elements from device code, one thread per vector.
 * Expects a 1-D launch; threads with global index >= N exit immediately,
 * so any grid size >= N is safe. Requires cudaPrintfInit() to have been
 * called on the host (cuPrintf library). */
__global__ void testKernel(VQ_VECTOR *X, int N){
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= N) return;  // fix: guard against grids larger than N (N was unused)
    cuPrintf("\n testKernel entrance by the global threadIdx= %d\n", i);
    for(int k=0; k<X[i].Dimension; k++)
        cuPrintf("%2.2f, ",X[i].Data[k]);
    cuPrintf("\n");
}

/* Abort with a message if a CUDA API call failed. */
static void checkCuda(cudaError_t err, const char *what){
    if (err != cudaSuccess) {
        fprintf(stderr, "\n %s failed: %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

/* Demonstrates deep-copying an array of pointer-bearing structs to the GPU:
 * the struct array AND every per-element Data array must be allocated and
 * copied separately, and freed separately afterwards. */
int main(void){
    int L = 3, // .Data length
        N = 100;

    VQ_VECTOR   *A,
                *device_VQ_VECTOR;

    // Build host-side input: each A[i].Data is a separately malloc'd array.
    A = (VQ_VECTOR*)malloc(N*sizeof(VQ_VECTOR));
    if (A == NULL) { fprintf(stderr, "host malloc failed\n"); return 1; }
    for(int i=0; i<N; i++){
        VQ_VECTOR a;
        a.Data = (double*)malloc(L*sizeof(double)); // fix: stray ';;' removed
        if (a.Data == NULL) { fprintf(stderr, "host malloc failed\n"); return 1; }
        a.Cluster   = 1;
        a.Dimension = L;
        for(int j=0; j<L; j++)
            a.Data[j] = (1+i)*(1+j);

        A[i] = a;
    }

    // Print the first two elements of A.
    for(int i=0; i<2; i++){
        printf("\nA[%d]={", i);
        for(int j=0; j<L; j++)
            printf("%.3f",A[i].Data[j]);
        printf("}\n");
    }

    printf("\n\n");

    // Allocate the struct array on the device and copy the structs over.
    // After this copy each device-side Data field still holds a HOST
    // pointer; the loop below patches them one by one.
    cudaDeviceReset();
    checkCuda(cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR)),
              "cudaMalloc(device_VQ_VECTOR)");
    checkCuda(cudaMemcpy(device_VQ_VECTOR, A, N*sizeof(VQ_VECTOR),
                         cudaMemcpyHostToDevice),
              "cudaMemcpy(structs H2D)");

    for(int i = 0; i != N; ++i) {
        /* device_VQ_VECTOR[i].Data cannot be written through from the host,
         * so allocate via a host-side proxy pointer and patch the field
         * with a pointer-sized cudaMemcpy. */
        double *out;
        checkCuda(cudaMalloc(&out, L*sizeof(double)), "cudaMalloc(Data)");
        checkCuda(cudaMemcpy(out, A[i].Data, L*sizeof(double),
                             cudaMemcpyHostToDevice),
                  "cudaMemcpy(Data H2D)");
        checkCuda(cudaMemcpy(&device_VQ_VECTOR[i].Data, &out, sizeof(double*),
                             cudaMemcpyHostToDevice),
                  "cudaMemcpy(Data pointer patch)");

        // Host copy is re-created after the kernel; drop it for now.
        free(A[i].Data);
        A[i].Data = NULL;
    }

    cudaPrintfInit();
    testKernel<<<N,1>>>(device_VQ_VECTOR, N); // one thread per vector
    cudaPrintfDisplay(stdout, true);
    cudaPrintfEnd();
    checkCuda(cudaGetLastError(), "testKernel launch");

    // Copy the structs back; A[i].Data now holds DEVICE pointers.
    checkCuda(cudaMemcpy(A, device_VQ_VECTOR, N*sizeof(VQ_VECTOR),
                         cudaMemcpyDeviceToHost),
              "cudaMemcpy(structs D2H)");
    for(int i = 0; i != N; ++i) {
        double *devData = A[i].Data; // device pointer copied back in the struct
        double *array = (double*)malloc(L*sizeof(double));
        if (array == NULL) { fprintf(stderr, "host malloc failed\n"); return 1; }
        checkCuda(cudaMemcpy(array, devData, L*sizeof(double),
                             cudaMemcpyDeviceToHost),
                  "cudaMemcpy(Data D2H)");
        cudaFree(devData); // fix: per-element device arrays were leaked before
        A[i].Data = array;
    }

    // Verify the round trip by printing the first two copied-back elements.
    for(int i=0; i<2; i++){
        printf("\nA[%d]={", i);
        for(int j=0; j<L; j++)
            printf("%.3f",A[i].Data[j]);
        printf("}\n");
    }

    cudaFree(device_VQ_VECTOR);

    // fix: release all host memory (was left as a "don't forget" comment).
    for(int i=0; i<N; i++)
        free(A[i].Data);
    free(A);

    return 0;
}

Part of output would be (it is huge, I don't want to post too much):

[2, 0]: 3.00, [18, 0]: 19.00, [22, 0]: 23.00, [16, 0]: 17.00,
[24, 0]: 25.00, [19, 0]: 20.00, [4, 0]: 5.00, [23, 0]: 24.00,
[3, 0]: 4.00, [5, 0]: 6.00, [13, 0]: 14.00, [1, 0]: 2.00,
[10, 0]: 11.00, [6, 0]: 7.00, [14, 0]: 15.00, [0, 0]: 1.00, [20, 0]:

Upvotes: 2

Related Questions