Reputation: 57
Here is my problem. I have the following structure in my kernel.h
.
// One vector of the data set, as declared in kernel.h.
struct __Q_VECTOR__{
// Pointer to the vector's elements; main() fills it with a host malloc'd array.
double* Data;
// main() stores the length of Data here.
int Dimension;
// main() initialises this to 1.
int Cluster;
};
// Short alias used throughout kernel.cu.
typedef struct __Q_VECTOR__ VQ_VECTOR;
In the kernel.cu
I have the following code
// Host driver as posted in the question: builds N vectors of length L on the
// host, copies the VQ_VECTOR structs to the device, launches testKernel and
// copies the structs back.
int main(void){
int L = 3, //.Data length
N = 100;
VQ_VECTOR *A,
*device_VQ_VECTOR;
cudaError_t cudaStatus;
A = (VQ_VECTOR*)malloc(N*sizeof(VQ_VECTOR));
for(int i=0; i<N; i++){
VQ_VECTOR a;
// Each Data array is a separate host allocation.
a.Data = (double*)malloc(L*sizeof(double));;
a.Cluster = 1;
a.Dimension = L;
for(int j=0; j<L; j++)
a.Data[j]=i*j;
A[i] = a;
}
// Print the first two elements of A
for(int i=0; i<2; i++){
// NOTE(review): the "%d" conversion has no matching argument here.
printf("\nA[%d]={");
for(int j=0; j<L; j++)
printf("%.3f",A[i].Data[j]);
printf("}");
}
printf("\n\n");
// Allocate device_VQ_VECTOR on the GPU and copy A into it
cudaDeviceReset();
// NOTE(review): cudaStatus is overwritten by the next call without being checked.
cudaStatus = cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR));
// NOTE(review): this transfers only the VQ_VECTOR structs; every Data member
// still holds the address returned by the host malloc above.
cudaStatus = cudaMemcpy(device_VQ_VECTOR, A, N*sizeof(VQ_VECTOR), cudaMemcpyHostToDevice);
cudaPrintfInit();
// Launches N blocks of one thread each.
testKernel<<<N,1>>>(device_VQ_VECTOR, N);//to test and see on a single thread
cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "\n testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
return 1;
}
// NOTE(review): the status of this copy is replaced by cudaGetLastError() on
// the next line before it is inspected.
cudaStatus = cudaMemcpy(A, device_VQ_VECTOR, N*sizeof(VQ_VECTOR), cudaMemcpyDeviceToHost);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "\n testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
return 1;
}
for(int i=0; i<2; i++){
// NOTE(review): same missing "%d" argument as in the first print loop.
printf("\nA[%d]={");
for(int j=0; j<L; j++)
printf("%.3f",A[i].Data[j]);
printf("}");
}
// Only the struct array is released; A and the Data arrays are not freed here.
cudaFree(device_VQ_VECTOR);
return 0;
}
When I build and run it, sometimes it prints nothing and sometimes it works.
What's wrong with my code? Could the problem be caused by these two lines?
cudaStatus = cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR));
cudaStatus = cudaMemcpy(device_VQ_VECTOR, A, N* sizeof(VQ_VECTOR), cudaMemcpyHostToDevice);
Help please!
Upvotes: 1
Views: 970
Reputation: 18409
This cannot work because the Data arrays are allocated separately on the host and are never copied into device memory; only the structs, which still hold host pointers, are copied. You need to allocate each array on the device too and make a full (deep) copy. To make it worse, you cannot access device memory from the host side directly (other than through cudaMemcpy
), so you can't use e.g. cudaMalloc(&device_VQ_VECTOR[i].Data, ...)
(it will crash).
Here is some example code. For simplicity's sake, it drops the host-side A[i].Data
arrays and then re-creates them. That isn't ideal, but it will do.
/*
 * One vector of the data set.
 * Data points to the elements, Dimension is set to their count by main(),
 * and Cluster is an integer tag that main() initialises to 1.
 */
typedef struct __Q_VECTOR__ {
    double *Data;
    int     Dimension;
    int     Cluster;
} VQ_VECTOR;
/*
 * Debug kernel: each thread prints the contents of one vector through cuPrintf.
 *
 * Launch layout: 1-D grid of 1-D blocks; the thread with global index i
 * handles X[i]. Threads whose index is >= N do nothing, so the grid may be
 * larger than N.
 *
 * X  array of N vectors; each X[i].Data is dereferenced on the device, so it
 *    must be a device-accessible pointer to X[i].Dimension doubles.
 * N  number of entries in X.
 */
__global__ void testKernel(VQ_VECTOR *X, int N){
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    // Guard the grid tail: without this, any launch with more than N threads
    // would read past the end of X.
    if (i >= N)
        return;
    cuPrintf("\n testKernel entrance by the global threadIdx= %d\n", i);
    for(int k=0; k<X[i].Dimension; k++)
        cuPrintf("%2.2f, ",X[i].Data[k]);
    cuPrintf("\n");
}
/* Reports a failed CUDA runtime call on stderr; returns non-zero on failure. */
static int cudaFailed(cudaError_t status, const char *what){
    if (status == cudaSuccess)
        return 0;
    fprintf(stderr, "\n %s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

/*
 * Builds N vectors of length L on the host, deep-copies them to the device
 * (struct array plus one device array per vector), runs testKernel on them,
 * then deep-copies everything back and releases all host and device memory.
 *
 * Returns 0 on success and 1 on the first allocation or CUDA failure. On a
 * failure the function returns immediately without unwinding allocations;
 * process exit reclaims them.
 */
int main(void){
    const int L = 3,   //.Data length
              N = 100; // number of vectors
    VQ_VECTOR *A,
              *device_VQ_VECTOR;

    A = (VQ_VECTOR*)malloc(N*sizeof(VQ_VECTOR));
    if (A == NULL) {
        fprintf(stderr, "\n host allocation of A failed\n");
        return 1;
    }
    for(int i=0; i<N; i++){
        A[i].Data = (double*)malloc(L*sizeof(double));
        if (A[i].Data == NULL) {
            fprintf(stderr, "\n host allocation of A[%d].Data failed\n", i);
            return 1;
        }
        A[i].Cluster = 1;
        A[i].Dimension = L;
        for(int j=0; j<L; j++)
            A[i].Data[j]=(1+i)*(1+j);
    }
    // Print the first two elements of A
    for(int i=0; i<2; i++){
        printf("\nA[%d]={", i);
        for(int j=0; j<L; j++)
            printf("%.3f",A[i].Data[j]);
        printf("}\n");
    }
    printf("\n\n");

    // Allocate the struct array on the GPU
    if (cudaFailed(cudaDeviceReset(), "cudaDeviceReset"))
        return 1;
    if (cudaFailed(cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR)),
                   "cudaMalloc of the struct array"))
        return 1;

    for(int i = 0; i != N; ++i) {
        /* device_VQ_VECTOR[i].Data cannot be written from the host directly,
         * so allocate each device array through a proxy variable and patch
         * the pointer into the host-side struct before the structs are
         * uploaded in one copy below */
        double *out;
        if (cudaFailed(cudaMalloc((void**)&out, L*sizeof(double)),
                       "cudaMalloc of a Data array"))
            return 1;
        if (cudaFailed(cudaMemcpy(out, A[i].Data, L*sizeof(double),
                                  cudaMemcpyHostToDevice),
                       "upload of a Data array"))
            return 1;
        // will re-allocate later, for simplicity's sake
        free(A[i].Data);
        A[i].Data = out; // now a device pointer: do not dereference on the host
    }
    if (cudaFailed(cudaMemcpy(device_VQ_VECTOR, A, N*sizeof(VQ_VECTOR),
                              cudaMemcpyHostToDevice),
                   "upload of the struct array"))
        return 1;

    if (cudaFailed(cudaPrintfInit(), "cudaPrintfInit"))
        return 1;
    testKernel<<<N,1>>>(device_VQ_VECTOR, N);//to test and see on a single thread
    // Launch-configuration errors surface here ...
    if (cudaFailed(cudaGetLastError(), "testKernel launch"))
        return 1;
    // ... and faults raised while the kernel runs surface at the next sync.
    if (cudaFailed(cudaDeviceSynchronize(), "testKernel execution"))
        return 1;
    if (cudaFailed(cudaPrintfDisplay(stdout, true), "cudaPrintfDisplay"))
        return 1;
    cudaPrintfEnd();

    // Copy the structs back; every A[i].Data is (still) a device pointer.
    if (cudaFailed(cudaMemcpy(A, device_VQ_VECTOR, N*sizeof(VQ_VECTOR),
                              cudaMemcpyDeviceToHost),
                   "download of the struct array"))
        return 1;
    for(int i = 0; i != N; ++i) {
        double *deviceData = A[i].Data;
        // allocate array, copy data
        double *array = (double*)malloc(L*sizeof(double));
        if (array == NULL) {
            fprintf(stderr, "\n host allocation of A[%d].Data failed\n", i);
            return 1;
        }
        if (cudaFailed(cudaMemcpy(array, deviceData, L*sizeof(double),
                                  cudaMemcpyDeviceToHost),
                       "download of a Data array"))
            return 1;
        // the device copy is no longer needed once its contents are on the host
        if (cudaFailed(cudaFree(deviceData), "cudaFree of a Data array"))
            return 1;
        // assign new array to A[i]
        A[i].Data = array;
    }
    /* for(int i=0; i<2; i++){
    printf("\nA[%d]={", i);
    for(int j=0; j<L; j++)
    printf("%.3f",A[i].Data[j]);
    printf("}\n");
    }*/
    if (cudaFailed(cudaFree(device_VQ_VECTOR), "cudaFree of the struct array"))
        return 1;
    // release A and all its Data
    for(int i = 0; i != N; ++i)
        free(A[i].Data);
    free(A);
    return 0;
}
Part of output would be (it is huge, I don't want to post too much):
[2, 0]: 3.00, [18, 0]: 19.00, [22, 0]: 23.00, [16, 0]: 17.00,
[24, 0]: 25.00, [19, 0]: 20.00, [4, 0]: 5.00, [23, 0]: 24.00,
[3, 0]: 4.00, [5, 0]: 6.00, [13, 0]: 14.00, [1, 0]: 2.00,
[10, 0]: 11.00, [6, 0]: 7.00, [14, 0]: 15.00, [0, 0]: 1.00, [20, 0]:
Upvotes: 2