Reputation: 340
I am working on learning CUDA right now. I have some basic experience with MPI, so I figured I'd start with some really simple vector operations. I am trying to write a parallelized dot product. I am either having trouble allocating/writing memory on the CUDA device, or I am not correctly bringing it back to the host (cudaMemcpy()).
/* Code for a CUDA test project doing a basic dot product with doubles */
#include <stdio.h>
#include <cuda.h>
__global__ void GPU_parallelDotProduct(double *array_a, double *array_b, double *dot){
    dot[0] += array_a[threadIdx.x] * array_b[threadIdx.x];
}
__global__ void GPU_parallelSetupVector(double *vector, int dim, int incrSize, int start){
    if(threadIdx.x < dim){
        vector[threadIdx.x] = start + threadIdx.x * incrSize;
    }
}
__host__ void CPU_serialDot(double *first, double *second, double *dot, int dim){
    for(int i=0; i<dim; ++i){
        dot[0] += first[i] * second[i];
    }
}

__host__ void CPU_serialSetupVector(double *vector, int dim, int incrSize, int start){
    for(int i=0; i<dim; ++i){
        vector[i] = start + i * incrSize;
    }
}
int main(){
    //define array size to be used
    //int i,j;
    int VECTOR_LENGTH = 8;
    int ELEMENT_SIZE = sizeof(double);
    //arrays for dot product
    //host
    double *array_a = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
    double *array_b = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
    double *dev_dot_product = (double*) malloc(ELEMENT_SIZE);
    double host_dot_product = 0.0;
    //fill with values
    CPU_serialSetupVector(array_a, VECTOR_LENGTH, 1, 0);
    CPU_serialSetupVector(array_b, VECTOR_LENGTH, 1, 0);
    //host dot
    CPU_serialDot(array_a, array_b, &host_dot_product, VECTOR_LENGTH);
    //device
    double *dev_array_a;
    double *dev_array_b;
    double *dev_dot;
    //allocate cuda memory
    cudaMalloc((void**)&dev_array_a, ELEMENT_SIZE * VECTOR_LENGTH);
    cudaMalloc((void**)&dev_array_b, ELEMENT_SIZE * VECTOR_LENGTH);
    cudaMalloc((void**)&dev_dot, ELEMENT_SIZE);
    //copy from host to device
    cudaMemcpy(dev_array_a, array_a, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_array_b, array_b, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_dot, &dev_dot_product, ELEMENT_SIZE, cudaMemcpyHostToDevice);
    //init vectors
    //GPU_parallelSetupVector<<<1, VECTOR_LENGTH>>>(dev_array_a, VECTOR_LENGTH, 1, 0);
    //GPU_parallelSetupVector<<<1, VECTOR_LENGTH>>>(dev_array_b, VECTOR_LENGTH, 1, 0);
    //GPU_parallelSetupVector<<<1, 1>>>(dev_dot, VECTOR_LENGTH, 0, 0);
    //perform CUDA dot product
    GPU_parallelDotProduct<<<1, VECTOR_LENGTH>>>(dev_array_a, dev_array_b, dev_dot);
    //get computed product back to the machine
    cudaMemcpy(dev_dot, dev_dot_product, ELEMENT_SIZE, cudaMemcpyDeviceToHost);
    FILE *output = fopen("test_dotProduct_1.txt", "w");
    fprintf(output, "HOST CALCULATION: %f \n", host_dot_product);
    fprintf(output, "DEV CALCULATION: %f \n", dev_dot_product[0]);
    fprintf(output, "PRINTING DEV ARRAY VALS: ARRAY A\n");
    for(int i=0; i<VECTOR_LENGTH; ++i){
        fprintf(output, "value %i: %f\n", i, dev_array_a[i]);
    }
    free(array_a);
    free(array_b);
    cudaFree(dev_array_a);
    cudaFree(dev_array_b);
    cudaFree(dev_dot);
    return(0);
}
Here is an example output:
HOST CALCULATION: 140.000000
DEV CALCULATION: 0.000000
PRINTING DEV ARRAY VALS: ARRAY A
value 0: -0.000000
value 1: 387096841637590350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 2: -9188929998371095800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 3: 242247762331550610000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 4: -5628111589595087500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 5: 395077289052074410000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 6: 0.000000
value 7: -13925691551991564000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
Upvotes: 0
Views: 1150
Reputation: 8487
It's a good idea to check the status of CUDA runtime calls like cudaMalloc, cudaMemcpy, and kernel launches. You can do the following after every such call, or wrap the check in a macro and pass every CUDA runtime call through that macro.
if (cudaSuccess != cudaGetLastError())
    printf( "Error!\n" );
Now, I am not sure if this is your problem, but doing this can get the obvious out of the way.
Upvotes: 3
Reputation: 72349
There are two problems I can see:
Your GPU dot product contains a memory race here:
dot[0] += array_a[threadIdx.x] * array_b[threadIdx.x];
This is unsafe: every thread in the block will attempt to write/overwrite the same memory location with its result. The programming model makes no guarantees about what will happen when multiple threads try to write different values to the same piece of memory.
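One common fix is a block-level reduction: each thread writes its own partial product into shared memory, and the partials are then summed in a tree. This is only a sketch, assuming a single block whose size is a power of two, as in your <<<1, VECTOR_LENGTH>>> launch:

__global__ void GPU_parallelDotProduct(double *array_a, double *array_b, double *dot){
    extern __shared__ double partial[];   // one slot per thread, sized at launch
    partial[threadIdx.x] = array_a[threadIdx.x] * array_b[threadIdx.x];
    __syncthreads();
    // tree reduction: halve the number of active threads each step
    for(int stride = blockDim.x / 2; stride > 0; stride /= 2){
        if(threadIdx.x < stride){
            partial[threadIdx.x] += partial[threadIdx.x + stride];
        }
        __syncthreads();
    }
    // only one thread writes the final result, so there is no race
    if(threadIdx.x == 0){
        dot[0] = partial[0];
    }
}

launched with the shared memory size as the third launch parameter:

GPU_parallelDotProduct<<<1, VECTOR_LENGTH, VECTOR_LENGTH * sizeof(double)>>>(dev_array_a, dev_array_b, dev_dot);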
dev_array_a is not directly accessible by the host; it is a pointer in GPU memory. You must use a device-to-host copy to a valid host location if you want to examine the contents of dev_array_a.
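For example, a minimal sketch using the names from your code (host_array_a is a hypothetical host-side buffer):

double *host_array_a = (double*) malloc(ELEMENT_SIZE * VECTOR_LENGTH);
// note that cudaMemcpy takes the destination first and the source second
cudaMemcpy(host_array_a, dev_array_a, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyDeviceToHost);
for(int i=0; i<VECTOR_LENGTH; ++i){
    fprintf(output, "value %i: %f\n", i, host_array_a[i]);
}
free(host_array_a);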
The suggestion about error checking made in the other answer is also a very good point. Every API call returns a status, and you should check the status of all the calls you make to confirm that no errors or faults occur at runtime.
Upvotes: 4