Reputation: 639
I was doing a cuda tutorial in which I have to make the dot product of two vectors. After implementing the solution provided in the tutorial I came across some issues that were solved in this stack overflow post. Now I am receiving the answer 0 regardless what I do. Bellow you can find the code!
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_atomic_functions.h"
#include <stdio.h>
#include <stdlib.h>
#define N (2048 * 8)
#define THREADS_PER_BLOCK 512
__global__ void dot(int *a, int *b, int *c)
{
__shared__ int temp[THREADS_PER_BLOCK];
int index = threadIdx.x + blockIdx.x * blockDim.x;
temp[threadIdx.x] = a[index] * b[index];
__syncthreads();
if (threadIdx.x == 0)
{
int sum = 0;
for (int i = 0; i < N; i++)
{
sum += temp[i];
}
atomicAdd(c, sum);
}
}
int main()
{
int *a, *b, *c;
int *dev_a, *dev_b, *dev_c;
int size = N * sizeof(int);
//allocate space for the variables on the device
cudaMalloc((void **)&dev_a, size);
cudaMalloc((void **)&dev_b, size);
cudaMalloc((void **)&dev_c, sizeof(int));
//allocate space for the variables on the host
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(sizeof(int));
//this is our ground truth
int sumTest = 0;
//generate numbers
for (int i = 0; i < N; i++)
{
a[i] = rand() % 10;
b[i] = rand() % 10;
sumTest += a[i] * b[i];
printf(" %d %d \n",a[i],b[i]);
}
*c = 0;
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);
dot<<< N / THREADS_PER_BLOCK, THREADS_PER_BLOCK >> >(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d ", *c);
printf("%d ", sumTest);
free(a);
free(b);
free(c);
cudaFree(a);
cudaFree(b);
cudaFree(c);
system("pause");
return 0;
}
Upvotes: 3
Views: 13381
Reputation: 16796
First of all, please add CUDA error checking in the code as described in this legendary post.
Just before the kernel execution call, you are copying extra memory into dev_c
in the following line:
cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);
It should be:
cudaMemcpy(dev_c, c, sizeof(int), cudaMemcpyHostToDevice);
Another error in the code is that inside the kernel, __shared__
memory variable temp
is being accessed out of bounds in the for loop. Number of elements of the shared memory is equal to THREADS_PER_BLOCK
while the loop is being iterated upto N
. Just replace N
with THREADS_PER_BLOCK
in the loop.
Upvotes: 3