Reputation: 1
I tried to make a array addition using a code below but ended up not adding the array and not error, this is a gpu (CUDA) based parallel program.
#include <cuda_runtime.h>
#include <cuda.h>
#include <iostream>
#include <stdlib.h>
using namespace std;
__global__ void AddInts(int *a, int *b, int count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < count)
{
a[id] += b[id];
}
}
int main()
{
srand(time(NULL));
int count = 100;
int *h_a = new int[count];
int *h_b = new int[count];
for (int i = 0; i < count; i++)
{
h_a[i] = rand() % 1000;
h_b[i] = rand() % 1000;
}
cout << "Prior to addition:" << endl;
for (int i = 0; i < 5; i++)
cout << h_a[i] << " " << h_b[i] << endl;
int *d_a, *d_b;
if (cudaMalloc(&d_a, sizeof(int) * count) != cudaSuccess)
{
cout << "Nope! No";
return 0;
}
if (cudaMalloc(&d_b, sizeof(int) * count) != cudaSuccess)
{
cout << "Nope!";
cudaFree(d_a);
return 0;
}
if (cudaMemcpy(d_a, h_a, sizeof(int) * count, cudaMemcpyHostToDevice) != cudaSuccess)
{
cout << "Could not copy!" << endl;
cudaFree(d_a);
cudaFree(d_b);
return 0;
}
if (cudaMemcpy(d_b, h_b, sizeof(int) * count, cudaMemcpyHostToDevice) != cudaSuccess)
{
cout << "Could not copy!" << endl;
cudaFree(d_a);
cudaFree(d_b);
return 0;
}
AddInts <<<count / 256 + 1, 256 >>> (h_a, h_b, count);
if (cudaMemcpy(h_a, h_b, sizeof(int) * count, cudaMemcpyDeviceToHost) == cudaSuccess)
{
delete[] h_a;
delete[] h_b;
cudaFree(d_a);
cudaFree(d_b);
cout << "Nope!" << endl;
return 0;
}
for (int i = 0; i < 5; i++)
cout << "It's " << h_a[i] << endl;
cudaFree(d_a);
cudaFree(d_b);
delete[] h_a;
delete[] h_b;
return 0;
}
My results are:
Prior to addition: 188 336 489 593 706 673 330 792 329 588 It's 188
It's 489 It's 706 It's 330 It's 329
D:\Learn\CUDA\Visual_stidio\matrxAdd\x64\Release\matrxAdd.exe (process
8468) exited with code 0. To automatically close the console when
debugging stops, enable Tools->Options->Debugging->Automatically close
the console when debugging stops. Press any key to close this window .
. .
Upvotes: 0
Views: 53
Reputation: 75062
Firstly, calculation on the device should use the device memory. Therefore, the line:
AddInts <<<count / 256 + 1, 256 >>> (h_a, h_b, count);
should be:
AddInts <<<count / 256 + 1, 256 >>> (d_a, d_b, count);
Then, you should copy the result from the device memory to the host memory. Therefore, the line:
if (cudaMemcpy(h_a, h_b, sizeof(int) * count, cudaMemcpyDeviceToHost) == cudaSuccess)
should be:
if (cudaMemcpy(h_a, d_a, sizeof(int) * count, cudaMemcpyDeviceToHost) == cudaSuccess)
Upvotes: 4