Reputation: 31
This weekend I am trying to learn Cuda.
What I want to do is c=a+b. Each of variable (a, b, and c) is an array with 5 elements.
I have problem with the result. This is my desired result:
{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000}
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000}
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 36.000000, 48.000000, 62.000000}
But this is what I got:
PS E:\testing\cuda2\Debug> .\cuda2.exe
{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000}
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000}
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 0.000000, 0.000000, 0.000000}
As you can see, the result (c3, c4, c5) is wrong.
Please tell me how to make the code below do the right thing.
I am using VS2015 and Cuda toolkit 8. There are 3 files I created in my project solution: main.cpp, simple_math.cu, simple_math.cuh;
main.cpp
#include "simple_math.cuh"
#include <iostream> // fprintf
int main()
{
const int arraySize = 5;
float a[arraySize] = { 11, 21, 31, 41, 51 };
float b[arraySize] = { 1, 3, 5, 7, 11 };
double c[arraySize] = { 0, 0, 0, 0, 0 };
cudaError_t cudaStatus = mathWithCuda(c, a, b, arraySize, ADD);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "mathWithCuda failed!");
return 1;
}
fprintf(stdout, "{a1, a2, a3, a4, a5} = {%f, %f, %f, %f, %f} \n{b1, b2, b3, b4, b5} = {%f, %f, %f, %f, %f} \n{c1, c2, c3, c4, c5} = {%f, %f, %f, %f, %f}",
a[0], a[1], a[2], a[3], a[4], b[0], b[1], b[2], b[3], b[4], c[0], c[1], c[2], c[3], c[4]);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
simple_math.cuh
#ifndef SIMPLE_MATH_CUH_
#define SIMPLE_MATH_CUH_
#include <cuda_runtime.h> // cudaError_t
#define ADD 0
#define SUB 1
#define MUL 2
#define DIV 3
cudaError_t mathWithCuda(double *c, const float *a, const float *b, unsigned int size, int mode);
__global__ void addKernel(double *c, const float *a, const float *b);
__global__ void subKernel(double *c, const float *a, const float *b);
__global__ void mulKernel(double *c, const float *a, const float *b);
__global__ void divKernel(double *c, const float *a, const float *b);
#endif
simple_math.cu
#include <device_launch_parameters.h> // threadIdx
#include <stdio.h> // fprintf
#include <math.h> // ceil
#include "simple_math.cuh"
cudaError_t mathWithCuda(double *c, const float *a, const float *b, unsigned int arraySize, int mode)
{
float *dev_a, *dev_b;
double *dev_c;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
if ((cudaStatus = cudaMalloc((void**)&dev_c, arraySize * sizeof(double))) != cudaSuccess ||
(cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(float))) != cudaSuccess ||
(cudaStatus = cudaMalloc((void**)&dev_b, arraySize * sizeof(float))) != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
if ((cudaStatus = cudaMemcpy(dev_a, a, arraySize * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess ||
(cudaStatus = cudaMemcpy(dev_b, b, arraySize * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
int blocksPerGrid, threadsPerBlock;
if (arraySize < 512) {
blocksPerGrid = 1;
threadsPerBlock = arraySize;
}
else {
blocksPerGrid = ceil(double(arraySize) / double(threadsPerBlock));
threadsPerBlock = 512;
}
switch (mode)
{
case 0:
addKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b);
break;
case 1:
subKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b);
break;
case 2:
mulKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b);
break;
case 3:
divKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b);
break;
default:
// nothing
break;
}
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
__global__ void addKernel(double *c, const float *a, const float *b)
{
int i = threadIdx.x;
c[i] = __fadd_rn(a[i], b[i]); // a + b
}
__global__ void subKernel(double *c, const float *a, const float *b)
{
int i = threadIdx.x;
c[i] = __fsub_rn(a[i], b[i]); // a - b
}
__global__ void mulKernel(double *c, const float *a, const float *b)
{
int i = threadIdx.x;
c[i] = __fmul_rn(a[i], b[i]); // a * b
}
__global__ void divKernel(double *c, const float *a, const float *b)
{
int i = threadIdx.x;
c[i] = __fdividef(a[i], b[i]); // a/b
}
Upvotes: 0
Views: 44
Reputation: 10126
The issue seems to be here:
cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
I thing that you supposed to copy arraySize * sizeof(double)
bites.
Upvotes: 2