My Cuda script array output is wrong

Question

This weekend I am trying to learn Cuda.

What I want to do is c=a+b. Each of variable (a, b, and c) is an array with 5 elements.

I have problem with the result. This is my desired result:

{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000}
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000}
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 36.000000, 48.000000, 62.000000}

But this is what I got:

PS E:	esting\cuda2\Debug> .\cuda2.exe
{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000}
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000}
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 0.000000, 0.000000, 0.000000}

As you can see, the result (c3, c4, c5) is wrong.
Please tell me how to make the code below do the right thing.

I am using VS2015 and Cuda toolkit 8. There are 3 files I created in my project solution: main.cpp, simple_math.cu, simple_math.cuh;

main.cpp

#include "simple_math.cuh"
#include  // fprintf


int main()
{
    const int arraySize = 5;
    float a[arraySize] = { 11, 21, 31, 41, 51 };
    float b[arraySize] = { 1, 3, 5, 7, 11 };
    double c[arraySize] = { 0, 0, 0, 0, 0 };

    cudaError_t cudaStatus = mathWithCuda(c, a, b, arraySize, ADD);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "mathWithCuda failed!");
        return 1;
    }


    fprintf(stdout, "{a1, a2, a3, a4, a5} = {%f, %f, %f, %f, %f} 
{b1, b2, b3, b4, b5} = {%f, %f, %f, %f, %f} 
{c1, c2, c3, c4, c5} = {%f, %f, %f, %f, %f}",
        a[0], a[1], a[2], a[3], a[4], b[0], b[1], b[2], b[3], b[4], c[0], c[1], c[2], c[3], c[4]);


    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

simple_math.cuh

#ifndef SIMPLE_MATH_CUH_
#define SIMPLE_MATH_CUH_


#include  // cudaError_t

#define ADD 0
#define SUB 1
#define MUL 2
#define DIV 3


cudaError_t mathWithCuda(double *c, const float *a, const float *b, unsigned int size, int mode);

__global__ void addKernel(double *c, const float *a, const float *b);
__global__ void subKernel(double *c, const float *a, const float *b);
__global__ void mulKernel(double *c, const float *a, const float *b);
__global__ void divKernel(double *c, const float *a, const float *b);


#endif

simple_math.cu

#include  // threadIdx
#include  // fprintf
#include  // ceil
#include "simple_math.cuh"


cudaError_t mathWithCuda(double *c, const float *a, const float *b, unsigned int arraySize, int mode)
{
    float *dev_a, *dev_b;
    double *dev_c;
    cudaError_t cudaStatus;


    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }


    if ((cudaStatus = cudaMalloc((void**)&dev_c, arraySize * sizeof(double))) != cudaSuccess ||
        (cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(float))) != cudaSuccess ||
        (cudaStatus = cudaMalloc((void**)&dev_b, arraySize * sizeof(float))) != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }


    if ((cudaStatus = cudaMemcpy(dev_a, a, arraySize * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess ||
        (cudaStatus = cudaMemcpy(dev_b, b, arraySize * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }


    int blocksPerGrid, threadsPerBlock;
    if (arraySize < 512) {
        blocksPerGrid = 1;
        threadsPerBlock = arraySize;
    }
    else {
        blocksPerGrid = ceil(double(arraySize) / double(threadsPerBlock));
        threadsPerBlock = 512;
    }


    switch (mode)
    {
    case 0:
        addKernel <<>>(dev_c, dev_a, dev_b);
        break;
    case 1:
        subKernel <<>>(dev_c, dev_a, dev_b);
        break;
    case 2:
        mulKernel <<>>(dev_c, dev_a, dev_b);
        break;
    case 3:
        divKernel <<>>(dev_c, dev_a, dev_b);
        break;
    default:
        // nothing
        break;
    }


    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s
", cudaGetErrorString(cudaStatus));
        goto Error;
    }


    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!
", cudaStatus);
        goto Error;
    }


    cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }


Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);


    return cudaStatus;
}


__global__ void addKernel(double *c, const float *a, const float *b)
{
    int i = threadIdx.x;
    c[i] = __fadd_rn(a[i], b[i]); // a + b
}    

__global__ void subKernel(double *c, const float *a, const float *b)
{
    int i = threadIdx.x;
    c[i] = __fsub_rn(a[i], b[i]); // a - b
}

__global__ void mulKernel(double *c, const float *a, const float *b)
{
    int i = threadIdx.x;
    c[i] = __fmul_rn(a[i], b[i]); // a * b
}

__global__ void divKernel(double *c, const float *a, const float *b)
{
    int i = threadIdx.x;
    c[i] = __fdividef(a[i], b[i]); // a/b
}

Alex · Accepted Answer

The issue seems to be here:

cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);

I thing that you supposed to copy arraySize * sizeof(double) bites.

My Cuda script array output is wrong

Answers (1)

Related Questions