CUDA - unexpected result with float array

Question

I have run into an issue I do not understand. I am trying to set the values of an array on the device. With an int array I am doing it this way:

#define VECTOR_SIZE 8
// Host driver for the int version: copies input_h to the device, launches
// `kernel` with a single block of VECTOR_SIZE threads, and copies the result
// back into output_h. Return codes of the CUDA calls are not checked here.
int main()
{
    printf("Start\n");
    int *input_d;
    int *output_d;
    int output_h[VECTOR_SIZE];
    int input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    int size = VECTOR_SIZE*sizeof(int);   // buffer size in bytes
    cudaMalloc(&input_d,size);
    cudaMalloc(&output_d,size);
    cudaMemcpy(input_d,input_h,size,cudaMemcpyHostToDevice);
    // One block, one thread per array element.
    kernel<<<1,VECTOR_SIZE>>>(input_d,output_d);
    cudaMemcpy(output_h,output_d,size, cudaMemcpyDeviceToHost);
    cudaFree(input_d);
    cudaFree(output_d);
    return 0;
}

The kernel looks like:

// Each thread computes its global index from threadIdx.x, blockDim.x and
// blockIdx.x and stores that index into output at the same position.
// `input` is accepted but never read. There is no bounds check, so the launch
// must not create more threads than output has elements.
__global__ void kernel(int* input, int* output)
{
    int dx = threadIdx.x + (blockDim.x * blockIdx.x);
    output[dx] = dx;
}

The output (output_h) is just as I expected: {0, 1, 2, 3, 4, 5, 6, 7}. Now, when I try to do the same with a float array:

#define VECTOR_SIZE 8
// Host driver for the float version: copies input_h to the device, launches
// `kernel` with a single block of VECTOR_SIZE threads, and copies the result
// back into output_h. Return codes of the CUDA calls are not checked here.
int main()
{
    printf("Start\n");
    float *input_d;
    float *output_d;
    float output_h[VECTOR_SIZE];
    float input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    int size = VECTOR_SIZE*sizeof(float);   // buffer size in bytes
    cudaMalloc(&input_d,size);
    cudaMalloc(&output_d,size);
    cudaMemcpy(input_d,input_h,size,cudaMemcpyHostToDevice);
    // One block, one thread per array element.
    kernel<<<1,VECTOR_SIZE>>>(input_d,output_d);
    cudaMemcpy(output_h,output_d,size, cudaMemcpyDeviceToHost);
    cudaFree(input_d);
    cudaFree(output_d);
    return 0;
}

with kernel:

// Each thread computes its global index from threadIdx.x, blockDim.x and
// blockIdx.x and stores it into output at the same position. dx is an int and
// output holds floats, so the assignment converts the index to float.
// `input` is accepted but never read. There is no bounds check, so the launch
// must not create more threads than output has elements.
__global__ void kernel(float* input, float* output)
{
    int dx = threadIdx.x + (blockDim.x * blockIdx.x);
    output[dx] = dx;
}

I receive an array of zeros from the device in the output_h variable.

The full code for handling float arrays:

#include "cuda_runtime.h"
#include <stdio.h>

#define VECTOR_SIZE 8

// Each thread computes its global index from threadIdx.x, blockDim.x and
// blockIdx.x and stores it into output at the same position. dx is an int and
// output holds floats, so the assignment converts the index to float.
// `input` is accepted but never read. There is no bounds check, so the launch
// must not create more threads than output has elements.
__global__ void kernel(float* input, float* output)//, int halfSize)
{
    int dx = threadIdx.x + (blockDim.x * blockIdx.x);
    output[dx] = dx;
} 

// Host driver for the float version: copies input_h to the device, launches
// `kernel` with a single block of VECTOR_SIZE threads, copies the result back
// into output_h, prints it, and waits for a key press before exiting.
// Return codes of the CUDA calls are not checked here.
int main()
{
    printf("Start\n");
    float *input_d;
    float *output_d;
    float output_h[VECTOR_SIZE];
    float input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    int size = VECTOR_SIZE*sizeof(float);   // buffer size in bytes
    cudaMalloc(&input_d,size);
    cudaMalloc(&output_d,size);
    cudaMemcpy(input_d,input_h,size,cudaMemcpyHostToDevice);
    // One block, one thread per array element.
    kernel<<<1,VECTOR_SIZE>>>(input_d,output_d);
    cudaMemcpy(output_h,output_d,size, cudaMemcpyDeviceToHost);
    cudaFree(input_d);
    cudaFree(output_d);
    int i;
    for (i=1; i<=VECTOR_SIZE; i++)
    {
        // NOTE(review): output_h holds floats but is printed with %d. This is
        // left exactly as posted because the mismatch is the subject of the
        // question.
        printf("%d, ", output_h[i-1]);
    }
    getchar();   // keep the console window open until a key is pressed
    return 0;
}

talonmies · Accepted Answer

Both the integer and floating point versions of the CUDA code you have posted work perfectly. The only mistake is in how you are printing the values returned by the kernel in the floating point code:

int i;
for (i=1; i<=VECTOR_SIZE; i++)
{
    // %d is the conversion for int; in the floating point program output_h
    // holds floats, so this format does not match the argument.
    printf("%d, ", output_h[i-1]);
}

should be changed to

int i;
for (i=0; i<VECTOR_SIZE; i++)
{
    // %f matches the floating point elements of output_h.
    printf("%f, ", output_h[i]);
}

(note that the %f format is required for printing floating point numbers).

Given that CUDA uses a C++ compiler for host code by default, you should probably prefer iostream to printf - it will work irrespective of the type of the output and not cause the error you are seeing. If I were to write a "universal" version of your example it would look like this:

#include <iostream>

// Same element count as in the question's code.
#define VECTOR_SIZE 8

// Each thread stores its global index into output at that position; the int
// index is converted to T by the assignment. There is no bounds check, so the
// launch must not create more threads than output has elements.
template<typename T>
__global__ void kernel(T* output)
{
    int dx = threadIdx.x + (blockDim.x * blockIdx.x);
    output[dx] = dx;
}

// Allocates a device array of VECTOR_SIZE elements of type T, fills it with
// the kernel above, copies it back to the host and prints every element.
// Return codes of the CUDA calls are not checked here.
template<typename T>
void do_run(void)
{
    T *output_d;
    // Sentinel in the first element; every element is overwritten by the
    // copy back from the device below.
    T output_h[VECTOR_SIZE] = { 999 };
    size_t size = sizeof(output_h);
    cudaMalloc(&output_d,size);
    kernel<T><<<1,VECTOR_SIZE>>>(output_d);
    cudaMemcpy(output_h, output_d, size, cudaMemcpyDeviceToHost);
    // operator<< selects the right formatting from the element type, so the
    // same loop prints both the int and the float results correctly.
    for(int i=0; i<VECTOR_SIZE; i++) {
        std::cout << i << " " << output_h[i] << std::endl;
    }
    cudaFree(output_d);
}

int main()
{
    std::cout << "Integer version" << std::endl;
    do_run<int>();

    std::cout << "floating point version" << std::endl;
    do_run<float>();

    return 0;
}


Note that the output code can be used unchanged for both the int and float versions, eliminating the possibility of the mistake you made here.

CUDA - unexpected result with float array

Answers (1)

Related Questions