Reputation: 304
I have run into an issue I do not understand. I am trying to set the values of an array on the device. With an int array I do it this way:
#define VECTOR_SIZE 8
// Host driver for the int version.
// Allocates two device buffers of VECTOR_SIZE ints, uploads input_h, launches
// `kernel` with one block of VECTOR_SIZE threads and copies the result back
// into output_h. Every CUDA call is checked; the first failure is reported on
// stderr and turned into a non-zero exit code.
int main()
{
    printf("Start\n");

    // Start from null so the cudaFree calls below are safe even when an
    // allocation failed (cudaFree on a null pointer does nothing).
    int *input_d = 0;
    int *output_d = 0;
    int output_h[VECTOR_SIZE];
    int input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    size_t size = VECTOR_SIZE * sizeof(int);

    // Each step runs only if every previous step succeeded.
    cudaError_t err = cudaMalloc(&input_d, size);
    if (err == cudaSuccess)
        err = cudaMalloc(&output_d, size);
    if (err == cudaSuccess)
        err = cudaMemcpy(input_d, input_h, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
    {
        kernel<<<1, VECTOR_SIZE>>>(input_d, output_d);
        // A launch does not return a status; query it explicitly.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess)
        err = cudaMemcpy(output_h, output_d, size, cudaMemcpyDeviceToHost);

    cudaFree(input_d);
    cudaFree(output_d);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
The kernel looks like:
// Stores each thread's global index into its own slot of `output`.
// Expects a 1-D launch. There is no bounds guard, so the launch must not
// create more threads than `output` has elements.
// `input` is accepted but never read.
__global__ void kernel(int* input, int* output)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    output[globalIdx] = globalIdx;
}
The output (output_h) is just what I expected: {0, 1, 2, 3, 4, 5, 6, 7}. Now, when I try to do the same with a float array:
#define VECTOR_SIZE 8
// Host driver for the float version.
// Allocates two device buffers of VECTOR_SIZE floats, uploads input_h,
// launches `kernel` with one block of VECTOR_SIZE threads and copies the
// result back into output_h. Every CUDA call is checked; the first failure is
// reported on stderr and turned into a non-zero exit code.
int main()
{
    printf("Start\n");

    // Start from null so the cudaFree calls below are safe even when an
    // allocation failed (cudaFree on a null pointer does nothing).
    float *input_d = 0;
    float *output_d = 0;
    float output_h[VECTOR_SIZE];
    float input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    size_t size = VECTOR_SIZE * sizeof(float);

    // Each step runs only if every previous step succeeded.
    cudaError_t err = cudaMalloc(&input_d, size);
    if (err == cudaSuccess)
        err = cudaMalloc(&output_d, size);
    if (err == cudaSuccess)
        err = cudaMemcpy(input_d, input_h, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
    {
        kernel<<<1, VECTOR_SIZE>>>(input_d, output_d);
        // A launch does not return a status; query it explicitly.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess)
        err = cudaMemcpy(output_h, output_d, size, cudaMemcpyDeviceToHost);

    cudaFree(input_d);
    cudaFree(output_d);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
with kernel:
// Stores each thread's global index, converted to float, into its own slot
// of `output`.
// Expects a 1-D launch. There is no bounds guard, so the launch must not
// create more threads than `output` has elements.
// `input` is accepted but never read.
__global__ void kernel(float* input, float* output)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    output[globalIdx] = static_cast<float>(globalIdx);
}
I get an array of zeros in the output_h variable.
The full code for handling float arrays:
#include "cuda_runtime.h"
#include <stdio.h>
#define VECTOR_SIZE 8
// Stores each thread's global index, converted to float, into its own slot
// of `output`.
// Expects a 1-D launch. There is no bounds guard, so the launch must not
// create more threads than `output` has elements.
// `input` is accepted but never read.
__global__ void kernel(float* input, float* output)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    output[globalIdx] = static_cast<float>(globalIdx);
}
// Host driver of the full float listing: uploads input_h, launches `kernel`
// with one block of VECTOR_SIZE threads, copies the result into output_h and
// prints it.
// NOTE(review): none of the CUDA return codes below are checked.
int main()
{
printf("Start\n");
float *input_d;
float *output_d;
float output_h[VECTOR_SIZE];
float input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
// Size in bytes of each VECTOR_SIZE-element float buffer.
int size = VECTOR_SIZE*sizeof(float);
cudaMalloc(&input_d,size);
cudaMalloc(&output_d,size);
cudaMemcpy(input_d,input_h,size,cudaMemcpyHostToDevice);
kernel<<<1,VECTOR_SIZE>>>(input_d,output_d);
// Copy the kernel's results back into the host array.
cudaMemcpy(output_h,output_d,size, cudaMemcpyDeviceToHost);
cudaFree(input_d);
cudaFree(output_d);
int i;
// 1-based counter; the element actually accessed is output_h[i-1].
for (i=1; i<=VECTOR_SIZE; i++)
{
// NOTE(review): output_h holds float values but the conversion is %d, which
// expects an int. This format/argument mismatch is the defect the answer
// below identifies; it is left as posted so the discussion stays meaningful.
printf("%d, ", output_h[i-1]);
}
// Reads one character from stdin before returning.
getchar();
return 0;
}
Upvotes: 1
Views: 2150
Reputation: 72349
Both the integer and floating point versions of CUDA code you have posted work perfectly. The only mistake is how you are printing out the values returned by the kernel in the case of the floating point code:
// Quoted from the question: prints the VECTOR_SIZE elements of output_h.
int i;
for (i=1; i<=VECTOR_SIZE; i++)
{
// Format/argument mismatch: %d is used here; the replacement below uses %f.
printf("%d, ", output_h[i-1]);
}
should be changed to
// Corrected loop: zero-based index and a %f conversion for each element of output_h.
int i;
for (i=0; i<VECTOR_SIZE; i++)
{
printf("%f, ", output_h[i]);
}
(note that the `%f` format is required for printing floating point numbers).
Given that CUDA uses a C++ compiler for host code by default, you should probably prefer `iostream` to `printf` - it will work irrespective of the type of the output and will not cause the error you are seeing. If I were to write a "universal" version of your example, it would look like this:
#include <iostream>
// Generic fill kernel: every thread stores its global index, converted to T
// by assignment, into its own slot of `output`.
// Expects a 1-D launch. There is no bounds guard, so the launch must not
// create more threads than `output` has elements.
template <typename T>
__global__ void kernel(T* output)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    output[globalIdx] = globalIdx;
}
// Runs kernel<T> on a device buffer of VECTOR_SIZE elements, using one block
// of VECTOR_SIZE threads, and prints the elements one per line on std::cout.
// If any CUDA call fails, the error string is written to std::cerr instead of
// printing the untouched placeholder contents of the host array.
template<typename T, int VECTOR_SIZE>
void do_run(void)
{
    // Null-initialised so the final cudaFree is safe even if cudaMalloc
    // failed (cudaFree on a null pointer does nothing).
    T *output_d = 0;
    // Placeholder contents: first element 999, the rest value-initialised.
    T output_h[VECTOR_SIZE] = { 999 };
    size_t size = sizeof(output_h);

    cudaError_t err = cudaMalloc(&output_d, size);
    if (err == cudaSuccess)
    {
        kernel<T><<<1, VECTOR_SIZE>>>(output_d);
        // A launch does not return a status; query it explicitly.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess)
        err = cudaMemcpy(output_h, output_d, size, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess)
    {
        for (int i = 0; i < VECTOR_SIZE; i++)
            std::cout << output_h[i] << std::endl;
    }
    else
    {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }

    cudaFree(output_d);
}
// Entry point: runs the same 8-element demo once with int elements and once
// with float elements, printing a heading before each run.
int main()
{
    const int kElementCount = 8;

    std::cout << "Integer version" << std::endl;
    do_run<int, kElementCount>();

    std::cout << "floating point version" << std::endl;
    do_run<float, kElementCount>();

    return 0;
}
Note that the output code can be used unchanged for both the `int` and `float` versions, eliminating the possibility of the mistake you made here.
Upvotes: 4