LSPII
LSPII

Reputation: 21

CUDA C++ read image from host and copy to device

I need to read an image, store it in an unsigned char array, and use that array to construct a class. The class constructor is a device function, so I need to read the image on the host and copy it to the device. The code is similar to the following.

    // Debug kernel: only thread 0 of block 0 prints; it emits the first three
    // bytes reached through the pointer-to-pointer `tdt`, tab-separated.
    // `tdt` itself is dereferenced in device code, so the address passed in
    // must refer to memory the device can read, as must the buffer it points to.
    __global__ void print_copy_result(unsigned char **tdt)
{
    if (threadIdx.x == 0 && blockIdx.x == 0) 
    {
        // (*tdt) loads the inner pointer first, then indexes the byte buffer.
        printf("%c\t%c\t%c\t", (*tdt)[0], (*tdt)[1], (*tdt)[2]);
    }
}

// Host entry point: loads an image file, copies its bytes into a managed
// allocation and launches print_copy_result on a single thread.
int main()
{
    // Filled in by stbi_load; presumably width, height and channels per pixel
    // (stb_image convention) -- their product is used as the byte count below.
    int inx, iny, inn;
    // NOTE(review): the returned pointer is indexed on the next line without a
    // NULL check, and it is never released in this function.
    unsigned char* texture_data = stbi_load("60847663_p0.jpg", &inx, &iny, &inn, 0);
    printf("%hhc, %hhc, %hhc\n", texture_data[0], texture_data[1], texture_data[2]);

    unsigned char* d_texture_data;

    // Managed buffer of inx * iny * inn bytes, then a copy of the host bytes into it.
    checkCudaErrors(cudaMallocManaged(&d_texture_data, inx * iny * inn * sizeof(unsigned char)));
    checkCudaErrors(cudaMemcpy(d_texture_data, texture_data, inx * iny * inn * sizeof(unsigned char), cudaMemcpyDefault));
    // NOTE(review): &d_texture_data is the address of this host local variable,
    // not of the managed buffer; the kernel dereferences that address on the device.
    print_copy_result<<<1, 1>>>(&d_texture_data);
    // Launch-configuration errors are reported by cudaGetLastError(); faults that
    // occur while the kernel runs surface at the synchronizing call after it.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    return 0;
}

However, I get CUDA error = 700 at checkCudaErrors(cudaDeviceSynchronize());. Which step did I get wrong?

Upvotes: 1

Views: 1136

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 151869

As indicated in the comments, &d_texture_data is a pointer to host memory (not managed memory, but host memory). Such a pointer to host memory is essentially unusable by CUDA device code (CUDA kernel code cannot dereference such host memory pointers, except in some cases on Power9 platforms).

You don't need that level of indirection anyway. The most direct approach would be to use a methodology similar to what is shown here and just pass the "ordinary" managed pointer to your kernel. Since we're getting rid of the double-pointer approach, there are changes needed to the kernel also:

$ cat t54.cu
#include <cstdio>
#include <helper_cuda.h>

    // Debug kernel: thread 0 of block 0 (x-dimension) prints the first three
    // bytes of `tdt` as characters, each followed by a tab.
    // `tdt` must be dereferenceable from device code and hold at least 3 bytes.
    __global__ void print_copy_result(unsigned char *tdt)
{
    // Guard clause: every other thread has nothing to do.
    if (threadIdx.x != 0 || blockIdx.x != 0)
        return;

    const unsigned char first = tdt[0];
    const unsigned char second = tdt[1];
    const unsigned char third = tdt[2];
    printf("%c\t%c\t%c\t", first, second, third);
}

// Host driver: stages a small byte array in managed memory, launches
// print_copy_result on it with one thread, then releases the allocation.
// Every CUDA runtime call is wrapped in checkCudaErrors (from helper_cuda.h).
// Returns 0 when the end of the function is reached.
int main()
{
    // Stand-in for decoded image data so the example needs no image file.
    const unsigned char texture_data[] = {'a', 'b', 'c'};
    // Plain %c: the hh length modifier is not defined for the c conversion.
    printf("%c, %c, %c\n", texture_data[0], texture_data[1], texture_data[2]);

    // Extents whose product gives the number of bytes to stage.
    const int inx = sizeof(texture_data) / sizeof(texture_data[0]);
    const int iny = 1;
    const int inn = 1;
    // Widen to size_t before multiplying so large extents cannot overflow int.
    const size_t texture_bytes = static_cast<size_t>(inx) * iny * inn * sizeof(unsigned char);

    unsigned char* d_texture_data = NULL;
    checkCudaErrors(cudaMallocManaged(&d_texture_data, texture_bytes));
    checkCudaErrors(cudaMemcpy(d_texture_data, texture_data, texture_bytes, cudaMemcpyDefault));

    // Pass the managed pointer itself (by value), not the address of the host
    // variable that holds it.
    print_copy_result<<<1, 1>>>(d_texture_data);
    checkCudaErrors(cudaGetLastError());       // launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());  // errors raised while the kernel ran
    printf("\n");

    // Release the managed allocation; the original never freed it.
    checkCudaErrors(cudaFree(d_texture_data));
    return 0;
}
$ nvcc -o t54 t54.cu -arch=sm_35 -I/usr/local/cuda/samples/common/inc  -Wno-deprecated-gpu-targets
$ cuda-memcheck ./t54
========= CUDA-MEMCHECK
a, b, c
a       b       c
========= ERROR SUMMARY: 0 errors
$

Upvotes: 1

Related Questions