Reputation: 197
I want an instance of a Container class to allocate some device and host memory on initialization. I then want to use the allocated memory in device code without passing the actual pointer (for API reasons).
How do I create a global __device__ pointer to the member that points to the device memory? I am happy to use thrust if that helps.
Here is a small example:
#include <iostream>
struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));
    int *d_int;
    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_int, 4*sizeof(int));
        memcpyHostToDevice();
    }
    void memcpyHostToDevice() {
        cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
    }
    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};
Container stuff;
__device__ auto d_int = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}
Upvotes: 0
Views: 74
Reputation: 72339
There are two problems here:
1. You can't statically initialise a __device__ variable in the way you are trying to (and the value you are trying to apply isn't correct either). The CUDA runtime API contains a function for initialising global scope device symbols, cudaMemcpyToSymbol. Use that instead.
2. Your global scope declaration of stuff shouldn't work either, for a number of subtle reasons discussed here (it is technically undefined behaviour). Declare it at main scope instead.
Putting these two things together should lead you to do something like this instead:
__device__ int* d_int;
// ...
int main(int argc, char const *argv[]) {
    Container stuff;
    cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int*));
    edit<<<4, 1>>>();
    // ...
Here is a fully worked example:
$ cat t1199.cu
#include <iostream>
struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));
    int *d_int;
    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_int, 4*sizeof(int));
        memcpyHostToDevice();
    }
    void memcpyHostToDevice() {
        cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
    }
    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};
//Container stuff;
__device__ int *d_int; // = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
    Container stuff;
    cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int *));
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}
$ nvcc -std=c++11 -o t1199 t1199.cu
$ cuda-memcheck ./t1199
========= CUDA-MEMCHECK
1337
========= ERROR SUMMARY: 0 errors
$
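If you also want to keep the cudaMemcpyToSymbol call out of the calling code (the "API reasons" mentioned in the question), the symbol copy can be moved into the Container constructor itself. The following is only a sketch built on the example above, not part of the verified output; the member is renamed to d_data (a made-up name) so that it does not shadow the __device__ symbol d_int inside the constructor:
#include <iostream>

__device__ int *d_int; // device-side global pointer used by the kernel

struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));
    int *d_data; // renamed so the symbol d_int stays visible below

    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_data, 4*sizeof(int));
        cudaMemcpy(d_data, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
        // Publish the device pointer through the global symbol so kernels
        // can use it without receiving it as an argument.
        cudaMemcpyToSymbol(d_int, &d_data, sizeof(int *));
    }

    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_data, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};

__global__ void edit() {
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}

int main() {
    Container stuff;   // constructor allocates, copies, and sets the symbol
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}
As in the answer above, stuff must still be declared at main scope: the constructor now makes CUDA runtime calls, which is exactly the kind of thing that is problematic during static initialisation of a global object.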
Upvotes: 3