Reputation: 197
I want an instance of a Container class to allocate some device and host memory on initialization. I then want to use the allocated memory in device code without passing the actual pointer (for API reasons).
How do I create a global __device__ pointer to the member that points to the device memory? I am happy to use thrust if that helps.
Here is a small example:
#include <iostream>
struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));
    int *d_int;
    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_int, 4*sizeof(int));
        memcpyHostToDevice();
    }
    void memcpyHostToDevice() {
        cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
    }
    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};
Container stuff;
__device__ auto d_int = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}
Upvotes: 0
Views: 74
Reputation: 72339
There are two problems here:
1. You can't statically initialise a __device__ variable in the way you are trying to (and the value you are trying to apply isn't correct either). The CUDA runtime API contains a function for initialising global scope device symbols, cudaMemcpyToSymbol. Use that instead.
2. Your global scope declaration of stuff shouldn't work either, for a number of subtle reasons discussed here (it is technically undefined behaviour). Declare it at main scope instead.
Putting these two things together should lead you to do something like this instead:
__device__ int* d_int;
// ...
int main(int argc, char const *argv[]) {
    Container stuff;
    cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int*));
    edit<<<4, 1>>>();
    // ...
Here is a fully worked example:
$ cat t1199.cu
#include <iostream>
struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));
    int *d_int;
    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_int, 4*sizeof(int));
        memcpyHostToDevice();
    }
    void memcpyHostToDevice() {
        cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
    }
    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};
//Container stuff;
__device__ int *d_int; // = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
    Container stuff;
    cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int *));
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}
$ nvcc -std=c++11 -o t1199 t1199.cu
$ cuda-memcheck ./t1199
========= CUDA-MEMCHECK
1337
========= ERROR SUMMARY: 0 errors
$
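If you also want to keep the cudaMemcpyToSymbol call out of the calling code (the "API reasons" mentioned in the question), the symbol copy can be moved into the Container constructor itself. The following is only a sketch built on the example above, not part of the verified output; the member is renamed to d_data (a made-up name) so that it does not shadow the __device__ symbol d_int inside the constructor:
#include <iostream>

__device__ int *d_int; // device-side global pointer used by the kernel

struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));
    int *d_data; // renamed so the symbol d_int stays visible below

    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_data, 4*sizeof(int));
        cudaMemcpy(d_data, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
        // Publish the device pointer through the global symbol so kernels
        // can use it without receiving it as an argument.
        cudaMemcpyToSymbol(d_int, &d_data, sizeof(int *));
    }

    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_data, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};

__global__ void edit() {
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}

int main() {
    Container stuff;   // constructor allocates, copies, and sets the symbol
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}
As in the answer above, stuff must still be declared at main scope: the constructor now makes CUDA runtime calls, which is exactly the kind of thing that is problematic during static initialisation of a global object.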
Upvotes: 3