Reputation: 13624
Suppose we have;
struct collapsed {
char **seq;
int num;
};
...
__device__ *collapsed xdev;
...
collapsed *x_dev
cudaGetSymbolAddress((void **)&x_dev, xdev);
cudaMemcpyToSymbol(x_dev, x, sizeof(collapsed)*size); //x already defined collapsed * , this line gives ERROR
Whay do you think I am getting error at the last line : invalid device symbol
??
Upvotes: 0
Views: 1492
Reputation: 72382
The first problem here is that x_dev
isn't a device symbol. It might contain an address in a device memory, but that address cannot be passed to cudaMemcpyToSymbol
. The call should just be:
cudaMemcpyToSymbol(xdev, ......);
Which brings up the second problem. Doing this:
cudaMemcpyToSymbol(xdev, x, sizeof(collapsed)*size);
would be illegal. xdev
is a pointer, so the only valid value you can copy to xdev
is a device address. If x
is the address of a struct collapsed
in device memory, then the only valid version of this memory transfer operation is
cudaMemcpyToSymbol(xdev, &x, sizeof(collapsed *));
ie. x
must have previously have been set to the address of memory allocated in the device, something like
collapsed *x;
cudaMalloc((void **)&x, sizeof(collapsed)*size);
cudaMemcpy(x, host_src, sizeof(collapsed)*size, cudaMemcpyHostToDevice);
As promised, here is a complete working example. First the code:
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>
struct collapsed {
char **seq;
int num;
};
__device__ collapsed xdev;
__global__
void kernel(const size_t item_sz)
{
if (threadIdx.x < xdev.num) {
char *p = xdev.seq[threadIdx.x];
char val = 0x30 + threadIdx.x;
for(size_t i=0; i<item_sz; i++) {
p[i] = val;
}
}
}
#define gpuQ(ans) { gpu_assert((ans), __FILE__, __LINE__); }
void gpu_assert(cudaError_t code, const char *file, const int line)
{
if (code != cudaSuccess)
{
std::cerr << "gpu_assert: " << cudaGetErrorString(code) << " "
<< file << " " << line << std::endl;
exit(code);
}
}
int main(void)
{
const int nitems = 32;
const size_t item_sz = 16;
const size_t buf_sz = size_t(nitems) * item_sz;
// Gpu memory for sequences
char *_buf;
gpuQ( cudaMalloc((void **)&_buf, buf_sz) );
gpuQ( cudaMemset(_buf, 0x7a, buf_sz) );
// Host array for holding sequence device pointers
char **seq = new char*[nitems];
size_t offset = 0;
for(int i=0; i<nitems; i++, offset += item_sz) {
seq[i] = _buf + offset;
}
// Device array holding sequence pointers
char **_seq;
size_t seq_sz = sizeof(char*) * size_t(nitems);
gpuQ( cudaMalloc((void **)&_seq, seq_sz) );
gpuQ( cudaMemcpy(_seq, seq, seq_sz, cudaMemcpyHostToDevice) );
// Host copy of the xdev structure to copy to the device
collapsed xdev_host;
xdev_host.num = nitems;
xdev_host.seq = _seq;
// Copy to device symbol
gpuQ( cudaMemcpyToSymbol(xdev, &xdev_host, sizeof(collapsed)) );
// Run Kernel
kernel<<<1,nitems>>>(item_sz);
// Copy back buffer
char *buf = new char[buf_sz];
gpuQ( cudaMemcpy(buf, _buf, buf_sz, cudaMemcpyDeviceToHost) );
// Print out seq values
// Each string should be ASCII starting from ´0´ (0x30)
char *seq_vals = buf;
for(int i=0; i<nitems; i++, seq_vals += item_sz) {
std::string s;
s.append(seq_vals, item_sz);
std::cout << s << std::endl;
}
return 0;
}
and here it is compiled and run:
$ /usr/local/cuda/bin/nvcc -arch=sm_12 -Xptxas=-v -g -G -o erogol erogol.cu
./erogol.cu(19): Warning: Cannot tell what pointer points to, assuming global memory space
ptxas info : 8 bytes gmem, 4 bytes cmem[14]
ptxas info : Compiling entry function '_Z6kernelm' for 'sm_12'
ptxas info : Used 5 registers, 20 bytes smem, 4 bytes cmem[1]
$ /usr/local/cuda/bin/cuda-memcheck ./erogol
========= CUDA-MEMCHECK
0000000000000000
1111111111111111
2222222222222222
3333333333333333
4444444444444444
5555555555555555
6666666666666666
7777777777777777
8888888888888888
9999999999999999
::::::::::::::::
;;;;;;;;;;;;;;;;
<<<<<<<<<<<<<<<<
================
>>>>>>>>>>>>>>>>
????????????????
@@@@@@@@@@@@@@@@
AAAAAAAAAAAAAAAA
BBBBBBBBBBBBBBBB
CCCCCCCCCCCCCCCC
DDDDDDDDDDDDDDDD
EEEEEEEEEEEEEEEE
FFFFFFFFFFFFFFFF
GGGGGGGGGGGGGGGG
HHHHHHHHHHHHHHHH
IIIIIIIIIIIIIIII
JJJJJJJJJJJJJJJJ
KKKKKKKKKKKKKKKK
LLLLLLLLLLLLLLLL
MMMMMMMMMMMMMMMM
NNNNNNNNNNNNNNNN
OOOOOOOOOOOOOOOO
========= ERROR SUMMARY: 0 errors
Some notes:
_buf
to hold all of the string data. Each value of seq
is set to a different address within _buf
. This is functionally equivalent to running a separate cudaMalloc
call for each pointer, but much faster.xdev_host
are device pointers. The CUDA API doesn't have any sort of deep copy or automatic pointer translation facility, so it is the programmer's responsibility to make sure this is correct.__device__
symbol (again to simplify things slightly). But otherwise the sequence of operations is what you would need to make your design pattern work._buf
, I can transfer all the sequences back to the host with a single cudaMemcpy call. Upvotes: 2