With dynamic parallelism in CUDA, you can launch kernels from GPU code on devices of compute capability 3.5 and higher. I have a wrapper function that takes a pointer to the kernel I want to run, and it either invokes it on the CPU for older devices or launches it on the GPU for newer devices. The CPU fallback path works fine, but the GPU path does not: it fails with an error saying the memory alignment is incorrect.
Is there a way to do this in CUDA 7? Are there some lower-level calls that will give me a pointer address that's correct on the GPU?
The code is below; the template parameter TFunc is an attempt to get the compiler to do something different, but I've tried it strongly typed as well.
template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 320)
  (*func)<<<1, 1>>>(args...);
#else
  printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const systemInfo *sysInfo, int count, TArgs... args)
{
  if (sysInfo->getCurrentDevice()->compareVersion("3.2") > 0)
  {
    printf("Iterate on GPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
  else
  {
    printf("Iterate on CPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
}
EDIT:
At the time that I originally wrote this answer, I believe the statements were correct: it was not possible to take a kernel address in host code. However, I believe something has changed in CUDA since then, so now (in CUDA 8, and maybe prior) it is possible to take a kernel address in host code (it's still not possible to take the address of a __device__ function in host code, however).
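As a minimal sketch of what that newer behavior allows (assuming CUDA 8 or later; the parent/child names are just illustrative, and dynamic parallelism still requires cc 3.5+, -rdc=true, and -lcudadevrt):
#include <stdio.h>

__global__ void child(){
  printf("hello from child\n");
}

// parent receives a kernel pointer and launches it via dynamic parallelism
__global__ void parent(void (*k)()){
  k<<<1,1>>>();
}

int main(){
  // with CUDA 8, naming a __global__ function in host code yields an
  // address that is also valid for a device-side launch
  parent<<<1,1>>>(child);
  cudaDeviceSynchronize();
  return 0;
}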
ORIGINAL ANSWER:
It seems like this question comes up from time to time, although the previous examples I can think of have to do with calling __device__ functions instead of __global__ functions.
In general it's illegal to take the address of a device entity (variable, function) in host code.
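To make the variable half of that concrete: host code reaches a __device__ variable through the runtime symbol API rather than by taking its address directly. A minimal sketch (the d_counter name is just illustrative):
#include <stdio.h>

__device__ int d_counter = 42;

int main(){
  // &d_counter in host code would not be a valid device address;
  // the runtime API mediates access to device symbols instead
  int h_val = 0;
  cudaMemcpyFromSymbol(&h_val, d_counter, sizeof(int));  // copy the value to the host
  int *d_ptr = NULL;
  cudaGetSymbolAddress((void **)&d_ptr, d_counter);      // get a device pointer usable in API calls
  printf("value: %d, device address: %p\n", h_val, (void *)d_ptr);
  return 0;
}
There is no comparable host-side API in CUDA 7 that hands back the device-side address of a __global__ function, which is what the workaround below addresses.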
One possible method to work around this (although the utility of this is not clear to me; it seems like there would be simpler dispatch mechanisms) is to extract the device address needed "in device code" and return that value to the host for dispatch usage. In this case, I am creating a simple example that extracts the needed device addresses into __device__ variables, but you could also write a kernel to do this setup (i.e. to "give me a pointer address that's correct on the GPU", in your words).
Here's a rough worked example, building on the code you have shown:
$ cat t746.cu
#include <stdio.h>

__global__ void ckernel1(){
  printf("hello1\n");
}
__global__ void ckernel2(){
  printf("hello2\n");
}
__global__ void ckernel3(){
  printf("hello3\n");
}

// device-side function pointers initialized to the kernels' device addresses
__device__ void (*pck1)() = ckernel1;
__device__ void (*pck2)() = ckernel2;
__device__ void (*pck3)() = ckernel3;

template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
  (*func)<<<1, 1>>>(args...);
#else
  printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const int sysInfo, int count, TArgs... args)
{
  if (sysInfo >= 350)
  {
    printf("Iterate on GPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
  else
  {
    printf("Iterate on CPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
}

int main(){

  void (*h_ckernel1)();
  void (*h_ckernel2)();
  void (*h_ckernel3)();

  // copy the device-side addresses back to the host for dispatch
  cudaMemcpyFromSymbol(&h_ckernel1, pck1, sizeof(void *));
  cudaMemcpyFromSymbol(&h_ckernel2, pck2, sizeof(void *));
  cudaMemcpyFromSymbol(&h_ckernel3, pck3, sizeof(void *));
  Iterate(h_ckernel1, 350, 1);
  Iterate(h_ckernel2, 350, 1);
  Iterate(h_ckernel3, 350, 1);
  cudaDeviceSynchronize();
  return 0;
}
$ nvcc -std=c++11 -arch=sm_35 -o t746 t746.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t746
========= CUDA-MEMCHECK
Iterate on GPU
Iterate on GPU
Iterate on GPU
hello1
hello2
hello3
========= ERROR SUMMARY: 0 errors
$
The above (__device__ variable) method probably can't be made to work with templated child kernels, but it might be possible to create a templated "extractor" kernel that returns the address of an (instantiated) templated child kernel. A rough idea of the "extractor" setup_kernel method is given in the previous answer I linked. Here's a rough example of the templated child kernel/extractor kernel method:
$ cat t746.cu
#include <stdio.h>

template <typename T>
__global__ void ckernel1(T *data){
  int my_val = (int)(*data+1);
  printf("hello: %d \n", my_val);
}

template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
  (*func)<<<1, 1>>>(args...);
#else
  printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const int sysInfo, int count, TArgs... args)
{
  if (sysInfo >= 350)
  {
    printf("Iterate on GPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
  else
  {
    printf("Iterate on CPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
}

// "extractor" kernel: stores the device-side address of the
// instantiated templated kernel where the host can copy it from
template <typename T>
__global__ void extractor(void (**kernel)(T *)){
  *kernel = ckernel1<T>;
}

template <typename T>
void run_test(T init){

  void (*h_ckernel1)(T *);
  void (**d_ckernel1)(T *);
  T *d_data;
  cudaMalloc(&d_ckernel1, sizeof(void *));
  cudaMalloc(&d_data, sizeof(T));
  cudaMemcpy(d_data, &init, sizeof(T), cudaMemcpyHostToDevice);
  extractor<<<1,1>>>(d_ckernel1);
  cudaMemcpy((void *)&h_ckernel1, (void *)d_ckernel1, sizeof(void *), cudaMemcpyDeviceToHost);
  Iterate(h_ckernel1, 350, 1, d_data);
  cudaDeviceSynchronize();
  cudaFree(d_ckernel1);
  cudaFree(d_data);
  return;
}

int main(){

  run_test(1);
  run_test(2.0f);
  return 0;
}
$ nvcc -std=c++11 -arch=sm_35 -o t746 t746.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t746
========= CUDA-MEMCHECK
Iterate on GPU
hello: 2
Iterate on GPU
hello: 3
========= ERROR SUMMARY: 0 errors
$