With dynamic parallelism in CUDA, you can launch kernels from GPU code on devices of compute capability 3.5 and higher. I have a wrapper function that takes a pointer to the kernel I want to run, and it either invokes it on the CPU for older devices or launches it on the GPU for newer devices. The CPU fallback path works fine, but the GPU path does not: it fails with an error saying the memory alignment is incorrect.
Is there a way to do this in CUDA 7? Are there some lower-level calls that will give me a pointer address that's correct on the GPU?
The code is below; the template parameter TFunc is an attempt to get the compiler to do something different, but I've tried it strongly typed as well.
template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 320)
  (*func)<<<1, 1>>>(args...);
#else
  printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const systemInfo *sysInfo, int count, TArgs... args)
{
  if (sysInfo->getCurrentDevice()->compareVersion("3.2") > 0)
  {
    printf("Iterate on GPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
  else
  {
    printf("Iterate on CPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
}
EDIT:
At the time that I originally wrote this answer, I believe the statements were correct: it was not possible to take a kernel address in host code. However, I believe something has changed in CUDA since then, so now (in CUDA 8, and maybe prior) it is possible to take a kernel address in host code (it's still not possible to take the address of a __device__ function in host code, however).
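As a minimal sketch of what that newer behavior allows (assuming CUDA 8 or later; the parent/child names are just illustrative, and dynamic parallelism still requires cc 3.5+, -rdc=true, and -lcudadevrt):
#include <stdio.h>

__global__ void child(){
  printf("hello from child\n");
}

// parent receives a kernel pointer and launches it via dynamic parallelism
__global__ void parent(void (*k)()){
  k<<<1,1>>>();
}

int main(){
  // with CUDA 8, naming a __global__ function in host code yields an
  // address that is also valid for a device-side launch
  parent<<<1,1>>>(child);
  cudaDeviceSynchronize();
  return 0;
}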
ORIGINAL ANSWER:
It seems like this question comes up from time to time, although the previous examples I can think of have to do with calling __device__ functions instead of __global__ functions.
In general it's illegal to take the address of a device entity (variable, function) in host code.
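To make the variable half of that concrete: host code reaches a __device__ variable through the runtime symbol API rather than by taking its address directly. A minimal sketch (the d_counter name is just illustrative):
#include <stdio.h>

__device__ int d_counter = 42;

int main(){
  // &d_counter in host code would not be a valid device address;
  // the runtime API mediates access to device symbols instead
  int h_val = 0;
  cudaMemcpyFromSymbol(&h_val, d_counter, sizeof(int));  // copy the value to the host
  int *d_ptr = NULL;
  cudaGetSymbolAddress((void **)&d_ptr, d_counter);      // get a device pointer usable in API calls
  printf("value: %d, device address: %p\n", h_val, (void *)d_ptr);
  return 0;
}
There is no comparable host-side API in CUDA 7 that hands back the device-side address of a __global__ function, which is what the workaround below addresses.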
One possible method to work around this (although the utility of this is not clear to me; it seems like there would be simpler dispatch mechanisms) is to extract the device address needed "in device code" and return that value to the host for dispatch usage. In this case, I am creating a simple example that extracts the needed device addresses into __device__ variables, but you could also write a kernel to do this setup (i.e. to "give me a pointer address that's correct on the GPU", in your words).
Here's a rough worked example, building on the code you have shown:
$ cat t746.cu
#include <stdio.h>

__global__ void ckernel1(){
  printf("hello1\n");
}
__global__ void ckernel2(){
  printf("hello2\n");
}
__global__ void ckernel3(){
  printf("hello3\n");
}

// device-side function pointers initialized to the kernels' device addresses
__device__ void (*pck1)() = ckernel1;
__device__ void (*pck2)() = ckernel2;
__device__ void (*pck3)() = ckernel3;

template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
  (*func)<<<1, 1>>>(args...);
#else
  printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const int sysInfo, int count, TArgs... args)
{
  if (sysInfo >= 350)
  {
    printf("Iterate on GPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
  else
  {
    printf("Iterate on CPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
}

int main(){

  void (*h_ckernel1)();
  void (*h_ckernel2)();
  void (*h_ckernel3)();

  // copy the device-side addresses back to the host for dispatch
  cudaMemcpyFromSymbol(&h_ckernel1, pck1, sizeof(void *));
  cudaMemcpyFromSymbol(&h_ckernel2, pck2, sizeof(void *));
  cudaMemcpyFromSymbol(&h_ckernel3, pck3, sizeof(void *));
  Iterate(h_ckernel1, 350, 1);
  Iterate(h_ckernel2, 350, 1);
  Iterate(h_ckernel3, 350, 1);
  cudaDeviceSynchronize();
  return 0;
}
$ nvcc -std=c++11 -arch=sm_35 -o t746 t746.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t746
========= CUDA-MEMCHECK
Iterate on GPU
Iterate on GPU
Iterate on GPU
hello1
hello2
hello3
========= ERROR SUMMARY: 0 errors
$
The above (__device__ variable) method probably can't be made to work with templated child kernels, but it might be possible to create a templated "extractor" kernel that returns the address of an (instantiated) templated child kernel. A rough idea of the "extractor" setup_kernel method is given in the previous answer I linked. Here's a rough example of the templated child kernel/extractor kernel method:
$ cat t746.cu
#include <stdio.h>

template <typename T>
__global__ void ckernel1(T *data){
  int my_val = (int)(*data+1);
  printf("hello: %d \n", my_val);
}

template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
  (*func)<<<1, 1>>>(args...);
#else
  printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const int sysInfo, int count, TArgs... args)
{
  if (sysInfo >= 350)
  {
    printf("Iterate on GPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
  else
  {
    printf("Iterate on CPU\n");
    Test<<<1, 1>>>(kernel, count, args...);
  }
}

// "extractor" kernel: stores the device-side address of the
// instantiated templated kernel where the host can copy it from
template <typename T>
__global__ void extractor(void (**kernel)(T *)){
  *kernel = ckernel1<T>;
}

template <typename T>
void run_test(T init){

  void (*h_ckernel1)(T *);
  void (**d_ckernel1)(T *);
  T *d_data;
  cudaMalloc(&d_ckernel1, sizeof(void *));
  cudaMalloc(&d_data, sizeof(T));
  cudaMemcpy(d_data, &init, sizeof(T), cudaMemcpyHostToDevice);
  extractor<<<1,1>>>(d_ckernel1);
  cudaMemcpy((void *)&h_ckernel1, (void *)d_ckernel1, sizeof(void *), cudaMemcpyDeviceToHost);
  Iterate(h_ckernel1, 350, 1, d_data);
  cudaDeviceSynchronize();
  cudaFree(d_ckernel1);
  cudaFree(d_data);
  return;
}

int main(){

  run_test(1);
  run_test(2.0f);
  return 0;
}
$ nvcc -std=c++11 -arch=sm_35 -o t746 t746.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t746
========= CUDA-MEMCHECK
Iterate on GPU
hello: 2
Iterate on GPU
hello: 3
========= ERROR SUMMARY: 0 errors
$