konovification
konovification

Reputation: 33

Integrating a CUDA class into a C++ program

I have a C++ project where I need to integrate some CUDA into an existing codebase to take advantage of cuSPARSE and cuBLAS. The CUDA portion of the code would be called fairly often (like inside a for loop), so if I were to use:

extern "C" void doStuff(int N, double *arg1, double* arg2);

as is suggested by cppIntegration in NVIDIA Sample codes, things I would ideally like to keep in device memory between calls would have to be transferred at each call. Is there any way around that? Is it possible to create a class for CUDA stuff that lives for the program's lifetime, declared with something like

extern "C" class stuffHandler;

and then from my C++ program invoke:

stuff_handler.doStuff();

Upvotes: 0

Views: 1061

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 152259

I have a C++ project

Is it possible to create a class for CUDA stuff that lives during program lifetime

Sure. I'm not sure what the extern "C" stuff has to do with a C++ project. CUDA C++ is defined as a C++-style language, so ordinary C++ classes work directly.

Here is an example:

$ cat t1901.cu
#include <iostream>
#include <vector>

// Kernel: Y[i] += X[i] * (*scale) for i in [0, N).
// Expects a 1D launch with at least N total threads; scale points to a
// single double resident in device memory (the "persistent" value).
__global__ void my_work(int N, double *X, double *Y, double *scale){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < N)  // tail guard: the grid is rounded up past N
    Y[idx] += X[idx] * (*scale);
}

// Host-side wrapper owning a device-resident scale value for the
// lifetime of the object, so it is uploaded once rather than per call.
class stuffHandler
{
  double persistent;              // host-side copy of the scale factor
  double *dev_persistent = NULL;  // device-resident copy, lives as long as the object
  public:
    // Uploads the scale value to device memory once; it stays resident
    // across all subsequent doStuff() calls.
    stuffHandler(double persistent_) : persistent(persistent_) {
      cudaMalloc(&dev_persistent, sizeof(double));
      cudaMemcpy(dev_persistent, &persistent, sizeof(double), cudaMemcpyHostToDevice);}
    // The destructor frees dev_persistent, so a shallow copy would
    // double-free it: forbid copying (rule of three).
    stuffHandler(const stuffHandler&) = delete;
    stuffHandler& operator=(const stuffHandler&) = delete;
    // Computes arg2[i] += arg1[i] * scale on the device for i in [0, N).
    // arg1/arg2 are host pointers; the data buffers are per-call
    // temporaries -- only the scale stays device-resident.
    void doStuff(int N, double *arg1, double *arg2){
      double *d_arg1, *d_arg2;
      cudaMalloc(&d_arg1, N*sizeof(double));
      cudaMalloc(&d_arg2, N*sizeof(double));
      cudaMemcpy(d_arg1, arg1, N*sizeof(double), cudaMemcpyHostToDevice);
      cudaMemcpy(d_arg2, arg2, N*sizeof(double), cudaMemcpyHostToDevice);
      my_work<<<(N+255)/256, 256>>>(N, d_arg1, d_arg2, dev_persistent);
      cudaMemcpy(arg2, d_arg2, N*sizeof(double), cudaMemcpyDeviceToHost);
      cudaFree(d_arg1);
      cudaFree(d_arg2);}
    // Releases the persistent device allocation.
    ~stuffHandler(){if (dev_persistent) cudaFree(dev_persistent);}
};

int main(){
  // One handler instance lives for the whole program; the scale value
  // is uploaded to the device exactly once, in its constructor.
  const int n = 4;
  const double scale_factor = 1.5;
  stuffHandler stuff_handler(scale_factor);
  std::vector<double> in(n, 0.1);
  std::vector<double> inout(n, 0.2);
  stuff_handler.doStuff(n, in.data(), inout.data());
  // 0.2 + 0.1 * 1.5 = 0.35
  std::cout << inout[0] << std::endl;
}
$ nvcc -o t1901 t1901.cu
$ compute-sanitizer ./t1901
========= COMPUTE-SANITIZER
0.35
========= ERROR SUMMARY: 0 errors
$

Responding to a question in the comments, you could rearrange the above as follows for a typical multi-module project implementation:

$ cat t1901.h

// Host-side wrapper that owns a device-resident scale value for the
// lifetime of the object (member definitions live in t1901.cu).
class stuffHandler
{
  double persistent;              // host-side copy of the scale factor
  double *dev_persistent = NULL;  // device copy, allocated by the constructor
  public:
    stuffHandler(double persistent_);                 // uploads scale to the device once
    void doStuff(int N, double *arg1, double *arg2);  // arg2[i] += arg1[i] * scale
    ~stuffHandler();                                  // frees the device allocation
};

$ cat t1901.cu
#include "t1901.h"

// Kernel: Y[i] += X[i] * (*scale) for i in [0, N).
// scale points to a single double resident in device memory.
__global__ void my_work(int N, double *X, double *Y, double *scale){
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i >= N) return;  // tail guard: grid may be larger than N
  Y[i] += X[i] * (*scale);
}

// Allocate the device-resident scale value and upload it once; it stays
// on the device until the destructor runs.
stuffHandler::stuffHandler(double persistent_) : persistent(persistent_) {
      cudaMalloc(&dev_persistent, sizeof(double));
      cudaMemcpy(dev_persistent, &persistent, sizeof(double), cudaMemcpyHostToDevice);}
// arg2[i] += arg1[i] * scale on the device for i in [0, N).
// arg1/arg2 are host pointers; the data buffers are per-call
// temporaries -- only the scale stays device-resident between calls.
void stuffHandler::doStuff(int N, double *arg1, double *arg2){
      double *d_arg1, *d_arg2;
      cudaMalloc(&d_arg1, N*sizeof(double));
      cudaMalloc(&d_arg2, N*sizeof(double));
      cudaMemcpy(d_arg1, arg1, N*sizeof(double), cudaMemcpyHostToDevice);
      cudaMemcpy(d_arg2, arg2, N*sizeof(double), cudaMemcpyHostToDevice);
      my_work<<<(N+255)/256, 256>>>(N, d_arg1, d_arg2, dev_persistent);
      cudaMemcpy(arg2, d_arg2, N*sizeof(double), cudaMemcpyDeviceToHost);
      cudaFree(d_arg1);
      cudaFree(d_arg2);}
// Release the persistent device allocation.
stuffHandler::~stuffHandler(){if (dev_persistent) cudaFree(dev_persistent);}
$ cat main.cpp
#include <iostream>
#include <vector>
#include "t1901.h"

int main(){
  int my_N = 4;        // number of elements
  double scale = 1.5;  // value kept resident on the device
  stuffHandler stuff_handler(scale);  // one-time upload of scale
  std::vector<double> v1(my_N, 0.1);
  std::vector<double> v2(my_N, 0.2);
  stuff_handler.doStuff(my_N, v1.data(), v2.data());
  std::cout << v2[0] << std::endl;  // 0.2 + 0.1*1.5 = 0.35
}
$ nvcc -o t1901 t1901.cu main.cpp
$ compute-sanitizer ./t1901
========= COMPUTE-SANITIZER
0.35
========= ERROR SUMMARY: 0 errors
$

If you wanted to break the compilation and linking steps apart, you could do:

nvcc -c t1901.cu
g++ -c main.cpp
g++ main.o t1901.o -o test -L/usr/local/cuda/lib64 -lcudart

Or in an MPI project, replace the above g++ with mpicxx

After some additional discussion in the comments, and contrary to your question title and first sentence, you actually have a C project, not a C++ one (you want to do the final link with mpicc, which is a C compiler).

In that case we could lay out the above code somewhat differently, and refer to some instructions here for getting all of our C++ linking in order. Here is another example:

$ cat t1902.h
// C-callable interface to the CUDA implementation in t1902.cu.
// Usable from both C (gcc/mpicc) and C++ (nvcc) translation units.
#ifndef T1902_H
#define T1902_H
#ifdef __cplusplus
extern "C" {
#endif
// Create the device-resident state (uploads scale); re-initializes if
// already initialized.
void C_init(double scale);
// arg2[i] += arg1[i] * scale for i in [0, N); arg1/arg2 are host pointers.
void C_doStuff(int N, double *arg1, double *arg2);
// Release the device-resident state; safe to call when uninitialized.
void C_end();
#ifdef __cplusplus
}
#endif
#endif /* T1902_H */
$ cat t1902.cu
#include <iostream>
#include <vector>

// Kernel: Y[i] += X[i] * (*scale) for i in [0, N).
// scale points to a single double resident in device memory.
__global__ void my_work(int N, double *X, double *Y, double *scale){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < N)  // tail guard: the grid is rounded up past N
    Y[idx] += X[idx] * (*scale);
}

// Host-side wrapper owning a device-resident scale value; cleanup is
// explicit via finish() (driven by the C_end() wrapper) rather than a
// destructor, to keep teardown ordering under the C caller's control.
class stuffHandler
{
  double persistent;              // host-side copy of the scale factor
  double *dev_persistent = NULL;  // device-resident copy; NULL when released
  public:
    // Uploads the scale value to device memory once; it stays resident
    // until finish() is called.
    stuffHandler(double persistent_) : persistent(persistent_) {
      cudaMalloc(&dev_persistent, sizeof(double));
      cudaMemcpy(dev_persistent, &persistent, sizeof(double), cudaMemcpyHostToDevice);}
    // arg2[i] += arg1[i] * scale on the device for i in [0, N).
    // arg1/arg2 are host pointers; the data buffers are per-call
    // temporaries -- only the scale stays device-resident.
    void doStuff(int N, double *arg1, double *arg2){
      double *d_arg1, *d_arg2;
      cudaMalloc(&d_arg1, N*sizeof(double));
      cudaMalloc(&d_arg2, N*sizeof(double));
      cudaMemcpy(d_arg1, arg1, N*sizeof(double), cudaMemcpyHostToDevice);
      cudaMemcpy(d_arg2, arg2, N*sizeof(double), cudaMemcpyHostToDevice);
      my_work<<<(N+255)/256, 256>>>(N, d_arg1, d_arg2, dev_persistent);
      cudaMemcpy(arg2, d_arg2, N*sizeof(double), cudaMemcpyDeviceToHost);
      cudaFree(d_arg1);
      cudaFree(d_arg2);}
    // Release the persistent allocation. Resetting the pointer makes a
    // repeated finish() call a harmless no-op instead of a double free.
    void finish(){
      if (dev_persistent) cudaFree(dev_persistent);
      dev_persistent = NULL;}
};

// Single global instance managed by the C entry points below; NULL when
// uninitialized.
stuffHandler *stuff_handler = NULL;
// C-callable: forward to the persistent handler; no-op if C_init() was
// never called.
extern "C" void C_doStuff(int N, double *arg1, double *arg2){
  if (stuff_handler)
    stuff_handler->doStuff(N, arg1, arg2);
}
// C-callable: release device state and destroy the handler; safe to
// call repeatedly or before C_init().
extern "C" void C_end(){
  if (stuff_handler) {
    stuff_handler->finish();
    delete stuff_handler;}
  stuff_handler = NULL;
}
// C-callable: (re)create the handler, uploading scale to the device.
extern "C" void C_init(double scale){
  if (stuff_handler) C_end();
  stuff_handler = new stuffHandler(scale);
}
$ cat main.c
#include <stdio.h>
#include <stdlib.h>
#include "t1902.h"

int main(){
  int i,my_N = 4;
  double scale = 1.5;
  C_init(scale);  /* one-time upload of scale to the device */
  double *d1 = malloc(my_N*sizeof(double));
  double *d2 = malloc(my_N*sizeof(double));
  if (!d1 || !d2) {  /* check malloc: the original dereferenced unchecked */
    fprintf(stderr, "allocation failure\n");
    free(d1);
    free(d2);
    C_end();
    return 1;
  }
  for (i=0; i < my_N; i++) {
    d1[i] = 0.1;
    d2[i] = 0.2;}
  C_doStuff(my_N, d1, d2);  /* d2[i] += d1[i] * scale */
  printf("%f\n", d2[0]);    /* 0.2 + 0.1*1.5 = 0.350000 */
  C_end();                  /* release device-resident state */
  free(d1);                 /* fix: original leaked both host buffers */
  free(d2);
  return 0;
}
$ nvcc -c t1902.cu
$ gcc -c main.c
$ gcc -o test main.o t1902.o -L/usr/local/cuda/lib64 -lcudart_static -lculibos -lpthread -lrt -ldl -lstdc++
$ compute-sanitizer ./test
========= COMPUTE-SANITIZER
0.350000
========= ERROR SUMMARY: 0 errors
$

In the compilation sequence above, it should be possible to replace gcc with mpicc.

Upvotes: 2

Related Questions