Reputation: 33
I have a C++ project where I need to integrate some CUDA into an existing codebase to take advantage of cuSPARSE and cuBLAS. The CUDA portion of the code would be called fairly often (like inside a for loop), so if I were to use:
extern "C" void doStuff(int N, double *arg1, double* arg2);
as is suggested by cppIntegration
in the NVIDIA sample codes, things I would ideally like to keep in device memory between calls would have to be transferred at each call. Is there any way around that? Is it possible to create a class for the CUDA stuff that lives for the program's lifetime, and declare something like
extern "C" class stuffHandler;
and then from my C++ program invoke:
stuff_handler.doStuff();
Upvotes: 0
Views: 1061
Reputation: 152259
I have a C++ project
Is it possible to create a class for CUDA stuff that lives during program lifetime
Sure. I'm not sure what the extern "C" stuff has to do with a C++ project; CUDA is essentially a C++-style language definition.
Here is an example:
$ cat t1901.cu
#include <iostream>
#include <vector>
__global__ void my_work(int N, double *X, double *Y, double *scale){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < N)
        Y[idx] += X[idx] * (*scale);
}
class stuffHandler
{
    double persistent;
    double *dev_persistent = NULL;
public:
    stuffHandler(double persistent_) : persistent(persistent_) {
        cudaMalloc(&dev_persistent, sizeof(double));
        cudaMemcpy(dev_persistent, &persistent, sizeof(double), cudaMemcpyHostToDevice);}
    void doStuff(int N, double *arg1, double *arg2){
        double *d_arg1, *d_arg2;
        cudaMalloc(&d_arg1, N*sizeof(double));
        cudaMalloc(&d_arg2, N*sizeof(double));
        cudaMemcpy(d_arg1, arg1, N*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(d_arg2, arg2, N*sizeof(double), cudaMemcpyHostToDevice);
        my_work<<<(N+255)/256, 256>>>(N, d_arg1, d_arg2, dev_persistent);
        cudaMemcpy(arg2, d_arg2, N*sizeof(double), cudaMemcpyDeviceToHost);
        cudaFree(d_arg1);
        cudaFree(d_arg2);}
    ~stuffHandler(){if (dev_persistent) cudaFree(dev_persistent);}
};
int main(){
    int my_N = 4;
    double scale = 1.5;
    stuffHandler stuff_handler(scale);
    std::vector<double> v1(my_N, 0.1);
    std::vector<double> v2(my_N, 0.2);
    stuff_handler.doStuff(my_N, v1.data(), v2.data());
    std::cout << v2[0] << std::endl;
}
$ nvcc -o t1901 t1901.cu
$ compute-sanitizer ./t1901
========= COMPUTE-SANITIZER
0.35
========= ERROR SUMMARY: 0 errors
$
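Since your question mentions cuSPARSE and cuBLAS: the same idea extends to library handles, which can be created once in the constructor and reused on every call rather than being created and destroyed around each call. What follows is just a minimal sketch of that, not part of the example above; the names blasHandler and doAxpy are made up for illustration, the cublasDaxpy call stands in for whatever your real library work is, and error checking is omitted:
#include <cublas_v2.h>
#include <iostream>
#include <vector>
class blasHandler
{
    cublasHandle_t handle;
    double *d_x = NULL, *d_y = NULL;
    int capacity;
public:
    blasHandler(int max_N) : capacity(max_N) {
        // handle and device buffers are created once and persist between calls
        cublasCreate(&handle);
        cudaMalloc(&d_x, capacity*sizeof(double));
        cudaMalloc(&d_y, capacity*sizeof(double));}
    void doAxpy(int N, double alpha, const double *x, double *y){
        // only the per-call data is transferred; the handle and buffers are reused
        cudaMemcpy(d_x, x, N*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(d_y, y, N*sizeof(double), cudaMemcpyHostToDevice);
        cublasDaxpy(handle, N, &alpha, d_x, 1, d_y, 1);   // y = alpha*x + y
        cudaMemcpy(y, d_y, N*sizeof(double), cudaMemcpyDeviceToHost);}
    ~blasHandler(){
        cublasDestroy(handle);
        cudaFree(d_x);
        cudaFree(d_y);}
};
int main(){
    int my_N = 4;
    blasHandler blas_handler(my_N);
    std::vector<double> v1(my_N, 0.1);
    std::vector<double> v2(my_N, 0.2);
    blas_handler.doAxpy(my_N, 1.5, v1.data(), v2.data());
    std::cout << v2[0] << std::endl;   // expect 0.35, as above
}
Something like nvcc -o t1901_blas t1901_blas.cu -lcublas should build it. A cuSPARSE handle (cusparseCreate / cusparseDestroy) can be held persistently the same way.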
Responding to a question in the comments, you could rearrange the above as follows for a typical multi-module project implementation:
$ cat t1901.h
class stuffHandler
{
    double persistent;
    double *dev_persistent = NULL;
public:
    stuffHandler(double persistent_);
    void doStuff(int N, double *arg1, double *arg2);
    ~stuffHandler();
};
$ cat t1901.cu
#include "t1901.h"
__global__ void my_work(int N, double *X, double *Y, double *scale){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < N)
        Y[idx] += X[idx] * (*scale);
}
stuffHandler::stuffHandler(double persistent_) : persistent(persistent_) {
    cudaMalloc(&dev_persistent, sizeof(double));
    cudaMemcpy(dev_persistent, &persistent, sizeof(double), cudaMemcpyHostToDevice);}
void stuffHandler::doStuff(int N, double *arg1, double *arg2){
    double *d_arg1, *d_arg2;
    cudaMalloc(&d_arg1, N*sizeof(double));
    cudaMalloc(&d_arg2, N*sizeof(double));
    cudaMemcpy(d_arg1, arg1, N*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_arg2, arg2, N*sizeof(double), cudaMemcpyHostToDevice);
    my_work<<<(N+255)/256, 256>>>(N, d_arg1, d_arg2, dev_persistent);
    cudaMemcpy(arg2, d_arg2, N*sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(d_arg1);
    cudaFree(d_arg2);}
stuffHandler::~stuffHandler(){if (dev_persistent) cudaFree(dev_persistent);}
$ cat main.cpp
#include <iostream>
#include <vector>
#include "t1901.h"
int main(){
    int my_N = 4;
    double scale = 1.5;
    stuffHandler stuff_handler(scale);
    std::vector<double> v1(my_N, 0.1);
    std::vector<double> v2(my_N, 0.2);
    stuff_handler.doStuff(my_N, v1.data(), v2.data());
    std::cout << v2[0] << std::endl;
}
$ nvcc -o t1901 t1901.cu main.cpp
$ compute-sanitizer ./t1901
========= COMPUTE-SANITIZER
0.35
========= ERROR SUMMARY: 0 errors
$
If you wanted to break the compilation and linking steps apart, you could do:
nvcc -c t1901.cu
g++ -c main.cpp
g++ main.o t1901.o -o test -L/usr/local/cuda/lib64 -lcudart
Or in an MPI project, replace the above g++ with mpicxx.
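For completeness, here is a rough sketch of what main.cpp might look like in that MPI case; the MPI calls (MPI_Init, MPI_Comm_rank, MPI_Finalize) are the standard ones, nothing about the class changes, and any per-rank GPU selection (e.g. cudaSetDevice) is omitted:
#include <mpi.h>
#include <iostream>
#include <vector>
#include "t1901.h"
int main(int argc, char *argv[]){
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int my_N = 4;
    double scale = 1.5;
    stuffHandler stuff_handler(scale);     // one persistent object per rank
    std::vector<double> v1(my_N, 0.1);
    std::vector<double> v2(my_N, 0.2);
    for (int i = 0; i < 3; i++)            // repeated calls; dev_persistent stays on the device throughout
        stuff_handler.doStuff(my_N, v1.data(), v2.data());
    if (rank == 0) std::cout << v2[0] << std::endl;
    MPI_Finalize();
}
That would be built as described above, i.e. mpicxx -c main.cpp followed by mpicxx main.o t1901.o -o test -L/usr/local/cuda/lib64 -lcudart.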
After some additional discussion in the comments, contrary to your question title and first sentence, you actually have a C project, not a C++ one (you want to do the final link with mpicc, which is a C compiler).
In that case we could lay out the above code somewhat differently, and refer to some instructions here for getting all of the C++ linking in order. Here is another example:
$ cat t1902.h
#ifdef __cplusplus
extern "C"
#endif
void C_init(double scale);
#ifdef __cplusplus
extern "C"
#endif
void C_doStuff(int N, double *arg1, double *arg2);
#ifdef __cplusplus
extern "C"
#endif
void C_end();
$ cat t1902.cu
#include <iostream>
#include <vector>
__global__ void my_work(int N, double *X, double *Y, double *scale){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < N)
        Y[idx] += X[idx] * (*scale);
}
class stuffHandler
{
    double persistent;
    double *dev_persistent = NULL;
public:
    stuffHandler(double persistent_) : persistent(persistent_) {
        cudaMalloc(&dev_persistent, sizeof(double));
        cudaMemcpy(dev_persistent, &persistent, sizeof(double), cudaMemcpyHostToDevice);}
    void doStuff(int N, double *arg1, double *arg2){
        double *d_arg1, *d_arg2;
        cudaMalloc(&d_arg1, N*sizeof(double));
        cudaMalloc(&d_arg2, N*sizeof(double));
        cudaMemcpy(d_arg1, arg1, N*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(d_arg2, arg2, N*sizeof(double), cudaMemcpyHostToDevice);
        my_work<<<(N+255)/256, 256>>>(N, d_arg1, d_arg2, dev_persistent);
        cudaMemcpy(arg2, d_arg2, N*sizeof(double), cudaMemcpyDeviceToHost);
        cudaFree(d_arg1);
        cudaFree(d_arg2);}
    void finish(){if (dev_persistent) cudaFree(dev_persistent);}
};
stuffHandler *stuff_handler = NULL;
extern "C" void C_doStuff(int N, double *arg1, double *arg2){
    if (stuff_handler)
        stuff_handler->doStuff(N, arg1, arg2);
}
extern "C" void C_end(){
    if (stuff_handler) {
        stuff_handler->finish();
        delete stuff_handler;}
    stuff_handler = NULL;
}
extern "C" void C_init(double scale){
    if (stuff_handler) C_end();
    stuff_handler = new stuffHandler(scale);
}
$ cat main.c
#include <stdio.h>
#include <stdlib.h>
#include "t1902.h"
int main(){
    int i,my_N = 4;
    double scale = 1.5;
    C_init(scale);
    double *d1 = malloc(my_N*sizeof(double));
    double *d2 = malloc(my_N*sizeof(double));
    for (i=0; i < my_N; i++) {
        d1[i] = 0.1;
        d2[i] = 0.2;}
    C_doStuff(my_N, d1, d2);
    printf("%f\n", d2[0]);
    C_end();
}
$ nvcc -c t1902.cu
$ gcc -c main.c
$ gcc -o test main.o t1902.o -L/usr/local/cuda/lib64 -lcudart_static -lculibos -lpthread -lrt -ldl -lstdc++
$ compute-sanitizer ./test
========= COMPUTE-SANITIZER
0.350000
========= ERROR SUMMARY: 0 errors
$
In the compilation sequence above, it should be possible to replace gcc with mpicc.
Upvotes: 2