David
David

Reputation: 337

using std::thread and CUDA together

I'm looking for a quick example of using std::thread and CUDA together. When using multiple host threads, does each host thread need to be assigned a certain number of GPU threads that do not overlap with each other?

Upvotes: 0

Views: 1937

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 152174

You can use std::thread and CUDA together.

There is no particular arrangement required for the association between threads and GPUs. You can have 1 thread manage all GPUs, one per GPU, 4 per GPU, all threads talk to all GPUs, or whatever you like. (There is no relationship whatsoever between GPU threads and host threads, assuming by GPU threads you mean GPU threads in device code. )

Libraries like CUFFT and CUBLAS may have certain expectations about handle usage, typically that you must not share a handle between threads, and handles are inherently device-specific.

Here's a worked example demonstrating 4 threads (one per GPU) followed by one thread dispatching work to all 4 GPUs:

$ cat t1457.cu
#include <thread>
#include <vector>
#include <iostream>
#include <cstdio>

// Trivial device kernel: prints the integer tag supplied by whichever host
// thread launched it, via device-side printf. Used only to show which
// host-thread/GPU pairing produced each line of output.
__global__ void k(int n){
  printf("hello from thread %d\n", n);
}

void thread_func(int n){

  if (n >= 0){
    cudaSetDevice(n);
    k<<<1,1>>>(n);
    cudaDeviceSynchronize();}
  else{
    cudaError_t err = cudaGetDeviceCount(&n);
    for (int i = 0; i < n; i++){
      cudaSetDevice(i);
      k<<<1,1>>>(-1);}
    for (int i = 0; i <n; i++){
      cudaSetDevice(i);
      cudaDeviceSynchronize();}}
}

int main(){

  // Query the number of CUDA devices; abort with the error code on failure.
  int device_count = 0;
  cudaError_t err = cudaGetDeviceCount(&device_count);
  if (err != cudaSuccess) {std::cout << "error " << (int)err << std::endl; return 0;}
  // Phase 1: one host thread per GPU, each running thread_func(i).
  std::vector<std::thread> workers;
  for (int i = 0; i < device_count; ++i)
    workers.emplace_back(thread_func, i);
  std::cout << device_count << " threads started" << std::endl;
  for (auto& w : workers)
    w.join();
  std::cout << "join finished" << std::endl;
  // Phase 2: a single host thread drives all GPUs (the n == -1 path).
  std::thread all_gpus(thread_func, -1);
  all_gpus.join();
  std::cout << "finished" << std::endl;
  return 0;
}
$ nvcc -o t1457 t1457.cu -std=c++11
$ ./t1457
4 threads started
hello from thread 1
hello from thread 3
hello from thread 2
hello from thread 0
join finished
hello from thread -1
hello from thread -1
hello from thread -1
hello from thread -1
finished
$

Here's an example showing 4 threads issuing work to a single GPU:

$ cat t1459.cu
#include <thread>
#include <vector>
#include <iostream>
#include <cstdio>

// Trivial device kernel: prints the integer tag supplied by the launching
// host thread via device-side printf, identifying which host thread's
// launch produced each output line.
__global__ void k(int n){
  printf("hello from thread %d\n", n);
}

// Host-thread entry point: every thread deliberately selects device 0, so
// all four host threads issue work to the same GPU. No stream is specified
// on the launch, so the kernels go to the device's default stream.
void thread_func(int n){

    cudaSetDevice(0);
    k<<<1,1>>>(n);
    // Block this host thread until its kernel has finished on the GPU.
    cudaDeviceSynchronize();
}

int main(){

  // Spin up a fixed number of host threads, all targeting the same GPU.
  const int worker_count = 4;
  std::vector<std::thread> workers;
  workers.reserve(worker_count);
  for (int i = 0; i < worker_count; ++i)
    workers.emplace_back(thread_func, i);
  std::cout << worker_count << " threads started" << std::endl;
  for (auto& w : workers)
    w.join();
  std::cout << "join finished" << std::endl;
  return 0;
}
$ nvcc t1459.cu -o t1459 -std=c++11
$ ./t1459
4 threads started
hello from thread 0
hello from thread 1
hello from thread 3
hello from thread 2
join finished
$

Upvotes: 1

Related Questions