CUDA programming with pthreads

Question

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <cuda_runtime.h>

#define ARR_SIZE    10
#define NUM_DEVICE  1

/* Per-thread work description handed to thread_func through its void* argument. */
typedef struct {
   int *arr;        /* source buffer of the host-to-device copy in thread_func; also printed on the host */
   int *dev_arr;    /* destination of that copy; passed to kernel_fc as its input array */
   int *dev_result; /* passed to kernel_fc, which atomicAdd()s every element into it */
   int *result;     /* destination of the device-to-host copy of *dev_result (one int) */
   int num;         /* passed to cudaSetDevice() and used as the thread label in log output */
} cuda_st;

/*
 * Debug kernel: every thread prints one element of dev_arr and atomically
 * adds it to the single int pointed to by dev_result.
 *
 * Launch layout: indexing uses threadIdx.x only, so the kernel expects one
 * block with one thread per element. Threads whose index is >= ARR_SIZE
 * return immediately, so an over-sized block cannot read past the array.
 *
 * dev_result is only accumulated into, never initialised here; the caller is
 * responsible for zeroing it before the launch.
 */
__global__ void kernel_fc(int *dev_arr, int *dev_result)
{
    int idx = threadIdx.x;
    if (idx >= ARR_SIZE) {
        return;
    }
    /* Device-side printf is serialized and slow: debugging aid only. */
    printf("dev_arr[%d] = %d\n", idx, dev_arr[idx]);
    atomicAdd(dev_result, dev_arr[idx]);
}

/*
 * pthread entry point: runs one copy -> kernel -> copy-back round trip on the
 * device selected by data->num.
 *
 * struc must point to a cuda_st. The function prints the host array, selects
 * the device, copies arr to dev_arr, launches kernel_fc with one block of
 * ARR_SIZE threads, and copies the single int at dev_result back to result.
 *
 * Every CUDA call is checked. On the first failure the remaining CUDA steps
 * are skipped and the error string is written to stderr, so a buffer that was
 * allocated on a different device than data->num shows up as a reported
 * error instead of a silent failure.
 *
 * Always returns NULL.
 */
void *thread_func(void* struc)
{
    cuda_st *data = (cuda_st*)struc;
    cudaError_t err;

    printf("thread %d func start\n", data->num);
    printf("arr %d = ", data->num);
    for (int i = 0; i < ARR_SIZE; i++) {
        printf("%d ", data->arr[i]);
    }
    printf("\n");

    /* The current device is per host thread, so each thread selects its own. */
    err = cudaSetDevice(data->num);
    if (err == cudaSuccess) {
        err = cudaMemcpy(data->dev_arr, data->arr, sizeof(int) * ARR_SIZE,
                         cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        kernel_fc<<<1, ARR_SIZE>>>(data->dev_arr, data->dev_result);
        /* Launches return no status; launch errors are read back here. */
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        /* Blocking copy: also surfaces errors raised while the kernel ran. */
        err = cudaMemcpy(data->result, data->dev_result, sizeof(int),
                         cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "thread %d: CUDA error: %s\n", data->num,
                cudaGetErrorString(err));
    }

    printf("thread %d func exit\n", data->num);
    return NULL;
}

int main(void)
{
    // Make object
    cuda_st cuda[NUM_DEVICE];

    // Make thread
    pthread_t pthread[NUM_DEVICE];

    // Host array memory allocation
    int *arr[NUM_DEVICE];
    for(int i=0; i



I wrote this simple test program to try out pthreads with multi-device CUDA code.

When NUM_DEVICE is set to 1, it works well, but when it is set to 2, the program stops.

I guess this is because multiple threads call cudaSetDevice, but I don't know how to handle this.

I previously tried to write my program with a single host thread and multiple devices (using the Async functions), but in my case (not the simple code above) there is a lot of host code between the kernel launches, so it does not run well asynchronously.

So I am testing multiple host threads before applying this approach to my real code, but I ran into the problem described above.

Do I have to use the asynchronous versions of the CUDA functions and kernel launches?

Please give me some advice.

vinograd47 · Accepted Answer

The problem is that you allocate all of the memory on one device. You need to call cudaSetDevice before the cudaMalloc calls, so that each buffer is allocated on the device that will use it:

// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for(int i=0; i

cuda programming with pthread

Answers (1)

Related Questions