Reputation: 505
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define ARR_SIZE 10
#define NUM_DEVICE 1
typedef struct {
int *arr;
int *dev_arr;
int *dev_result;
int *result;
int num;
} cuda_st;
__global__ void kernel_fc(int *dev_arr, int *dev_result)
{
int idx = threadIdx.x;
printf("dev_arr[%d] = %d\n", idx, dev_arr[idx]);
atomicAdd(dev_result, dev_arr[idx]);
}
void *thread_func(void* struc)
{
cuda_st * data = (cuda_st*)struc;
printf("thread %d func start\n", data->num);
printf("arr %d = ", data->num);
for(int i=0; i<10; i++) {
printf("%d ", data->arr[i]);
}
printf("\n");
cudaSetDevice(data->num);
cudaMemcpy(data->dev_arr, data->arr, sizeof(int)*ARR_SIZE, cudaMemcpyHostToDevice);
kernel_fc<<<1,ARR_SIZE>>>(data->dev_arr, data->dev_result);
cudaMemcpy(data->result, data->dev_result, sizeof(int), cudaMemcpyDeviceToHost);
printf("thread %d func exit\n", data->num);
return NULL;
}
int main(void)
{
// Make object
cuda_st cuda[NUM_DEVICE];
// Make thread
pthread_t pthread[NUM_DEVICE];
// Host array memory allocation
int *arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
arr[i] = (int*)malloc(sizeof(int)*ARR_SIZE);
}
// Fill this host array up with specified data
for(int i=0; i<NUM_DEVICE; i++) {
for(int j=0; j<ARR_SIZE; j++) {
arr[i][j] = i*ARR_SIZE+j;
}
}
// To confirm host array data
for(int i=0; i<NUM_DEVICE; i++) {
printf("arr[%d] = ", i);
for(int j=0; j<ARR_SIZE; j++) {
printf("%d ", arr[i][j]);
}
printf("\n");
}
// Result memory allocation
int *result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
result[i] = (int*)malloc(sizeof(int));
memset(result[i], 0, sizeof(int));
}
// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
}
// Device result memory allocation
int *dev_result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaMalloc(&dev_result[i], sizeof(int));
cudaMemset(dev_result[i], 0, sizeof(int));
}
// Connect these pointers with object
for(int i=0; i<NUM_DEVICE; i++) {
cuda[i].arr = arr[i];
cuda[i].dev_arr = dev_arr[i];
cuda[i].result = result[i];
cuda[i].dev_result = dev_result[i];
cuda[i].num = i;
}
// Create and excute pthread
for(int i=0; i<NUM_DEVICE; i++) {
pthread_create(&pthread[i], NULL, thread_func, (void*)&cuda[i]);
}
// Join pthread
for(int i=0; i<NUM_DEVICE; i++) {
pthread_join(pthread[i], NULL);
}
for(int i=0; i<NUM_DEVICE; i++) {
printf("result[%d] = %d\n", i, (*cuda[i].result));
}
return 0;
}
I make my simple-test-program like this to test pthread with multi device cuda code.
When the NUM_DEVICE set as 1, it works well but when set as 2 program stopped.
I guess beacause multiple threads access cudaSetDevice but I don't know how to handle this.
I tried to make my program with single host thread and multi device(with Async function) before, but in my case(not above simple code), there are many host code between kernel functions so it doesn't work well asynchronously.
So I test to use multi thread on host before apply this manner to my real code but I have trouble like this.
Do I have to use asynchonous function in cuda functions and kernels?
Give me some advise.
Upvotes: 1
Views: 2876
Reputation: 6420
The problem is that you allocate memory on one device. You need to call cudaSetDevice
before cudaMalloc
calls:
// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaSetDevice(i);
cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
}
// Device result memory allocation
int *dev_result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
cudaSetDevice(i);
cudaMalloc(&dev_result[i], sizeof(int));
cudaMemset(dev_result[i], 0, sizeof(int));
}
Upvotes: 3