Umbrella
Umbrella

Reputation: 505

cuda programming with pthread

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

#define ARR_SIZE    10
#define NUM_DEVICE  1

typedef struct {
   int *arr;
   int *dev_arr;
   int *dev_result;
   int *result;
   int num;
} cuda_st;

__global__ void kernel_fc(int *dev_arr, int *dev_result)
{
    int idx = threadIdx.x;
    printf("dev_arr[%d] = %d\n", idx, dev_arr[idx]);
    atomicAdd(dev_result, dev_arr[idx]);
}

void *thread_func(void* struc)
{
    cuda_st * data = (cuda_st*)struc;
    printf("thread %d func start\n", data->num);
    printf("arr %d = ", data->num);
    for(int i=0; i<10; i++) {
        printf("%d ", data->arr[i]);
    }
    printf("\n");
    cudaSetDevice(data->num);
    cudaMemcpy(data->dev_arr, data->arr,  sizeof(int)*ARR_SIZE, cudaMemcpyHostToDevice);
    kernel_fc<<<1,ARR_SIZE>>>(data->dev_arr, data->dev_result);
    cudaMemcpy(data->result, data->dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    printf("thread %d func exit\n", data->num);
    return NULL;
}

int main(void)
{
    // Make object
    cuda_st cuda[NUM_DEVICE];

    // Make thread
    pthread_t pthread[NUM_DEVICE];

    // Host array memory allocation
    int *arr[NUM_DEVICE];
    for(int i=0; i<NUM_DEVICE; i++) {
        arr[i] = (int*)malloc(sizeof(int)*ARR_SIZE);
    }

    // Fill this host array up with specified data
    for(int i=0; i<NUM_DEVICE; i++) {
        for(int j=0; j<ARR_SIZE; j++) {
            arr[i][j] = i*ARR_SIZE+j;
        }
    }

    // To confirm host array data
    for(int i=0; i<NUM_DEVICE; i++) {
        printf("arr[%d] = ", i);
        for(int j=0; j<ARR_SIZE; j++) {
            printf("%d ", arr[i][j]);
        }
        printf("\n");
    }

    // Result memory allocation
    int *result[NUM_DEVICE];
    for(int i=0; i<NUM_DEVICE; i++) {
        result[i] = (int*)malloc(sizeof(int));
        memset(result[i], 0, sizeof(int));
    }

    // Device array memory allocation
    int *dev_arr[NUM_DEVICE];
    for(int i=0; i<NUM_DEVICE; i++) {
        cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
    }

    // Device result memory allocation
    int *dev_result[NUM_DEVICE];
    for(int i=0; i<NUM_DEVICE; i++) {
        cudaMalloc(&dev_result[i], sizeof(int));
        cudaMemset(dev_result[i], 0, sizeof(int));
    }

    // Connect these pointers with object
    for(int i=0; i<NUM_DEVICE; i++) {
        cuda[i].arr = arr[i];
        cuda[i].dev_arr = dev_arr[i];
        cuda[i].result = result[i];
        cuda[i].dev_result = dev_result[i];
        cuda[i].num = i;
     }

    // Create and excute pthread
    for(int i=0; i<NUM_DEVICE; i++) {
        pthread_create(&pthread[i], NULL, thread_func, (void*)&cuda[i]);
    }

    // Join pthread
    for(int i=0; i<NUM_DEVICE; i++) {
        pthread_join(pthread[i], NULL);
    }

    for(int i=0; i<NUM_DEVICE; i++) {
        printf("result[%d] = %d\n", i, (*cuda[i].result));
    }

    return 0;
}

I make my simple-test-program like this to test pthread with multi device cuda code.

When the NUM_DEVICE set as 1, it works well but when set as 2 program stopped.

I guess beacause multiple threads access cudaSetDevice but I don't know how to handle this.

I tried to make my program with single host thread and multi device(with Async function) before, but in my case(not above simple code), there are many host code between kernel functions so it doesn't work well asynchronously.

So I test to use multi thread on host before apply this manner to my real code but I have trouble like this.

Do I have to use asynchonous function in cuda functions and kernels?

Give me some advise.

Upvotes: 1

Views: 2876

Answers (1)

vinograd47
vinograd47

Reputation: 6420

The problem is that you allocate memory on one device. You need to call cudaSetDevice before cudaMalloc calls:

// Device array memory allocation
int *dev_arr[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
    cudaSetDevice(i);
    cudaMalloc(&dev_arr[i], sizeof(int)*ARR_SIZE);
}

// Device result memory allocation
int *dev_result[NUM_DEVICE];
for(int i=0; i<NUM_DEVICE; i++) {
    cudaSetDevice(i);
    cudaMalloc(&dev_result[i], sizeof(int));
    cudaMemset(dev_result[i], 0, sizeof(int));
}

Upvotes: 3

Related Questions