CUDA CPU-GPU callbacks using asynchronous memory transfer

Question

Fellow Cuda Programmers,

I'm trying to implement a cpu-gpu callback mechanism using polling mechanism. I've 2 arrays of length 1 (a and cpuflag, corresponding on device side dev_a and gpuflag)(basically 2 variables).

First CPU clears a and waits for update of gpuflag. GPU sees this clearing of a and then updates gpuflag. CPU asynchronously keeps transfering gpuflag to cpuflag and waits for update in the flag. Once CPU sees the update, it again resets a and asynchronously sends it to gpu. Again GPU sees this clearing of a and updates gpuflag and the ping-pong process continues. I want this process to continue for 100 times.

The whole code is here. You can compile it just by saying nvcc -o output filename.cu I'm not able to understand why the code is not exhibiting ping-pong behavior. Any kind of help is very much appreciated. Thanks in advance.

#include 

#define LEN 1
#define MAX 100

__global__ void myKernel(int len, int *dev_a, int *gpuflag) {
        int tid = threadIdx.x;
        gpuflag[tid] = 0;

        while(true){
        //Check if cpu has completed work
                if(dev_a[tid] == 0){
            //Do gpu work and increment flag
                        dev_a[tid] = 1;
                        gpuflag[tid]++;

            //Wait till cpu detects the flag increment and resets
                        while(true){
                                if(dev_a[tid] == 0){
                                        break;
                                }
                        }
                }
        //Max 100 ping pongs
        if(gpuflag[tid]==MAX){
            break;
        }
        }
}

int main( void ) {
        int index, *cpuflag, *gpuflag, value;

        int *a;
        int *dev_a;

        cudaStream_t stream0, stream1;

        cudaStreamCreate( &stream0 );
        cudaStreamCreate( &stream1 );

        cudaMalloc ( (void**)&gpuflag, LEN*sizeof(int) );
        cudaMemset ( gpuflag, 0, LEN*sizeof(int) );
        cudaHostAlloc( (void**)&cpuflag, LEN*sizeof(int), cudaHostAllocDefault );

        cudaMalloc ( (void**)&dev_a, LEN*sizeof(int) );
        cudaMemset ( dev_a, 0, LEN*sizeof(int) );
        cudaHostAlloc( (void**)&a, LEN*sizeof(int), cudaHostAllocDefault );

    //Reset everything
        for(int i=0; i>>(LEN, dev_a, gpuflag);

        while(true){
        //Asynchronously copy gpu flag
                cudaMemcpyAsync(cpuflag, gpuflag, LEN*sizeof(int), cudaMemcpyDeviceToHost, stream1);
        //Check if increment has happened or not
                if(cpuflag[index] == value){
            //if yes, reset 
                for(int i=0; i

CUDA CPU-GPU callbacks using asynchronous memory transfer

Answers (1)

Related Questions