Reputation: 63
Fellow CUDA programmers,
I'm trying to implement a CPU-GPU callback mechanism based on polling. I have two arrays of length 1 (a and cpuflag on the host, corresponding to dev_a and gpuflag on the device), so effectively two variables.
First the CPU clears a and waits for an update of gpuflag. The GPU sees the cleared a and then increments gpuflag. The CPU asynchronously keeps transferring gpuflag into cpuflag and waits for the flag to change. Once the CPU sees the update, it resets a again and asynchronously sends it to the GPU. The GPU again sees the cleared a, increments gpuflag, and the ping-pong continues. I want this process to repeat 100 times.
The whole code is below. You can compile it with nvcc -o output filename.cu. I'm not able to understand why the code is not exhibiting the ping-pong behavior. Any kind of help is very much appreciated. Thanks in advance.
#include <stdio.h>

#define LEN 1
#define MAX 100

__global__ void myKernel(int len, int *dev_a, int *gpuflag) {
    int tid = threadIdx.x;
    gpuflag[tid] = 0;
    while(true){
        //Check if cpu has completed work
        if(dev_a[tid] == 0){
            //Do gpu work and increment flag
            dev_a[tid] = 1;
            gpuflag[tid]++;
            //Wait till cpu detects the flag increment and resets
            while(true){
                if(dev_a[tid] == 0){
                    break;
                }
            }
        }
        //Max 100 ping pongs
        if(gpuflag[tid]==MAX){
            break;
        }
    }
}
int main( void ) {
    int index, *cpuflag, *gpuflag, value;
    int *a;
    int *dev_a;

    cudaStream_t stream0, stream1;
    cudaStreamCreate( &stream0 );
    cudaStreamCreate( &stream1 );

    cudaMalloc ( (void**)&gpuflag, LEN*sizeof(int) );
    cudaMemset ( gpuflag, 0, LEN*sizeof(int) );
    cudaHostAlloc( (void**)&cpuflag, LEN*sizeof(int), cudaHostAllocDefault );

    cudaMalloc ( (void**)&dev_a, LEN*sizeof(int) );
    cudaMemset ( dev_a, 0, LEN*sizeof(int) );
    cudaHostAlloc( (void**)&a, LEN*sizeof(int), cudaHostAllocDefault );

    //Reset everything
    for(int i=0; i<LEN; i++)
        a[i] = 0;

    //Auxiliary variables
    index = 0;
    value = 1;

    //call kernel
    myKernel<<<1,1,0,stream0>>>(LEN, dev_a, gpuflag);

    while(true){
        //Asynchronously copy gpu flag
        cudaMemcpyAsync(cpuflag, gpuflag, LEN*sizeof(int), cudaMemcpyDeviceToHost, stream1);
        //Check if increment has happened or not
        if(cpuflag[index] == value){
            //if yes, reset
            for(int i=0; i<LEN; i++)
                a[i] = 0;
            //transfer asynchronously
            cudaMemcpyAsync(dev_a, a, LEN*sizeof(int), cudaMemcpyHostToDevice, stream1);
            //increment pattern
            value++;
            printf("GPU updated once. Value is a[%d] = %d, cpuflag = %d\n", index, a[index], cpuflag[index]);
        } else {
            printf("------------GPU didn't update. Value is a[%d] = %d, cpuflag = %d\n", index, a[index], cpuflag[index]);
        }
        //Max 100 ping-pongs
        if(value == MAX){
            break;
        }
    }

    cudaFreeHost(a);
    cudaFreeHost(cpuflag);
    cudaFree(dev_a);
    cudaFree(gpuflag);
    cudaStreamDestroy( stream0 );
    cudaStreamDestroy( stream1 );
    return 0;
}
Upvotes: 0
Views: 1178
Reputation: 152269
Probably the main thing missing is appropriate use of volatile.
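To illustrate the point on the question's own kernel (just a sketch of the signature change, not a complete fix): without volatile, the compiler is free to keep dev_a[tid] in a register, so the spin loop may never observe the host's asynchronous update. Qualifying the polled pointer volatile forces a load from global memory on every read:

// Sketch: the question's kernel with the polled pointer qualified volatile.
__global__ void myKernel(int len, volatile int *dev_a, int *gpuflag) {
    int tid = threadIdx.x;
    gpuflag[tid] = 0;
    while (gpuflag[tid] < MAX) {
        if (dev_a[tid] == 0) {   // re-read from global memory each iteration
            dev_a[tid] = 1;      // signal the host
            gpuflag[tid]++;      // count one completed ping-pong
        }
    }
}

The full example below goes one step further and replaces the device-to-host flag copies with a mapped (zero-copy) host allocation, so both sides poll the same volatile location.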
Here's a simplified, fully worked example:
$ cat t763.cu
#include <stdio.h>

#define LEN 1
#define MAX 100
#define DLEN 1000
#define nTPB 256

#ifdef CDP_WORKER
__global__ void cdp_worker(int len, float *data){
    int tid = threadIdx.x+blockDim.x*blockIdx.x;
    if (tid < len) data[tid]++; // simple increment
}
#endif

// only call this kernel with 1 thread
__global__ void myKernel(int len, int dlen, volatile int *dev_a, int *gpuflag, float *data) {
    int tid = threadIdx.x+blockDim.x*blockIdx.x;
    while(gpuflag[tid] < MAX){
        //Check if cpu has completed work
        if(dev_a[tid] == 0){
            //Do gpu work and increment flag
#ifdef CDP_WORKER
            cdp_worker<<<(dlen+nTPB-1)/nTPB, nTPB>>>(dlen, data);
            cudaDeviceSynchronize();
#endif
            dev_a[tid] = 1;
            gpuflag[tid]++;
        }
    }
}

void issue_work(int value, float *h_data, float *d_data, int len, cudaStream_t mystream){
#ifdef CDP_WORKER
    cudaMemcpyAsync(h_data, d_data, len*sizeof(float), cudaMemcpyDeviceToHost, mystream);
    cudaStreamSynchronize(mystream);
    for (int i = 0; i < len; i++) if (h_data[i] != value+1) {printf("fault - was %f, should be %f\n", h_data[i], (float)(value+1)); break;}
    cudaMemcpyAsync(d_data, h_data, len*sizeof(float), cudaMemcpyHostToDevice, mystream); // technically not really necessary
    cudaStreamSynchronize(mystream);
#endif
    return;
}

int main( void ) {
    int *gpuflag, value;
    float *h_data, *d_data;
    cudaHostAlloc(&h_data, DLEN*sizeof(float), cudaHostAllocDefault);
    cudaMalloc(&d_data, DLEN*sizeof(float));
    volatile int *z_a;
    cudaStream_t stream0, stream1;
    cudaStreamCreate( &stream0 );
    cudaStreamCreate( &stream1 );
    cudaMalloc ( (void**)&gpuflag, LEN*sizeof(int) );
    cudaMemset ( gpuflag, 0, LEN*sizeof(int) );
    cudaMemset ( d_data, 0, DLEN*sizeof(float));
    cudaHostAlloc( (void**)&z_a, LEN*sizeof(int), cudaHostAllocMapped );
    for (int i = 0; i < LEN; i++) z_a[i] = 0;
    value = 0;
    //call kernel
    myKernel<<<1,1,0,stream0>>>(LEN, DLEN, z_a, gpuflag, d_data);
    while(value<MAX){
        if (z_a[0] == 1) {
            issue_work(value, h_data, d_data, DLEN, stream1);
            z_a[0] = 0;
            printf("%d", value%10);
            value++;
        }
    }
    printf("\n");
    return 0;
}
$ nvcc -o t763 t763.cu
$ cuda-memcheck ./t763
========= CUDA-MEMCHECK
0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
========= ERROR SUMMARY: 0 errors
$ nvcc -DCDP_WORKER -arch=sm_35 -rdc=true t763.cu -o t763 -lcudadevrt
$ cuda-memcheck ./t763
========= CUDA-MEMCHECK
0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
========= ERROR SUMMARY: 0 errors
$
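A side note on the zero-copy flag (z_a): passing the pinned host pointer straight to the kernel relies on the device being able to map host memory (and, in this direct-pointer form, on UVA). If there's any doubt, a defensive check along these lines could run before the allocations; checkMapped is a hypothetical helper, not part of the code above, and is only a sketch:

#include <cstdio>

// Hypothetical helper: verify the device can map host memory before
// relying on a cudaHostAllocMapped pointer inside a kernel.
bool checkMapped(int dev = 0)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    if (!prop.canMapHostMemory) {
        printf("Device %d cannot map host memory; a zero-copy flag won't work.\n", dev);
        return false;
    }
    // On non-UVA platforms, request mapping explicitly before any allocation,
    // and obtain the device-side alias of the host pointer with
    // cudaHostGetDevicePointer() instead of passing the host pointer directly.
    cudaSetDeviceFlags(cudaDeviceMapHost);
    return true;
}

On a 64-bit platform with UVA this requirement is normally satisfied automatically, which is why the example above skips the check.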
Extending this to work on multiple threads in the same warp is not a trivial matter.
However, I've extended the basic example to demonstrate, on a cc3.5+ device, that the parent kernel can be the supervisory kernel, and it can launch work via child kernels. This is accomplished by compiling with the CDP_WORKER switch and the additional switches needed for CUDA Dynamic Parallelism, and by running on a cc3.5+ device.
Upvotes: 2