interestedparty333

Reputation: 2536

How to recycle/reuse CUDA threads

In CUDA, how does one create a barrier for all threads in a kernel to wait on, until the CPU sends a signal to that barrier that it's safe/helpful to proceed?

I would like to avoid the overhead of launching a CUDA kernel. There are two kinds of overhead I want to avoid: (1) the cost of simply launching the kernel on X blocks and Y threads, and (2) the time it takes to re-initialize my __shared__ memory, which will have largely the same contents between invocations.
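
For context, the pattern I'm trying to avoid looks roughly like this (a minimal sketch; BATCH_COUNT, prepare_input and process_batch are made-up names for illustration only):

// Hypothetical baseline: one kernel launch per piece of work.
// Each iteration pays the launch overhead (1), and the kernel must
// re-initialize its __shared__ memory from scratch (2).
for (int batch = 0; batch < BATCH_COUNT; ++batch) {
    prepare_input(data, batch);                                  // host writes the next inputs
    process_batch<<<BLOCK_COUNT, THREADS_PER_BLOCK>>>(data);     // fresh launch, fresh __shared__ state
    cudaDeviceSynchronize();                                     // wait for results before the next batch
}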

We recycle/re-use threads all the time in CPU workloads, and CUDA even provides event synchronization primitives, so perhaps it would cost little in hardware to provide a more traditional signaling object.
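
For reference, those event primitives are normally driven from the host side, roughly like this (a minimal sketch; producer_kernel, consumer_kernel, stream_a, stream_b, grid, block and buf are made-up names). The waiting happens on the CPU or between streams -- not inside an already-running kernel, which is what I actually want:

cudaEvent_t ev;
cudaEventCreate(&ev);

producer_kernel<<<grid, block, 0, stream_a>>>(buf);  // some work in stream_a
cudaEventRecord(ev, stream_a);                       // mark a completion point in stream_a

cudaStreamWaitEvent(stream_b, ev, 0);                // later work in stream_b waits for that point
consumer_kernel<<<grid, block, 0, stream_b>>>(buf);

cudaEventSynchronize(ev);                            // or block the CPU until the event has fired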

Here's some code with a placeholder for the concept that I seek; the reader will probably want to search for QUESTION IS HERE. Building it in Nsight requires setting the Device Linker Mode to Separate Compilation (at least, I found that necessary).

#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

#include <cuda_runtime_api.h>
#include <cuda.h>

static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

const int COUNT_DOWN_ITERATIONS = 1000;
const int KERNEL_MAXIMUM_LOOPS = 5; // IRL, we'd set this large enough to prevent hitting this value, unless the kernel is externally terminated
const int SIGNALS_TO_SEND_COUNT = 3;
const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 2;

__device__ void count_down(int * shared_location_to_ensure_side_effect) {
    int x = *shared_location_to_ensure_side_effect;
    for (int i = 0; i < COUNT_DOWN_ITERATIONS; ++i) {
        x += i;
    }
    *shared_location_to_ensure_side_effect = x;
}

/**
 * CUDA kernel waits for events and then counts down upon receiving them.
 */
__global__ void kernel(cudaStream_t stream, cudaEvent_t go_event, cudaEvent_t done_event, int ** cuda_malloc_managed_int_address) {
    __shared__ int local_copy_of_cuda_malloc_managed_int_address; // we always start at 0

    printf("Block %i, Thread %i: entered kernel\n", blockIdx.x, threadIdx.x);
    for (int i = 0; i < KERNEL_MAXIMUM_LOOPS; ++i) {
        printf("Block %i, Thread %i: entered loop; waitin 4 go_event\n", blockIdx.x, threadIdx.x);

        // QUESTION IS HERE: I want this to block on receiving a signal from the
        // CPU, indicating that work is ready to be done
        cudaStreamWaitEvent(stream, go_event, cudaEventBlockingSync);

        printf("Block %i, Thread %i:      in loop; received go_event\n", blockIdx.x, threadIdx.x);
        if (i == 0) { // we have received the signal and data is ready to be interpreted
            local_copy_of_cuda_malloc_managed_int_address = cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x];
        }
        count_down(&local_copy_of_cuda_malloc_managed_int_address);
        printf("Block %i, Thread %i:      finished counting\n", blockIdx.x, threadIdx.x);
        cudaEventRecord(done_event, stream);
        printf("Block %i, Thread %i:      recorded event; may loop back\n", blockIdx.x, threadIdx.x);
    }
    printf("Block %i, Thread %i: copying result %i back to managed memory\n", blockIdx.x, threadIdx.x, local_copy_of_cuda_malloc_managed_int_address);
    cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x] = local_copy_of_cuda_malloc_managed_int_address;
    printf("Block %i, Thread %i: exiting kernel\n", blockIdx.x, threadIdx.x);
}


int main(void)
{

    int ** data;
    cudaMallocManaged(&data, BLOCK_COUNT * sizeof(int *));
    for (int b = 0; b < BLOCK_COUNT; ++b)
        cudaMallocManaged(&(data[b]), THREADS_PER_BLOCK * sizeof(int));

    cudaEvent_t go_event;
    cudaEventCreateWithFlags(&go_event, cudaEventBlockingSync);

    cudaEvent_t done_event;
    cudaEventCreateWithFlags(&done_event, cudaEventBlockingSync);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    CUDA_CHECK_RETURN(cudaDeviceSynchronize());  // probably unnecessary

    printf("CPU: spawning kernel\n");
    kernel<<<BLOCK_COUNT, THREADS_PER_BLOCK, sizeof(int), stream>>>(stream, go_event, done_event, data);


    for (int i = 0; i < SIGNALS_TO_SEND_COUNT; ++i) {
        usleep(4 * 1000 * 1000); // 4 seconds (usleep takes microseconds)

        // Simulate the sending of the "next" piece of work
        data[0][0] = i;      // unrolled, because it's easier to read
        data[0][1] = i + 1;  // unrolled, because it's easier to read

        printf("CPU: sending go_event\n");
        cudaEventRecord(go_event, stream);
        cudaStreamWaitEvent(stream, done_event, cudaEventBlockingSync); // doesn't block even though I wish it would
    }

    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
    for (int b = 0; b < BLOCK_COUNT; ++b) {
        for (int t = 0; t < THREADS_PER_BLOCK; ++t) {
            printf("Result for Block %i and Thread %i: %i\n", b, t, data[b][t]);
        }
    }

    for (int b = 0; b < BLOCK_COUNT; ++b)
        cudaFree(data[b]);
    cudaFree(data);

    cudaEventDestroy(done_event);
    cudaEventDestroy(go_event);
    cudaStreamDestroy(stream);

    printf("CPU: exiting program");

    return 0;
}

/**
 * Check the return value of the CUDA runtime API call and exit
 * the application if the call has failed.
 */
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
{
    if (err == cudaSuccess)
        return;
    std::cerr << statement << " returned " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
    exit (1);
}

And here is the output from running it. Note that the results are "wrong" simply because the kernel never actually waited: it ran to completion, and its results were then over-written by the CPU loop whose signal was supposed to be the blocking mechanism for the GPU threads.

CPU: spawning kernel
Block 0, Thread 0: entered kernel
Block 0, Thread 1: entered kernel
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: copying result 2497500 back to managed memory
Block 0, Thread 1: copying result 2497500 back to managed memory
Block 0, Thread 0: exiting kernel
Block 0, Thread 1: exiting kernel
CPU: sending go_event
CPU: sending go_event
CPU: sending go_event
Result for Block 0 and Thread 0: 2
Result for Block 0 and Thread 1: 3
CPU: exiting program

Upvotes: 2

Views: 1095

Answers (2)

interestedparty333

Reputation: 2536

READ THE OTHER ANSWER FIRST. This answer is only still here for historical reference. I'll either downvote it or delete it soon.

One possible implementation is to keep a set of flags, or an integer, in device memory. The CUDA threads would block (perhaps by busy-waiting with clock64()) until the flag/integer reaches a certain value, indicating that there is more work for the CUDA threads to process. This would probably be slower than a first-class CUDA-provided synchronization primitive, but faster than re-initializing my __shared__ memory on every kernel invocation. It also involves some sort of busy-wait/sleep mechanism, which I'm not thrilled about.

Follow-up: It appears to be working -- some of the time (printf calls seem to help). I'm guessing there's some undefined behavior in managed memory that's benefiting me. Here's the code:

#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

#include <cuda_runtime_api.h>
#include <cuda.h>

static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

const int COUNT_DOWN_ITERATIONS = 1000;
const int KERNEL_MAXIMUM_LOOPS = 5; // IRL, we'd set this large enough to prevent hitting this value, unless the kernel is externally terminated
const int SIGNALS_TO_SEND_COUNT = 3;
const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 2;

__device__ void count_down(int * shared_location_to_ensure_side_effect) {
    int x = *shared_location_to_ensure_side_effect;
    for (int i = 0; i < COUNT_DOWN_ITERATIONS; ++i) {
        x += i;
    }
    *shared_location_to_ensure_side_effect = x;
}


__device__ void clock_block(clock_t clock_count)
{
    //printf("time used so far: %lu\n", clock64());
    clock_t start_clock = clock64();
    while (clock64() - start_clock < clock_count);
}

/**
 * CUDA kernel waits for flag to increment and then counts down.
 */
__global__ void kernel_block_via_flag(cudaStream_t stream, cudaEvent_t go_event, cudaEvent_t done_event, int ** cuda_malloc_managed_int_address, int * cuda_malloc_managed_synchronization_flag) {
    __shared__ int local_copy_of_cuda_malloc_managed_int_address; // we always start at 0

    printf("Block %i, Thread %i: entered kernel\n", blockIdx.x, threadIdx.x);
    for (int i = 0; i < KERNEL_MAXIMUM_LOOPS; ++i) {
        printf("Block %i, Thread %i: entered loop; waitin 4 go_event\n", blockIdx.x, threadIdx.x);
        while (*cuda_malloc_managed_synchronization_flag <= i)
            //printf("%lu\n", *cuda_malloc_managed_synchronization_flag);
            clock_block(1000000000); // in cycles, not seconds!

        cudaStreamWaitEvent(stream, go_event, cudaEventBlockingSync);
        printf("Block %i, Thread %i:      in loop; received go_event\n", blockIdx.x, threadIdx.x);

        if (i == 0) { // we have received the signal and data is ready to be interpreted
            local_copy_of_cuda_malloc_managed_int_address = cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x];
        }
        count_down(&local_copy_of_cuda_malloc_managed_int_address);
        printf("Block %i, Thread %i:      finished counting\n", blockIdx.x, threadIdx.x);
        cudaEventRecord(done_event, stream);
        printf("Block %i, Thread %i:      recorded event; may loop back\n", blockIdx.x, threadIdx.x);
    }
    printf("Block %i, Thread %i: copying result %i back to managed memory\n", blockIdx.x, threadIdx.x, local_copy_of_cuda_malloc_managed_int_address);
    cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x] = local_copy_of_cuda_malloc_managed_int_address;
    printf("Block %i, Thread %i: exiting kernel\n", blockIdx.x, threadIdx.x);
}


int main(void)
{

    int ** data;
    cudaMallocManaged(&data, BLOCK_COUNT * sizeof(int *));
    for (int b = 0; b < BLOCK_COUNT; ++b)
        cudaMallocManaged(&(data[b]), THREADS_PER_BLOCK * sizeof(int));

    cudaEvent_t go_event;
    cudaEventCreateWithFlags(&go_event, cudaEventBlockingSync);

    cudaEvent_t done_event;
    cudaEventCreateWithFlags(&done_event, cudaEventBlockingSync);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    int * synchronization_flag;
    cudaMallocManaged(&synchronization_flag, sizeof(int));
    //cudaMalloc(&synchronization_flag, sizeof(int));
    //int my_copy_of_synchronization_flag = 0;

    CUDA_CHECK_RETURN(cudaDeviceSynchronize());  // probably unnecessary

    printf("CPU: spawning kernel\n");
    kernel_block_via_flag<<<BLOCK_COUNT, THREADS_PER_BLOCK, sizeof(int), stream>>>(stream, go_event, done_event, data, synchronization_flag);
    CUDA_CHECK_RETURN(cudaMemAdvise(synchronization_flag, sizeof(int), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));

    for (int i = 0; i < SIGNALS_TO_SEND_COUNT; ++i) {
        usleep(4 * 1000 * 1000); // 4 seconds (usleep takes microseconds)

        // Simulate the sending of the "next" piece of work
        data[0][0] = i;      // unrolled, because it's easier to read
        data[0][1] = i + 1;  // unrolled, because it's easier to read

        printf("CPU: sending go_event\n");
        //++my_copy_of_synchronization_flag;
        //CUDA_CHECK_RETURN(cudaMemcpyAsync(synchronization_flag, &my_copy_of_synchronization_flag, sizeof(int), cudaMemcpyHostToDevice));
        *synchronization_flag = *synchronization_flag + 1; // since it's monotonically increasing, and only written to by the CPU code, this is fine
    }

    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
    for (int b = 0; b < BLOCK_COUNT; ++b) {
        for (int t = 0; t < THREADS_PER_BLOCK; ++t) {
            printf("Result for Block %i and Thread %i: %i\n", b, t, data[b][t]);
        }
    }

    for (int b = 0; b < BLOCK_COUNT; ++b)
        cudaFree(data[b]);
    cudaFree(data);
    cudaFree(synchronization_flag);

    cudaEventDestroy(done_event);
    cudaEventDestroy(go_event);
    cudaStreamDestroy(stream);

    printf("CPU: exiting program");

    return 0;
}

/**
 * Check the return value of the CUDA runtime API call and exit
 * the application if the call has failed.
 */
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
{
    if (err == cudaSuccess)
        return;
    std::cerr << statement << " returned " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
    exit (1);
}

And the output:

CPU: spawning kernel
Block 0, Thread 0: entered kernel
Block 0, Thread 1: entered kernel
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
CPU: sending go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
CPU: sending go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
CPU: sending go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event

This is still a bad solution. I hope to accept someone else's answer.

Upvotes: 0

interestedparty333

Reputation: 2536

Read this answer. I plan to delete the first one after reaching a consensus, as I hope its only value will be historical.

One possible implementation is to keep a set of flags, or an integer, in device memory. The CUDA threads would block (e.g., by busy-waiting with clock64()) until the flag/integer reaches a certain value, indicating that there is more work for the CUDA threads to process. This would probably be slower than a first-class CUDA-provided synchronization primitive, but faster than re-initializing my shared memory on every kernel invocation. It also involves some sort of busy-wait/sleep mechanism, which I'm not thrilled about.

Here's an implementation that appears to be working -- though I'm concerned that I'm relying on some undefined behavior of managed memory that happens to benefit the program's execution:

#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

#include <cuda_runtime_api.h>
#include <cuda.h>

#include <chrono>
#include <thread>

static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

const int COUNT_DOWN_ITERATIONS = 1000;
const int KERNEL_MAXIMUM_LOOPS = 1000; // IRL, we'd set this large enough to prevent hitting this value, unless the kernel is externally terminated
const int SIGNALS_TO_SEND_COUNT = 1000;
const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 2;

__device__ void count_down(int * shared_location_to_ensure_side_effect) {
    int x = *shared_location_to_ensure_side_effect;
    for (int i = 0; i < COUNT_DOWN_ITERATIONS; ++i) {
        x += i;
    }
    *shared_location_to_ensure_side_effect = x;
}


__device__ void clock_block(clock_t clock_count)
{
    clock_t start_clock = clock64();
    while (clock64() - start_clock < clock_count);
}

/**
 * CUDA kernel waits for flag to increment and then counts down.
 */
__global__ void spawn_worker_threads(int ** cuda_malloc_managed_int_address, int * cuda_malloc_managed_go_flag, int * cuda_malloc_managed_done_flag) {
    __shared__ int local_copy_of_cuda_malloc_managed_int_address; // we always start at 0

    volatile int * my_go_flag = cuda_malloc_managed_go_flag;
    volatile int * volatile_done_flag = cuda_malloc_managed_done_flag;

    printf("Block %i, Thread %i: entered kernel\n", blockIdx.x, threadIdx.x);
    for (int i = 0; i < KERNEL_MAXIMUM_LOOPS; ++i) {
        while (*my_go_flag <= i) {
            clock_block(10000); // in cycles, not seconds!
        }

        if (i == 0) { // we have received the signal and data is ready to be interpreted
            local_copy_of_cuda_malloc_managed_int_address = cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x];
        }
        count_down(&local_copy_of_cuda_malloc_managed_int_address);

        // Wait for all worker threads to finish and then signal readiness for new work
        __syncthreads(); // TODO: sync with other blocks too

        if (blockIdx.x == 0 && threadIdx.x == 0)
            *volatile_done_flag  = *volatile_done_flag + 1;
        //__threadfence_system(); // based on the documentation, it's not clear that this should actually help
    }
    printf("Block %i, Thread %i: copying result %i back to managed memory\n", blockIdx.x, threadIdx.x, local_copy_of_cuda_malloc_managed_int_address);
    cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x] = local_copy_of_cuda_malloc_managed_int_address;
    printf("Block %i, Thread %i: exiting kernel\n", blockIdx.x, threadIdx.x);
}


int main(void)
{

    int ** data;
    cudaMallocManaged(&data, BLOCK_COUNT * sizeof(int *));
    for (int b = 0; b < BLOCK_COUNT; ++b)
        cudaMallocManaged(&(data[b]), THREADS_PER_BLOCK * sizeof(int));

    int * go_flag;
    int * done_flag;
    cudaMallocManaged(&go_flag, sizeof(int));
    cudaMallocManaged(&done_flag, sizeof(int));

    volatile int * my_volatile_done_flag = done_flag;

    printf("CPU: spawning kernel\n");
    spawn_worker_threads<<<BLOCK_COUNT, THREADS_PER_BLOCK>>>(data, go_flag, done_flag);

    // The cudaMemAdvise calls seem to be unnecessary, but they make it ~13% faster
    CUDA_CHECK_RETURN(cudaMemAdvise(go_flag, sizeof(int), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CUDA_CHECK_RETURN(cudaMemAdvise(done_flag, sizeof(int), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));


    for (int i = 0; i < SIGNALS_TO_SEND_COUNT; ++i) {
        if (i % 50 == 0) printf("============== CPU: On iteration %i ============\n", i);

        // Simulate the writing of the "next" piece of work
        data[0][0] = i;      // unrolled, because it's easier to read this way
        data[0][1] = i + 1;  // unrolled, because it's easier to read

        *go_flag = *go_flag + 1; // since it's monotonically increasing, and only written to by the CPU code, this is fine

        while (*my_volatile_done_flag < i)
            std::this_thread::sleep_for(std::chrono::microseconds(50));
    }
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    for (int b = 0; b < BLOCK_COUNT; ++b)
        for (int t = 0; t < THREADS_PER_BLOCK; ++t)
            printf("Result for Block %i and Thread %i: %i\n", b, t, data[b][t]);

    for (int b = 0; b < BLOCK_COUNT; ++b)
        cudaFree(data[b]);
    cudaFree(data);
    cudaFree(go_flag);
    cudaFree(done_flag);

    printf("CPU: exiting program");

    return 0;
}

/**
 * Check the return value of the CUDA runtime API call and exit
 * the application if the call has failed.
 */
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
{
    if (err == cudaSuccess)
        return;
    std::cerr << statement << " returned " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
    exit (1);
}

And here's the output, which took about 50 ms to generate. That's about 50 microseconds per "recycling", which is well within my real application's tolerance.

Starting timer for Synchronization timer
CPU: spawning kernel
============== CPU: On iteration 0 ============
============== CPU: On iteration 50 ============
============== CPU: On iteration 100 ============
============== CPU: On iteration 150 ============
============== CPU: On iteration 200 ============
============== CPU: On iteration 250 ============
============== CPU: On iteration 300 ============
============== CPU: On iteration 350 ============
============== CPU: On iteration 400 ============
============== CPU: On iteration 450 ============
============== CPU: On iteration 500 ============
============== CPU: On iteration 550 ============
============== CPU: On iteration 600 ============
============== CPU: On iteration 650 ============
============== CPU: On iteration 700 ============
============== CPU: On iteration 750 ============
============== CPU: On iteration 800 ============
============== CPU: On iteration 850 ============
============== CPU: On iteration 900 ============
============== CPU: On iteration 950 ============
Block 0, Thread 0: entered kernel
Block 0, Thread 1: entered kernel
Block 0, Thread 0: copying result 499500001 back to managed memory
Block 0, Thread 1: copying result 499500001 back to managed memory
Block 0, Thread 0: exiting kernel
Block 0, Thread 1: exiting kernel
Result for Block 0 and Thread 0: 499500001
Result for Block 0 and Thread 1: 499500001
CPU: exiting program

Thanks to @einpoklum and @robertcrovella for suggesting the use of volatile. It seems to be working, but I'm inexperienced with volatile. Based on what I've read, this is a valid and correct usage that should result in defined behavior. Would y'all mind please confirming or correcting this conclusion?
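
For anyone weighing in on that question, here's the handshake from the code above distilled to its core (a minimal sketch; wait_for_go and send_work are just names for this sketch, it reuses clock_block from the listing, and whether volatile alone provides enough ordering is exactly what I'm unsure about):

// Device side: re-read the managed flag through a volatile pointer until the
// host has published work item i, backing off a little between polls.
__device__ void wait_for_go(volatile int * go_flag, int i) {
    while (*go_flag <= i)
        clock_block(10000); // in cycles, not seconds
}

// Host side: write the inputs first, then bump the flag. The flag is monotonically
// increasing and only ever written by the CPU, which is why a plain increment is intended to be safe.
void send_work(int ** data, int * go_flag, int i) {
    data[0][0] = i;
    data[0][1] = i + 1;
    *go_flag = *go_flag + 1; // the device polls this through its volatile pointer
}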

Upvotes: 0
