shayito
shayito

Reputation: 11

0xC0000005: Access violation reading location 0x0000000000000018 at cudaEventDestroy()

When I am running a CUDA program, I met "0xC0000005: Access violation reading location 0x0000000000000018." at cudaEventDestroy().

It is confusing because this exception sometimes appears and sometimes not (if not, the program runs normally without any error).

start & stop is defined as cudaEvent_t start, stop; to compute time elapsed on GPU.

Exception Thrown

Sometimes the Exception looks like: (at a different location)

enter image description here

When runs normally

Here I attach the whole program code, in which 2 matrices A & B add together to get S on both GPU and CPU, and compare the results.

I am new to CUDA, I will appreciate it a lot if somebody may explain this to me.

Note: the original code is provided by book "professional CUDA C programming".

#include <cmath>
#include <iostream>
#include "common.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


void initialData(float *ip, const int size)
{
    for(int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float* ia = A;
    float* ib = B;
    float* ic = C;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }
        ia += nx;
        ib += nx;
        ic += nx;
    }
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            std::cout << "host" << hostRef[i] << "gpu" << gpuRef[i] << std::endl;
            break;
        }
    }
    if (match)
        std::cout<<"Arrays match."<<std::endl;
    else
        std::cout<<"Arrays do not match."<<std::endl;
}

// grid 2D block 2D
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;
    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}

int main(int argc, char **argv)
{
    clock_t startc, stopc;
    double cpu_time_used;
    cudaEvent_t start, stop;
    float gpu_time_used = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // set up device
    int dev = 0;//
    cudaDeviceProp deviceProp;//
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));//
    std::cout << "Using Device " << dev << " " << deviceProp.name << std::endl;// 0 NVIDIA GeForce RTX 3060 Laptop GPU
    CHECK(cudaSetDevice(dev));//
    cudaSetDevice(dev);//

    // set up data size of matrix
    int nx = 1 << 14;// 16384
    int ny = 1 << 14;//
    int nxy = nx * ny;//
    int nBytes = nxy * sizeof(float);//
    std::cout << "Matrix size: nx=" << nx << " ny=" << ny << std::endl;

    // malloc host memory
    startc = clock();
    cudaEventRecord(start, 0);
    float *h_A, *h_B, *h_S, *d_S;//
    h_A = (float *)malloc(nBytes);//
    h_B = (float *)malloc(nBytes);//
    h_S = (float *)malloc(nBytes);//
    d_S = (float *)malloc(nBytes);//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "malloc() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "malloc() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    // initialize matrix at host side
    startc = clock();
    cudaEventRecord(start, 0);
    initialData(h_A, nxy);//
    initialData(h_B, nxy);//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "matrix initialization elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "matrix initialization elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    startc = clock();
    cudaEventRecord(start, 0);
    memset(h_S, 0, nBytes);//
    memset(d_S, 0, nBytes);//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "memset() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "memset() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    // add matrix at host side for result checks
    startc = clock();
    cudaEventRecord(start, 0);
    sumMatrixOnHost(h_A, h_B, h_S, nx, ny);//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "sumMatrixOnHost() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "sumMatrixOnHost() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    startc = clock();
    cudaEventRecord(start, 0);
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));//
    CHECK(cudaMalloc((void **)&d_MatB, nBytes));//
    CHECK(cudaMalloc((void **)&d_MatC, nBytes));//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "cudaMalloc() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "cudaMalloc() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    // transfer data from host to device
    startc = clock();
    cudaEventRecord(start, 0);
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));//
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "cudaMemcpyHostToDevice elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "cudaMemcpyHostToDevice elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
    CHECK(cudaGetLastError());//

    // invoke kernel at host side
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    startc = clock();
    cudaEventRecord(start, 0);
    sumMatrixOnGPU2D << <grid, block >> > (d_MatA, d_MatB, d_MatC, nx, ny);//
    CHECK(cudaDeviceSynchronize());// shoule be called after kernel called?
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "sumMatrixOnGPU2D<<<(" << grid.x << grid.y << "), (" << block.x << block.y << ")>>> elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
    std::cout << "sumMatrixOnGPU2D<<<(" << grid.x << grid.y << "), (" << block.x << block.y << ")>>> elapsed " << cpu_time_used << " sec on CPU" << std::endl;

    // check kernel error
    startc = clock();
    cudaEventRecord(start, 0);
    CHECK(cudaGetLastError());//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "cudaGetLastError() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "cudaGetLastError() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    // copy kernel result back to host side
    startc = clock();
    cudaEventRecord(start, 0);
    CHECK(cudaMemcpy(d_S, d_MatC, nBytes, cudaMemcpyDeviceToHost));//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "cudaMemcpyDeviceToHost elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "cudaMemcpyDeviceToHost elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    // check device results
    startc = clock();
    cudaEventRecord(start, 0);
    checkResult(h_S, d_S, nxy);//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "check results elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "check results elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    // free device global memory
    startc = clock();
    cudaEventRecord(start, 0);
    CHECK(cudaFree(d_MatA));//
    CHECK(cudaFree(d_MatB));//
    CHECK(cudaFree(d_MatC));//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "cudaFree() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "cudaFree() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    // free host memory
    startc = clock();
    cudaEventRecord(start, 0);
    free(h_A);//
    free(h_B);//
    free(h_S);//
    free(d_S);//
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
    std::cout << "free() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
    std::cout << "free() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;

    // reset device
    startc = clock();
    CHECK(cudaDeviceReset());// cuda reset
    stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
    std::cout << "cudaDeviceReset() elapsed " << cpu_time_used << " sec on CPU" << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;//
}

And the "common.h" simply contains the #define of CHECK as below.

#define CHECK(call)                                                                         \
{                                                                                           \
    const cudaError_t error = call;                                                         \
    if (error != cudaSuccess)                                                               \
    {                                                                                       \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                              \
        fprintf(stderr, "code: %d, reason: %s\n", error,                                    \
            cudaGetErrorString(error));                                                     \
        exit(1);                                                                            \
    }                                                                                       \
}                                                                                           \

Upvotes: 0

Views: 592

Answers (1)

talonmies
talonmies

Reputation: 72342

This sequence of calls:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// ...

CHECK(cudaDeviceReset());

// ...

cudaEventDestroy(start);
cudaEventDestroy(stop);

is the problem. cudaDeviceReset() destroys the current runtime API context, and when you do that, all the resources which were created within that context are destroyed or invalidated. That includes the events you created. It is incorrect to try and use those events (even destroying them) after the context is destroyed.

The reason why it sometimes works is probably because some of the internal context destruction process is done by worker threads asynchronously after the cudaDeviceReset call. Depending on how fast the context destruction is, the event destruction might accidentally succeed because the internal resources they have used have not yet been released. But that should be regarded as the exception rather than the rule.

Upvotes: 1

Related Questions