Reputation: 11
When I am running a CUDA program, I met "0xC0000005: Access violation reading location 0x0000000000000018." at cudaEventDestroy().
It is confusing because this exception sometimes appears and sometimes not (if not, the program runs normally without any error).
start
& stop
is defined as cudaEvent_t start, stop;
to compute time elapsed on GPU.
Sometimes the Exception looks like: (at a different location)
Here I attach the whole program code, in which 2 matrices A & B add together to get S on both GPU and CPU, and compare the results.
I am new to CUDA, I will appreciate it a lot if somebody may explain this to me.
Note: the original code is provided by book "professional CUDA C programming".
#include <cmath>
#include <iostream>
#include "common.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
void initialData(float *ip, const int size)
{
for(int i = 0; i < size; i++)
{
ip[i] = (float)(rand() & 0xFF) / 10.0f;
}
}
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
float* ia = A;
float* ib = B;
float* ic = C;
for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
{
ic[ix] = ia[ix] + ib[ix];
}
ia += nx;
ib += nx;
ic += nx;
}
}
void checkResult(float *hostRef, float *gpuRef, const int N)
{
double epsilon = 1.0E-8;
bool match = 1;
for (int i = 0; i < N; i++)
{
if (abs(hostRef[i] - gpuRef[i]) > epsilon)
{
match = 0;
std::cout << "host" << hostRef[i] << "gpu" << gpuRef[i] << std::endl;
break;
}
}
if (match)
std::cout<<"Arrays match."<<std::endl;
else
std::cout<<"Arrays do not match."<<std::endl;
}
// grid 2D block 2D
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny)
MatC[idx] = MatA[idx] + MatB[idx];
}
int main(int argc, char **argv)
{
clock_t startc, stopc;
double cpu_time_used;
cudaEvent_t start, stop;
float gpu_time_used = 0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// set up device
int dev = 0;//
cudaDeviceProp deviceProp;//
CHECK(cudaGetDeviceProperties(&deviceProp, dev));//
std::cout << "Using Device " << dev << " " << deviceProp.name << std::endl;// 0 NVIDIA GeForce RTX 3060 Laptop GPU
CHECK(cudaSetDevice(dev));//
cudaSetDevice(dev);//
// set up data size of matrix
int nx = 1 << 14;// 16384
int ny = 1 << 14;//
int nxy = nx * ny;//
int nBytes = nxy * sizeof(float);//
std::cout << "Matrix size: nx=" << nx << " ny=" << ny << std::endl;
// malloc host memory
startc = clock();
cudaEventRecord(start, 0);
float *h_A, *h_B, *h_S, *d_S;//
h_A = (float *)malloc(nBytes);//
h_B = (float *)malloc(nBytes);//
h_S = (float *)malloc(nBytes);//
d_S = (float *)malloc(nBytes);//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "malloc() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "malloc() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
// initialize matrix at host side
startc = clock();
cudaEventRecord(start, 0);
initialData(h_A, nxy);//
initialData(h_B, nxy);//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "matrix initialization elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "matrix initialization elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
startc = clock();
cudaEventRecord(start, 0);
memset(h_S, 0, nBytes);//
memset(d_S, 0, nBytes);//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "memset() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "memset() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
// add matrix at host side for result checks
startc = clock();
cudaEventRecord(start, 0);
sumMatrixOnHost(h_A, h_B, h_S, nx, ny);//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "sumMatrixOnHost() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "sumMatrixOnHost() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
// malloc device global memory
float *d_MatA, *d_MatB, *d_MatC;
startc = clock();
cudaEventRecord(start, 0);
CHECK(cudaMalloc((void **)&d_MatA, nBytes));//
CHECK(cudaMalloc((void **)&d_MatB, nBytes));//
CHECK(cudaMalloc((void **)&d_MatC, nBytes));//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "cudaMalloc() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "cudaMalloc() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
// transfer data from host to device
startc = clock();
cudaEventRecord(start, 0);
CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));//
CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "cudaMemcpyHostToDevice elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "cudaMemcpyHostToDevice elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
CHECK(cudaGetLastError());//
// invoke kernel at host side
int dimx = 32;
int dimy = 32;
dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
startc = clock();
cudaEventRecord(start, 0);
sumMatrixOnGPU2D << <grid, block >> > (d_MatA, d_MatB, d_MatC, nx, ny);//
CHECK(cudaDeviceSynchronize());// shoule be called after kernel called?
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "sumMatrixOnGPU2D<<<(" << grid.x << grid.y << "), (" << block.x << block.y << ")>>> elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
std::cout << "sumMatrixOnGPU2D<<<(" << grid.x << grid.y << "), (" << block.x << block.y << ")>>> elapsed " << cpu_time_used << " sec on CPU" << std::endl;
// check kernel error
startc = clock();
cudaEventRecord(start, 0);
CHECK(cudaGetLastError());//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "cudaGetLastError() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "cudaGetLastError() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
// copy kernel result back to host side
startc = clock();
cudaEventRecord(start, 0);
CHECK(cudaMemcpy(d_S, d_MatC, nBytes, cudaMemcpyDeviceToHost));//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "cudaMemcpyDeviceToHost elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "cudaMemcpyDeviceToHost elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
// check device results
startc = clock();
cudaEventRecord(start, 0);
checkResult(h_S, d_S, nxy);//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "check results elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "check results elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
// free device global memory
startc = clock();
cudaEventRecord(start, 0);
CHECK(cudaFree(d_MatA));//
CHECK(cudaFree(d_MatB));//
CHECK(cudaFree(d_MatC));//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "cudaFree() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "cudaFree() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
// free host memory
startc = clock();
cudaEventRecord(start, 0);
free(h_A);//
free(h_B);//
free(h_S);//
free(d_S);//
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&gpu_time_used, start, stop);
std::cout << "free() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
std::cout << "free() elapsed " << 1e-3 * gpu_time_used << " sec on GPU" << std::endl;
// reset device
startc = clock();
CHECK(cudaDeviceReset());// cuda reset
stopc = clock(); cpu_time_used = ((double)(stopc - startc)) / CLOCKS_PER_SEC;
std::cout << "cudaDeviceReset() elapsed " << cpu_time_used << " sec on CPU" << std::endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);
return 0;//
}
And the "common.h" simply contains the #define of CHECK as below.
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
fprintf(stderr, "code: %d, reason: %s\n", error, \
cudaGetErrorString(error)); \
exit(1); \
} \
} \
Upvotes: 0
Views: 592
Reputation: 72342
This sequence of calls:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// ...
CHECK(cudaDeviceReset());
// ...
cudaEventDestroy(start);
cudaEventDestroy(stop);
is the problem. cudaDeviceReset()
destroys the current runtime API context, and when you do that, all the resources which were created within that context are destroyed or invalidated. That includes the events you created. It is incorrect to try and use those events (even destroying them) after the context is destroyed.
The reason why it sometimes works is probably because some of the internal context destruction process is done by worker threads asynchronously after the cudaDeviceReset
call. Depending on how fast the context destruction is, the event destruction might accidentally succeed because the internal resources they have used have not yet been released. But that should be regarded as the exception rather than the rule.
Upvotes: 1