Abhishek Ray
Abhishek Ray

Reputation: 33

Cuda error illegal memory referenced in devicesync and cudamemcopy

in my code I create a host variable

h4_in = (double*)calloc(2 * countlog, sizeof(double));
h4_out = (double*)calloc(23 * countlog, sizeof(double));

coountlog is a variable which basically denotes the row size of a 2d array(which I am implementing as a 1D array)

//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
{
    h4_in[count * 2 + 0] = prc[count];
    h4_in[count * 2 + 1] = h_stat1out[count * 6];
}

here is how I call CUDA in the main program

//free cuda memory from previous call
cudaFree(d3_in);
cudaFree(d3_out);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "cudaDeviceReset Failed :%s\n", cudaGetErrorString(cudaStatus));
}
//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
{
    h4_in[count * 2 + 0] = prc[count];
    h4_in[count * 2 + 1] = h_stat1out[count * 6];
}
//Query device to get parameters
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &threadsPerBlock, calcstats2, 0, countlog);
// Round up according to array size 
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
//allocate memory on gpu
cudaStatus = cudaMalloc((void **)&d4_in, 2 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_in :%s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaMalloc((void **)&d4_out, 23 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_out :%s\n", cudaGetErrorString(cudaStatus));
}
//transfer array to gpu
cudaStatus = cudaMemcpy(d4_in, h4_in, 2 * countlog * sizeof(double), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 host2device :%s\n", cudaGetErrorString(cudaStatus));
}
//launch threads
calcstats2 <<<blocksPerGrid, threadsPerBlock>>>(d4_out, d4_in, countlog);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "calcstats2 kernel failed: %s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "Device Sync failed: %s\n", cudaGetErrorString(cudaStatus));
}
//transfer data back to host
cudaStatus = cudaMemcpy(h4_out, d4_out, 23 * countlog * sizeof(double), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 device2host :%s\n", cudaGetErrorString(cudaStatus));
}
//free cuda
cudaFree(d4_in);
cudaFree(d4_out);

The kernel call is as follows

__global__ void calcstats2(double *d4_out, double *d4_in, int size)
{
    int idx = blockDim.x*blockIdx.x + threadIdx.x;
    double X, A, B, C, D, BX, BA, BB, BC;
    if (idx < 4)
    {
        d4_out[idx * 23 + 0] = -1;
        d4_out[idx * 23 + 1] = -1;
        d4_out[idx * 23 + 2] = -1;
        d4_out[idx * 23 + 3] = -1;
        d4_out[idx * 23 + 4] = -1;
        d4_out[idx * 23 + 5] = -1;
        d4_out[idx * 23 + 6] = -1;
        d4_out[idx * 23 + 7] = -1;
        d4_out[idx * 23 + 8] = -1;
        d4_out[idx * 23 + 9] = -1;
        d4_out[idx * 23 + 10] = -1;
        d4_out[idx * 23 + 11] = -1;
        d4_out[idx * 23 + 12] = -1;
        d4_out[idx * 23 + 13] = -1;
        d4_out[idx * 23 + 14] = -1;
        d4_out[idx * 23 + 15] = -1;
        d4_out[idx * 23 + 16] = -1;
        d4_out[idx * 23 + 17] = -1;
        d4_out[idx * 23 + 18] = -1;
        d4_out[idx * 23 + 19] = -1;
        d4_out[idx * 23 + 20] = -1;
        d4_out[idx * 23 + 21] = -1;
        d4_out[idx * 23 + 22] = -1;
    }
    else
    {
        X = d4_in[idx * 2 - 8];
        A = d4_in[idx * 2 - 6];
        B = d4_in[idx * 2 - 4];
        C = d4_in[idx * 2 - 2];
        D = d4_in[idx * 2 - 0];
        BX = d4_in[idx * 2 - 5];
        BA = d4_in[idx * 2 - 3];
        BB = d4_in[idx * 2 - 1];
        BC = d4_in[idx * 2 + 1];
        //start the stats calcs here
        d4_out[idx * 23 + 0] = fabs(X - D) / fabs(A - X);
        d4_out[idx * 23 + 1] = fabs(A - D) / fabs(A - X);
        d4_out[idx * 23 + 2] = fabs(B - D) / fabs(C - B);
        d4_out[idx * 23 + 3] = fabs(C - D) / fabs(C - B);
        d4_out[idx * 23 + 4] = fabs(B - D) / fabs(A - B);
        d4_out[idx * 23 + 5] = fabs(A - D) / fabs(A - B);
        d4_out[idx * 23 + 6] = fabs(X - C) / fabs(A - X);
        d4_out[idx * 23 + 7] = fabs(A - C) / fabs(A - X);
        d4_out[idx * 23 + 8] = fabs(C - B) / fabs(A - B);
        d4_out[idx * 23 + 9] = fabs(A - B) / fabs(A - X);
        d4_out[idx * 23 + 10] = fabs(C - D) / fabs(A - B);
        d4_out[idx * 23 + 11] = fabs(C - D) / fabs(A - X);
        d4_out[idx * 23 + 12] = fabs(C - B) / fabs(A - X);
        d4_out[idx * 23 + 13] = BC;
        d4_out[idx * 23 + 14] = BB;
        d4_out[idx * 23 + 15] = BA;
        d4_out[idx * 23 + 16] = BX;
        d4_out[idx * 23 + 17] = BB + BC;
        d4_out[idx * 23 + 18] = BA + BB + BC;
        d4_out[idx * 23 + 19] = BX + BA + BB + BC;
        d4_out[idx * 23 + 20] = BA + BB;
        d4_out[idx * 23 + 21] = BX + BA + BB;
        d4_out[idx * 23 + 22] = BX + BA;
    }
}

I am getting an error in the cudamemcppy device to host and cudadevicesynchronise that an illegal memory access was encountered. Following stack overflow help I corrected my code to make it a 1D array, i have allocated same memory to both host and device arrays. The strange thing is

  1. this program runs successfully on smaller files (the input is an OHLC data) but gives this error on larger files

  2. even for the larger file there are 3 other kernel calls which run successfully without any issue.

Any help will be greatly appreciated.

Thanks in Advance

Abhishek

PS I am using a single GTX 760 card (ASUS vendor :https://www.asus.com/Graphics-Cards/GTX760DC2OC2GD5/) with 2GB memory. Also cuda version is 7. IDE is VS 2013.

Upvotes: 0

Views: 116

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 151799

You are (possibly) launching more threads than you actually need:

 blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;

And you have no thread check for this condition in your kernel. Threads numbered higher than countlog will access your arrays out-of-bounds.

Try changing the else statement in your kernel to:

else if (idx < size)

Upvotes: 1

Related Questions