Reputation: 33
in my code I create a host variable
h4_in = (double*)calloc(2 * countlog, sizeof(double));
h4_out = (double*)calloc(23 * countlog, sizeof(double));
coountlog is a variable which basically denotes the row size of a 2d array(which I am implementing as a 1D array)
//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
h4_in[count * 2 + 0] = prc[count];
h4_in[count * 2 + 1] = h_stat1out[count * 6];
here is how I call CUDA in the main program
//free cuda memory from previous call
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
fprintf(stderr, "cudaDeviceReset Failed :%s\n", cudaGetErrorString(cudaStatus));
//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
h4_in[count * 2 + 0] = prc[count];
h4_in[count * 2 + 1] = h_stat1out[count * 6];
//Query device to get parameters
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &threadsPerBlock, calcstats2, 0, countlog);
// Round up according to array size
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
//allocate memory on gpu
cudaStatus = cudaMalloc((void **)&d4_in, 2 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_in :%s\n", cudaGetErrorString(cudaStatus));
cudaStatus = cudaMalloc((void **)&d4_out, 23 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_out :%s\n", cudaGetErrorString(cudaStatus));
//transfer array to gpu
cudaStatus = cudaMemcpy(d4_in, h4_in, 2 * countlog * sizeof(double), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 host2device :%s\n", cudaGetErrorString(cudaStatus));
//launch threads
calcstats2 <<<blocksPerGrid, threadsPerBlock>>>(d4_out, d4_in, countlog);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
fprintf(stderr, "calcstats2 kernel failed: %s\n", cudaGetErrorString(cudaStatus));
cudaStatus = cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
fprintf(stderr, "Device Sync failed: %s\n", cudaGetErrorString(cudaStatus));
//transfer data back to host
cudaStatus = cudaMemcpy(h4_out, d4_out, 23 * countlog * sizeof(double), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 device2host :%s\n", cudaGetErrorString(cudaStatus));
//free cuda
The kernel call is as follows
__global__ void calcstats2(double *d4_out, double *d4_in, int size)
int idx = blockDim.x*blockIdx.x + threadIdx.x;
double X, A, B, C, D, BX, BA, BB, BC;
if (idx < 4)
d4_out[idx * 23 + 0] = -1;
d4_out[idx * 23 + 1] = -1;
d4_out[idx * 23 + 2] = -1;
d4_out[idx * 23 + 3] = -1;
d4_out[idx * 23 + 4] = -1;
d4_out[idx * 23 + 5] = -1;
d4_out[idx * 23 + 6] = -1;
d4_out[idx * 23 + 7] = -1;
d4_out[idx * 23 + 8] = -1;
d4_out[idx * 23 + 9] = -1;
d4_out[idx * 23 + 10] = -1;
d4_out[idx * 23 + 11] = -1;
d4_out[idx * 23 + 12] = -1;
d4_out[idx * 23 + 13] = -1;
d4_out[idx * 23 + 14] = -1;
d4_out[idx * 23 + 15] = -1;
d4_out[idx * 23 + 16] = -1;
d4_out[idx * 23 + 17] = -1;
d4_out[idx * 23 + 18] = -1;
d4_out[idx * 23 + 19] = -1;
d4_out[idx * 23 + 20] = -1;
d4_out[idx * 23 + 21] = -1;
d4_out[idx * 23 + 22] = -1;
X = d4_in[idx * 2 - 8];
A = d4_in[idx * 2 - 6];
B = d4_in[idx * 2 - 4];
C = d4_in[idx * 2 - 2];
D = d4_in[idx * 2 - 0];
BX = d4_in[idx * 2 - 5];
BA = d4_in[idx * 2 - 3];
BB = d4_in[idx * 2 - 1];
BC = d4_in[idx * 2 + 1];
//start the stats calcs here
d4_out[idx * 23 + 0] = fabs(X - D) / fabs(A - X);
d4_out[idx * 23 + 1] = fabs(A - D) / fabs(A - X);
d4_out[idx * 23 + 2] = fabs(B - D) / fabs(C - B);
d4_out[idx * 23 + 3] = fabs(C - D) / fabs(C - B);
d4_out[idx * 23 + 4] = fabs(B - D) / fabs(A - B);
d4_out[idx * 23 + 5] = fabs(A - D) / fabs(A - B);
d4_out[idx * 23 + 6] = fabs(X - C) / fabs(A - X);
d4_out[idx * 23 + 7] = fabs(A - C) / fabs(A - X);
d4_out[idx * 23 + 8] = fabs(C - B) / fabs(A - B);
d4_out[idx * 23 + 9] = fabs(A - B) / fabs(A - X);
d4_out[idx * 23 + 10] = fabs(C - D) / fabs(A - B);
d4_out[idx * 23 + 11] = fabs(C - D) / fabs(A - X);
d4_out[idx * 23 + 12] = fabs(C - B) / fabs(A - X);
d4_out[idx * 23 + 13] = BC;
d4_out[idx * 23 + 14] = BB;
d4_out[idx * 23 + 15] = BA;
d4_out[idx * 23 + 16] = BX;
d4_out[idx * 23 + 17] = BB + BC;
d4_out[idx * 23 + 18] = BA + BB + BC;
d4_out[idx * 23 + 19] = BX + BA + BB + BC;
d4_out[idx * 23 + 20] = BA + BB;
d4_out[idx * 23 + 21] = BX + BA + BB;
d4_out[idx * 23 + 22] = BX + BA;
I am getting an error in the cudamemcppy device to host and cudadevicesynchronise that an illegal memory access was encountered. Following stack overflow help I corrected my code to make it a 1D array, i have allocated same memory to both host and device arrays. The strange thing is
this program runs successfully on smaller files (the input is an OHLC data) but gives this error on larger files
even for the larger file there are 3 other kernel calls which run successfully without any issue.
Any help will be greatly appreciated.
Thanks in Advance
PS I am using a single GTX 760 card (ASUS vendor : with 2GB memory. Also cuda version is 7. IDE is VS 2013.
Upvotes: 0
Views: 116
Reputation: 152173
You are (possibly) launching more threads than you actually need:
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
And you have no thread check for this condition in your kernel. Threads numbered higher than countlog
will access your arrays out-of-bounds.
Try changing the else statement in your kernel to:
else if (idx < size)
Upvotes: 1