Reputation: 73
Below is the GPU kernel snippet :
__global_ void POCKernel(int *a)
{
int i = threadIdx.x;
a[i] = a[i] + 1;
if (i < 1024 * 1024)
{
double dblNewMemoryVarA[15];
double dblNewMemoryVarB[15];
double dblNewMemoryVarC[15];
//double* dblNewMemoryVarA = (double*)malloc(15 * sizeof(double));
////memset(dblNewMemoryVarA, 0, 15 * sizeof(double));
//double* dblNewMemoryVarB = (double*)malloc(15 * sizeof(double));
////memset(dblNewMemoryVarB, 0, 15 * sizeof(double));
//double* dblNewMemoryVarC = (double*)malloc(15 * sizeof(double));
////memset(dblNewMemoryVarC, 0, 15 * sizeof(double));
for (int j = 0; j < 15; j++)
{
dblNewMemoryVarA[j] = 0;
dblNewMemoryVarB[j] = 0;
dblNewMemoryVarC[j] = 0;
}
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] * dblNewMemoryVarB[i];
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] - dblNewMemoryVarB[i];
/*free(dblNewMemoryVarA);
free(dblNewMemoryVarB);
free(dblNewMemoryVarC);*/
}
}
The calling function to this kernel is :
int main()
{
const int arraySize = 1024 * 1024;
int* a = new int[arraySize];
int *dev_a = 0;
for (int i = 0; i < arraySize; i++)
{
a[i] = 5;
}
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "CUDA failed!");
return 1;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
POCKernel << <4096, 256 >> >(dev_a);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
Error:
cudaFree(dev_a);
return 0;
}
}
On cudaDeviceSynchronize, the error code is 4 - unspecified launch failure. Can someone please tell me why am I facing this issue?
Upvotes: 0
Views: 11005
Reputation: 152279
This code is strange in a number of ways but let's get to the point. A definite problem is here in these lines of kernel code:
int i = threadIdx.x;
...
if (i < 1024 * 1024)
{
double dblNewMemoryVarA[15];
double dblNewMemoryVarB[15];
double dblNewMemoryVarC[15];
...
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
You are launching threadblocks of 256 threads each:
POCKernel << <4096, 256 >> >(dev_a);
^^^
That means your threadIdx.x
variable will range from 0 to 255 across all the threads in a block:
int i = threadIdx.x;
In your local variables, you've allocated space for 15 quantities:
double dblNewMemoryVarA[15];
But you then try to index into these arrays using i
, which as previously noted will range up to 255:
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
so that will generate out-of-bounds indexing, which could very well lead to a kernel launch failure.
It's impossible to say for sure, since you haven't provided a complete code nor indicated how you are compiling or what environment you are running in. But the above is certainly illegal from a code correctness standpoint.
My guess would be you are compiling in debug mode (-G
). If not, I would expect the compiler to optimize everything after the if test away, as none of that code affects any global state.
And as pointed out in the comments, it may simply be that you are running into a windows WDDM timeout if you are running this one windows.
Upvotes: 2