Reputation: 999
I am currently passing in two GPULevel structs. I want the kernel to take each of them and, for every element of that level's arr1 array, check whether the value is >= 0 and, if it is, change the value.
My original kernel code was this:
/* One level's data as exchanged between the host buffers and the kernel. */
typedef struct GPULevelDef
{
    int nInput;       /* multiplied by nOutput in the kernel to get the element count */
    int nOutput;
    float arr1[100];  /* values the kernel tests against zero and offsets */
    float arr2[100];  /* not accessed by the kernel shown here */
} GPULevel;
// Each work-item handles one arr1 element of one level: when
// lLevels[lIndex].arr1[wIndex] is >= 0, that value plus 350 is stored in the
// same slot of oLevels. Nothing else in oLevels is written by this kernel
// (nInput, nOutput, arr2 and the slots whose input is negative are left as-is).
__kernel void levelComposition(__global GPULevel *lLevels, __global GPULevel *oLevels, __global int *LCount)
{
// Dimension 1 of the NDRange picks the level, dimension 0 picks the element.
int lIndex = get_global_id(1);
int wIndex = get_global_id(0);
int wCount = 0;
// NOTE(review): LCount is a __global int pointer, so this compares lIndex
// with the pointer itself rather than with the count it points to;
// presumably *LCount (or LCount[0]) was intended.
if(lIndex < LCount)
{
// Number of arr1 elements considered for this level.
// NOTE(review): arr1 has 100 elements and nothing here caps wCount at 100;
// confirm nInput*nOutput never exceeds that.
wCount = lLevels[lIndex].nInput*lLevels[lIndex].nOutput;
if(wIndex < wCount)
{
if(lLevels[lIndex].arr1[wIndex] >= 0)
{
oLevels[lIndex].arr1[wIndex] = (lLevels[lIndex].arr1[wIndex]) + 350;
}
}
}
}
However, it gave me really weird results: the first GPULevel returned would be correct, but the second one would have only nInput returned with the correct value and the rest would be wrong.
This is what I really want to do on the kernel side, but I get CL_OUT_OF_RESOURCES returned as soon as I add a for loop, even if I strip it back and add just one loop for experimentation.
Desired kernel:
/* One level's data as exchanged between the host buffers and the kernel. */
typedef struct GPULevelDef
{
    int nInput;       /* multiplied by nOutput in the kernel to get the element count */
    int nOutput;
    float arr1[100];  /* values the kernel tests against zero and offsets */
    float arr2[100];  /* not accessed by the kernel shown here */
} GPULevel;
// Loop-based variant: for each level and each arr1 element visited, when the
// input value is >= 0 the value plus 350 is stored in the same slot of oLevels.
// Only oLevels[].arr1[] is written.
__kernel void levelComposition(__global GPULevel *lLevels, __global GPULevel *oLevels, __global int *lCount )
{
// NOTE(review): lCount is a __global int pointer, so the loop bound compares
// lIndex with the pointer itself rather than with the count it points to;
// presumably *lCount was intended. With a pointer as the bound, lIndex is not
// limited to the number of levels actually in the buffers.
// The loop starts at this work-item's global id and steps by 1, so every
// work-item also visits all levels after its own.
for(int lIndex = get_global_id(0); lIndex < lCount; lIndex++)
{
// NOTE(review): arr1 has 100 elements and nothing here caps wCount at 100;
// confirm nInput*nOutput never exceeds that.
int wCount = lLevels[lIndex].nInput*lLevels[lIndex].nOutput;
// The element loop uses the same get_global_id(0) as its start, so the
// starting level and the starting element are the same number.
for(int wIndex = get_global_id(0); wIndex < wCount; wIndex++)
{
if(lLevels[lIndex].arr1[wIndex] >= 0)
{
oLevels[lIndex].arr1[wIndex] = (lLevels[lIndex].arr1[wIndex]) + 350;
}
}
}
}
The following is the important host code:
// Host-side excerpt: allocate staging arrays, create two device buffers,
// upload to both, then read both back. No clSetKernelArg or
// clEnqueueNDRangeKernel call appears in this excerpt.
GPULevel* levelIn = (GPULevel*)malloc(sizeof(GPULevel)*levelCount);
GPULevel* levelOut = (GPULevel*)malloc(sizeof(GPULevel)*levelCount);
// Byte size of levelCount GPULevel structs; used for both buffers and all transfers.
size_t dataSize = sizeof(GPULevel)*levelCount;
// NOTE(review): layerBuffer, layerIn and layerOut are not declared in this
// excerpt; presumably layerIn/layerOut are the levelIn/levelOut arrays
// allocated above. Confirm against the full source.
layerBuffer = clCreateBuffer(gpu.context,CL_MEM_READ_ONLY,dataSize,NULL,&err);
err = clEnqueueWriteBuffer(queue,layerBuffer,CL_TRUE,0,dataSize,(void*)layerIn,0,NULL,NULL);
cl_mem bufferB = clCreateBuffer(gpu.context,CL_MEM_WRITE_ONLY,dataSize,NULL,&err);
// err is overwritten by each call and is not checked anywhere in this excerpt.
err = clEnqueueWriteBuffer(queue,bufferB,CL_TRUE,0,dataSize,(void*)layerOut,0,NULL,NULL);
// NOTE(review): sizeof(levelIn)/sizeof(levelOut) is the size of a pointer, not
// levelCount, so val1/val2 hold sizeof(pointer) elements while the reads below
// copy dataSize bytes; that overruns the allocation if levelCount is larger.
// The declared type GPULayer also differs from the GPULevel used in the cast.
GPULayer* val1 = (GPULevel*)calloc(sizeof(levelIn), sizeof(GPULevel));
GPULayer* val2 = (GPULevel*)calloc(sizeof(levelOut), sizeof(GPULevel));
// Blocking reads (CL_TRUE) of both buffers back into host memory.
err = clEnqueueReadBuffer(queue, layerBuffer, CL_TRUE, 0, dataSize, val1, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue, bufferB, CL_TRUE, 0, dataSize, val2, 0, NULL, NULL);
So to summarise: I used the first kernel because I thought it would give me the result I want, as I was under the impression that it is a parallel implementation. I do find it odd that get_global_id() needs to be 1 for lIndex and 0 for wIndex for it to work properly (otherwise it produces wrong results for both levels again). Because this original kernel goes wrong on the second level, I created a second kernel. The second kernel is exactly what I want to implement, but for some reason introducing the for loop causes the CL_OUT_OF_RESOURCES error (-5). I need to know which kernel I should be using and persevering with, and how to get the result I want.
Thanks
Not sure if this diagram helps as well
levels[0]
nInput = 2
nOutput = 5
arr1 [0] = 2
arr1 [1] = 7
arr1 [...] = -32
arr1 [n] = -1
arr2 [0] = 3
arr2 [1] = -2
arr2 [...] = 5
arr2 [n] = -3
levels[1]
nInput = 5
nOutput = 1
arr1 [0] = 3
arr1 [1] = 7
arr1 [...] = 72
arr1 [n] = -1
arr2 [0] = 5
arr2 [1] = -2
arr2 [...] = 1
arr2 [n] = -1
Parallel Parallel
------->oLevels[0].arr1[0] =lLevels[0].arr1[0] +350
lLevels[0] ------->oLevels[0].arr1[1] =lLevels[0].arr1[1] +350
------->oLevels[0].arr1[...] NOTHING
------->oLevels[0].arr1[n] NOTHING
------->oLevels[1].arr1[0] =lLevels[1].arr1[0] +350
lLevels[1] ------->oLevels[1].arr1[1] =lLevels[1].arr1[1] +350
------->oLevels[1].arr1[...] =lLevels[1].arr1[...] +350
------->oLevels[1].arr1[n] NOTHING
Upvotes: 0
Views: 1104
Reputation: 9906
`LCount` is a pointer to int, and you are using it as an integer. Your loop probably goes out of range.
`CL_OUT_OF_RESOURCES` is often an indication of out-of-range addressing.
Your OpenCL compiler should have emitted a warning. You may want to check the string returned by clGetProgramBuildInfo(...,CL_PROGRAM_BUILD_LOG,...).
Upvotes: 1