OpenCL for loop gives CL_OUT_OF_RESOURCES

Question

I am currently passing two GPULevel structs to the kernel. For each level, I want the kernel to go through that level's arr1 array, check whether each value is >= 0 and, if it is, change the value.

My original kernel code was this:

// One level's data, as stored in the __global GPULevel buffers the kernel below receives.
typedef struct GPULevelDef
{
    int nInput, nOutput;   // the kernel uses nInput*nOutput as the number of arr1 entries to visit
    float arr1[100];       // fixed capacity of 100 floats; read and conditionally written by the kernel
    float arr2[100];       // fixed capacity of 100 floats; not referenced by the kernel below
}GPULevel;

// Handles one (level, element) pair per work-item: if the input element is >= 0,
// writes that value plus 350 into the matching output element.
//
//   lLevels - input levels; only nInput, nOutput and arr1 are read
//   oLevels - output levels; only arr1 entries whose input is >= 0 are written.
//             nInput, nOutput, arr2 and the remaining arr1 entries are never
//             written by this kernel, so they keep whatever the buffer already held.
//   LCount  - presumably points at the number of levels - confirm against the host code
__kernel void levelComposition(__global GPULevel *lLevels, __global GPULevel *oLevels, __global int *LCount)
{
    int lIndex = get_global_id(1);   // level index: global id in dimension 1
    int wIndex = get_global_id(0);   // element index inside arr1: global id in dimension 0
    int wCount = 0;
    // NOTE(review): LCount is a __global int*, so this compares lIndex with the
    // pointer itself rather than with the value it points to. `*LCount` looks like
    // what was intended - confirm. As written, the test does not bound lIndex by
    // the level count.
    if(lIndex < LCount)
    {
        // Number of arr1 entries considered for this level.
        wCount = lLevels[lIndex].nInput*lLevels[lIndex].nOutput;
        // NOTE(review): wCount comes from the data; nothing here also limits
        // wIndex to the 100-element capacity of arr1.
        if(wIndex < wCount)
        {
            if(lLevels[lIndex].arr1[wIndex] >= 0)
            {
                oLevels[lIndex].arr1[wIndex] = (lLevels[lIndex].arr1[wIndex]) + 350;
            }
            // No else branch: entries below 0 leave the output element untouched.
        }
    }
}

However, it gave me really weird results: the first GPULevel returned would be correct, but the second one would have only nInput returned with the correct value and the rest would be wrong.

This is what I really want to do on the kernel side, but I get CL_OUT_OF_RESOURCES returned as soon as I add a for loop, even if I strip the kernel back and add just one loop for experimentation.

Desired kernel:

// One level's data, as stored in the __global GPULevel buffers the kernel below receives.
typedef struct GPULevelDef
{
    int nInput, nOutput;   // the kernel uses nInput*nOutput as the upper bound of its inner loop
    float arr1[100];       // fixed capacity of 100 floats; read and conditionally written by the kernel
    float arr2[100];       // fixed capacity of 100 floats; not referenced by the kernel below
}GPULevel;

// Loop-based variant: each work-item walks levels and, inside each level, arr1
// entries, writing input + 350 to the output wherever the input entry is >= 0.
//
//   lLevels - input levels; only nInput, nOutput and arr1 are read
//   oLevels - output levels; only arr1 entries whose input is >= 0 are written
//   lCount  - presumably points at the number of levels - confirm against the host code
__kernel void levelComposition(__global GPULevel *lLevels, __global GPULevel *oLevels, __global int *lCount )
{
    // NOTE(review): lCount is a __global int*, so `lIndex < lCount` compares the
    // index with the pointer itself, not with the stored count (`*lCount` looks
    // intended - confirm). With that bound, lIndex is not limited to the number
    // of levels in the buffers. This may be related to the reported
    // CL_OUT_OF_RESOURCES - verify.
    for(int lIndex = get_global_id(0); lIndex < lCount; lIndex++)
    {
        // Upper bound for the element loop, taken from this level's own fields.
        int wCount = lLevels[lIndex].nInput*lLevels[lIndex].nOutput;
        // NOTE(review): both loops start from get_global_id(0), so a work-item
        // begins at the same offset in the level range and in the element range
        // and then runs to the end of each. Work-items with different ids
        // therefore revisit the same (level, element) pairs. Nothing here also
        // limits wIndex to the 100-element capacity of arr1.
        for(int wIndex = get_global_id(0); wIndex < wCount; wIndex++)
        {
            if(lLevels[lIndex].arr1[wIndex] >= 0)
            {
                oLevels[lIndex].arr1[wIndex] = (lLevels[lIndex].arr1[wIndex]) + 350;
            }
        }
    }
}

The following is the important host code:

// Host-side arrays holding levelCount GPULevel records each.
// NOTE(review): malloc leaves them uninitialised; any code that fills them is
// not part of this excerpt.
GPULevel* levelIn = (GPULevel*)malloc(sizeof(GPULevel)*levelCount);
GPULevel* levelOut = (GPULevel*)malloc(sizeof(GPULevel)*levelCount);

// Size in bytes of levelCount GPULevel records; used for both device buffers
// and for every transfer below.
size_t dataSize = sizeof(GPULevel)*levelCount;
// Input device buffer, followed by a blocking (CL_TRUE) upload into it.
// NOTE(review): err from clCreateBuffer is overwritten by the next call without
// being inspected in this excerpt.
layerBuffer = clCreateBuffer(gpu.context,CL_MEM_READ_ONLY,dataSize,NULL,&err);
// NOTE(review): the upload source is `layerIn`, while the array allocated above
// is named `levelIn` - confirm these are meant to be the same pointer.
err = clEnqueueWriteBuffer(queue,layerBuffer,CL_TRUE,0,dataSize,(void*)layerIn,0,NULL,NULL);
// Output device buffer, also pre-filled with a blocking upload.
cl_mem bufferB = clCreateBuffer(gpu.context,CL_MEM_WRITE_ONLY,dataSize,NULL,&err);
// NOTE(review): the upload source is `layerOut`, while the array allocated
// above is named `levelOut` - confirm.
err = clEnqueueWriteBuffer(queue,bufferB,CL_TRUE,0,dataSize,(void*)layerOut,0,NULL,NULL);

// NOTE(review): this excerpt shows no clSetKernelArg / clEnqueueNDRangeKernel
// calls and no buffer for the kernel's third (count) argument; presumably they
// sit between the uploads above and the read-back below - confirm.

// Host arrays that receive the read-back data.
// NOTE(review): the variables are declared as GPULayer* but the result is cast
// to GPULevel*. Also, sizeof(levelIn) / sizeof(levelOut) is the size of a
// pointer, so the element count passed to calloc is the pointer size rather
// than levelCount. The reads below copy dataSize bytes, which only fits while
// levelCount <= sizeof(GPULevel*).
GPULayer* val1 = (GPULevel*)calloc(sizeof(levelIn), sizeof(GPULevel));
GPULayer* val2 = (GPULevel*)calloc(sizeof(levelOut), sizeof(GPULevel));
// Blocking reads of both device buffers back into host memory.
err = clEnqueueReadBuffer(queue, layerBuffer, CL_TRUE, 0, dataSize, val1, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue, bufferB, CL_TRUE, 0, dataSize, val2, 0, NULL, NULL);

So to summarise: I used the first kernel because I thought it would give me the result I want, as I was under the impression that it is a parallel implementation. I do find it odd that get_global_id() needs to be 1 for lIndex and 0 for wIndex for it to work properly (otherwise it produces wrong results for both levels again). Because this original kernel goes wrong on the second level, I created a second kernel. This second kernel is exactly what I want to implement, but for some reason introducing the for loop causes the CL_OUT_OF_RESOURCES error (-5). I need to know which kernel I should be using and persevering with, and how to get the result I want.

Thanks

Not sure if this diagram helps as well

levels[0]
    nInput = 2
    nOutput = 5
    arr1  [0] = 2
     arr1 [1] = 7
     arr1 [...] = -32
     arr1 [n] = -1
    arr2  [0] = 3
     arr2 [1] = -2
     arr2 [...] = 5
     arr2 [n] = -3

levels[1]
    nInput = 5
    nOutput = 1
    arr1  [0] = 3
     arr1 [1] = 7
     arr1 [...] = 72
     arr1 [n] = -1
    arr2  [0] = 5
     arr2 [1] = -2
     arr2 [...] = 1
     arr2 [n] = -1



  Parallel           Parallel

              ------->oLevels[0].arr1[0] =lLevels[0].arr1[0] +350
   lLevels[0] ------->oLevels[0].arr1[1] =lLevels[0].arr1[1] +350
              ------->oLevels[0].arr1[...] NOTHING
              ------->oLevels[0].arr1[n] NOTHING

              ------->oLevels[1].arr1[0] =lLevels[1].arr1[0] +350
   lLevels[1] ------->oLevels[1].arr1[1] =lLevels[1].arr1[1] +350
              ------->oLevels[1].arr1[...] =lLevels[1].arr1[...] +350
              ------->oLevels[1].arr1[n] NOTHING

OpenCL for loop gives CL_OUT_OF_RESOURCES

Answers (1)

Related Questions