user3082499
user3082499

Reputation: 31

invalid device function when execution a kernel

I'm trying to execute a kernel running the gabor filter and I get this error

/Gabor_Cuda/gaborMax.cu(2387) : getLastCudaError() CUDA error : convolutionColumnGaborMaxGPU() execution failed : (8) invalid device function 

this the line "2387" of the file "gaborMax.cu" :

getLastCudaError("convolutionColumnGaborMaxGPU() execution failed\n");

and the kernel it refers to is this one :

convolutionGaborMaxGPU<<<blockGridColumns, threadBlockColumns>>>(d_Input, d_Result0,        d_Result1,d_Result2,d_Result3,d_Result4,d_Result5,d_Result6,d_Result7,d_Result8,d_Result9,d_Result10,d_Result11,d_Result12,d_Result13,d_Result14,d_Result15,DATA_W,DATA_H,  loadsPerThread,loadsPerThread);

i'm wandering if i should post or not the code of the kernel for it's a more tha 1500 code lines i think it would have been better for me to post the file "gaborMax.cu" but anyway this is the code of the kernel

__global__ void convolutionGaborMaxGPU(
    float *d_Input,
    float *d_Result0,
    float *d_Result1,
    float *d_Result2,
    float *d_Result3,
    float *d_Result4,
    float *d_Result5,
    float *d_Result6,
    float *d_Result7,
    float *d_Result8,
    float *d_Result9,
    float *d_Result10,
    float *d_Result11,
    float *d_Result12,
    float *d_Result13,
    float *d_Result14,
    float *d_Result15,
    int dataW,
    int dataH,
    int loadsPerThreadX,
    int loadsPerThreadY
){

    const int smemSize = SUBPICW * SUBPICW;
    const int smemYOffset = IMUL(threadIdx.y, SUBPICW);
    const int smemYBlockOffset = IMUL(blockDim.y, SUBPICW);
    const int yOffset = IMUL(threadIdx.y, dataW);
    const int localYBlockOffset = IMUL(blockDim.y, dataW);
    const int globalYBlockOffset = IMUL(blockIdx.y, blockDim.y * dataW );
    const int xBlockOffset = IMUL(blockIdx.x, blockDim.x);
    //const int apronOffset = (APRON0 * dataW) - APRON0;

    __shared__ float data[SUBPICW*SUBPICW];

    int currentXIdx = 0;
    int smemPos = 0;
    int smemPosData = 0;
    int gmemPos = 0;
    int gmemPosData = 0;

    for (int k = 0; k < loadsPerThreadY; k++) 
    { 
        for (int l = 0; l < loadsPerThreadX; l++) 
        {
            currentXIdx = threadIdx.x + (l*blockDim.x); 
            if (currentXIdx < SUBPICW)
                { 
                smemPos = currentXIdx + smemYOffset + (k * smemYBlockOffset); 
            if (smemPos < smemSize) 
                        { 
                    gmemPos = currentXIdx + xBlockOffset; 
                    if (gmemPos - APRON0 >= dataW)
                                {
                        gmemPos = dataW + APRON0;
                    }
                    else if (gmemPos < APRON0) {
                        gmemPos = APRON0;
                    }

                    gmemPos+= (yOffset + globalYBlockOffset + (k * localYBlockOffset) - (APRON0 * dataW) - APRON0); 
                    if (gmemPos < APRON0) 
                                {
                        gmemPos = APRON0;
                    }
                    else if (gmemPos >= dataW*dataH) 
                {
                        gmemPos = dataW*dataH - 1;
                    }
                data[smemPos] = d_Input[gmemPos];
                }
            }
        }
    }

    __syncthreads();

    smemPosData = threadIdx.x + smemYOffset + APRON0 + (APRON0 * SUBPICW);
    //smemPosData = threadIdx.x + ((threadIdx.y) * SUBPICW);
    gmemPosData = threadIdx.x + xBlockOffset
            + yOffset + globalYBlockOffset;

/////////////////////////////////////////////////////////////////////////////////calculate 1st filter convolution

    float sum0 = 0;
    #ifdef UNROLL_INNER
    sum0 = convolutionGaborMax18<2 * KERNEL_RADIUS0>(data + smemPosData, d_Kernel0);
    #else
    for (int k = -KERNEL_RADIUS0; k <= KERNEL_RADIUS0; k++) {

        sum0 += data[smemPosData -18 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -18)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -17 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -17)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -16 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -16)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -15 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -15)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -14 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -14)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -13 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -13)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -12 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -12)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -11 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -11)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -10 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -10)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -9 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -9)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -8 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -8)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -7 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -7)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -6 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -6)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -5 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -5)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -4 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -4)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -3 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -3)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -2 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -2)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData -1 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 -1)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +1 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +1)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +2 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +2)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +3 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +3)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +4 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +4)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +5 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +5)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +6 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +6)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +7 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +7)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +8 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +8)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +9 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +9)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +10 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +10)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +11 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +11)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +12 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +12)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +13 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +13)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +14 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +14)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +15 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +15)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +16 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +16)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +17 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +17)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
        sum0 += data[smemPosData +18 + (k*SUBPICW)]
                    * d_Kernel0[(KERNEL_RADIUS0 +18)
                    + ((KERNEL_RADIUS0 + k)*KERNEL_W0)];

    }
#endif
    sum0 /= 18;


    d_Result0[gmemPosData] = sum0;
    d_Result1[gmemPosData] = sum1;
    d_Result2[gmemPosData] = sum2;
    d_Result3[gmemPosData] = sum3;
    d_Result4[gmemPosData] = sum4;
    d_Result5[gmemPosData] = sum5;
    d_Result6[gmemPosData] = sum6;
    d_Result7[gmemPosData] = sum7;
    d_Result8[gmemPosData] = sum8;
    d_Result9[gmemPosData] = sum9;
    d_Result10[gmemPosData] = sum10;
    d_Result11[gmemPosData] = sum11;
    d_Result12[gmemPosData] = sum12;
    d_Result13[gmemPosData] = sum13;
    d_Result14[gmemPosData] = sum14;
    d_Result15[gmemPosData] = sum15;
}

I do the same calculation for all the 15 others filters convolution, from which i have sum1, ....., sum15

I'm runing my code on a 32 bits and don't know if the code needs to be executed on a 64 bits computer but I don't understand the meaning of this error.

Upvotes: 2

Views: 606

Answers (1)

Donna
Donna

Reputation: 41

I know this question is old, but this answer may help others who have had the same type of error message. I recommend checking what architecture you are compiling your cuda code with, i.e.

-gencode arch=compute_20,code=sm_20

Try running deviceQuery from the CUDA samples to make sure you set that correctly.

Upvotes: 1

Related Questions