Kernel launch failed: invalid argument ,cuda runtime error

Question

I'm trying to launch a kernel in the attached code. I'm getting the massage "kernel launched failed:invalid argument".

// System includes
#include 
#include 

// CUDA runtime
#include 

// Helper functions and utilities to work with CUDA
#include 

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
    if(cudaSuccess != err)
    {
        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.
",file, line, (int)err, cudaGetErrorString( err ) );
        exit(-1);
    }
}

static const int MAX_FILTER_WIDTH = 7;

char *image_filename = "lena_bw_big.pgm";
char *out_filename = "lena_bw.out.pgm";
char *results_filename = "results.log";

// Loads filter configuration parameters from the command line
void load_filter(int argc, char** argv, int* filt_width, float* factor, float* bias, float* coefs, bool* use_shared)
{
    //forward declaration of a function that is being used here
    void parse_coefs(const char* coefs_txt, int radius, float* coefs);

    char* coefs_txt;

    if (argv==NULL || filt_width==NULL || factor==NULL || bias==NULL || coefs==NULL)
    {
        printf("Error: Bad params to load_coefs
");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "filter_width"))
    {
        *filt_width = getCmdLineArgumentInt(argc, (const char **)argv, "filter_width");
        if (*filt_width < 1 || *filt_width >  MAX_FILTER_WIDTH || (*filt_width % 2) != 1)
        {
            printf("Error: Invalid filter width (%d)
",*filt_width);
            exit(-1);
        }
    }
    else
    {
        printf("Error: Filter width is not specified
");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "bias"))
        *bias = getCmdLineArgumentFloat(argc, (const char **)argv, "bias");
    else
    {
        printf("Error: Bias is not specified
");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "factor"))
        *factor = getCmdLineArgumentFloat(argc, (const char **)argv, "factor");
    else
    {
        printf("Error: Factor is not specified
");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "coefs"))
        getCmdLineArgumentString(argc, (const char **)argv, "coefs",&coefs_txt);

    parse_coefs(coefs_txt,*filt_width,coefs);

    if (checkCmdLineFlag(argc, (const char **)argv, "shared"))
        *use_shared = true;
    else
        *use_shared = false;
}

// Parse filter coefficients from string. The number of coefficients should be radius*radius.
void parse_coefs(const char* coefs_txt, int filt_width, float* coefs)
{
    const char* ptxt = coefs_txt;
    int skip_chars;

    memset(coefs,0,MAX_FILTER_WIDTH*MAX_FILTER_WIDTH*sizeof(float));

    for (int i = filt_width - 1; i >= 0; i--)
    {
        for (int j = filt_width - 1; j >= 0; j--)
        {
            if (sscanf(ptxt,"%f%n", &coefs[i*MAX_FILTER_WIDTH+j], &skip_chars) != 1)
            {
                printf("Error: Not enough coefficients. Read %d/%d coefficients.
",i*filt_width+j,filt_width*filt_width);
                exit(-1);
            }
            ptxt += skip_chars+1;
        }
    }
}

__global__ void convolution2D_kernel(
        unsigned char* inputImage,
        unsigned char* outputImage,
        float* filter,
        int imageWidth,
        int imageHeight,
        int imagePitch,
        int filterWidth,
        float hfactor,
        float hbias
        )
{/*
    int idx=blockDim.x*blockIdx.x+threadIdx.x;
    int idy=blockDim.y*blockIdx.y+threadIdx.y;
    if(0=0 && imageX < imageWidth && imageY >=0 && imageY < imageHeight) {
                sum += inputImage[imageX+imageWidth*imageY] * filter[filterX + filterY*filterWidth];
            }
            //sum*=hfactor;
            //sum+=hbias;
            //sum=
                                                                                                                                                                                                                                                                                                                                                                                                                                                        //truncate values smaller than zero and larger than 255
            outputImage[idx+imageWidth*idy] = fminf(fmaxf(int(hfactor * sum + hbias), 0), 255);

        }
    }*/
}

__global__ void convolution2DShared_kernel(
        unsigned char* inputImage,
        unsigned char* outputImage,
        int imageWidth,
        int imageHeight,
        int imagePitch,
        int filterWidth
        )
{

}

void convolution2D(unsigned char* input_img, unsigned char* output_img, float* hfilter, int width, int height,
                int hfilt_width, float hfactor, float hbias, float* hcoefs, bool use_shared)
{
    // Allocate device memory
    unsigned char *d_in=NULL, *d_out=NULL;
    float *d_filter=NULL;

    int imgSize=sizeof(float)*width*height;
    int filterSize=sizeof(float)*hfilt_width*hfilt_width;

    int blockWidth=32;
    int gridx=width/blockWidth;
    if(width%blockWidth!=0)
        gridx++;
    printf("gridx size is %d
",gridx);
    int gridy=height/blockWidth;
    if(height%blockWidth!=0)
        gridy++;
    printf("gridy size is %d
",gridy);
    printf("blockWidth size is %d
",blockWidth);

    // measure execution time
    cudaEvent_t start,stop;
    const int iters = 10;

    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    cudaEventRecord(start, NULL);

    printf("allocating mem
");
    cudaMalloc((void **) d_in, imgSize);
    cudaMalloc((void **) d_out, imgSize);
    cudaMalloc((void **) &d_filter, filterSize);


    cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter,hfilter,filterSize,cudaMemcpyHostToDevice);

    // Setup execution parameters
    dim3 threads(blockWidth, blockWidth);       
    dim3 grid(gridx,gridy);
    printf("kernel starts
");
    // calculate execution time average over iters iterations
    for (int i=0; i>>(d_in, d_out, d_filter, width, height, width, hfilt_width, hfactor, hbias);
        else
            convolution2DShared_kernel<<>>(d_in, d_out, width, height, width, hfilt_width);
    }

    checkCudaErrors(cudaEventRecord(stop, NULL));
    checkCudaErrors(cudaEventSynchronize(stop));

    // check for errors during kernel launch
    cudaError_t err;
    if ((err = cudaGetLastError()) != cudaSuccess)
    {
        printf("Kernel launch failed: %s",cudaGetErrorString(err));
        exit(1);
    }

    float msec = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msec, start, stop));

    printf("Applying %dx%d filter on image of size %dx%d %s using shared memory took %f ms
",
            hfilt_width,hfilt_width,width,height,(use_shared?"with":"without"),msec/iters);

    // write results to results file
    unsigned long long result_values[] = {hfilt_width,hfilt_width,width,height,use_shared,msec/iters*1000};
    if (true != sdkWriteFile(results_filename,result_values,6,0,false,true))
    {
        printf("Error: Writing results file failed.");
        exit(1);
    }

    cudaFree(d_in);
    cudaFree(d_out);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

void convolution_cpu(unsigned char* input_img, unsigned char* output_img, int width, int height,
        int hfilt_width, float hfactor, float hbias, float* hcoefs)
{
    for(int x = 0; x < width; x++)
        for(int y = 0; y < height; y++)
        {
            float sum = 0.f;

            //multiply every value of the filter with corresponding image pixel
            for(int filterX = 0; filterX < hfilt_width; filterX++)
            for(int filterY = 0; filterY < hfilt_width; filterY++)
            {
                int imageX = x - hfilt_width / 2 + filterX;
                int imageY = y - hfilt_width / 2 + filterY;
                if (imageX >=0 && imageX < width && imageY >=0 && imageY < height) {
                    sum += input_img[imageX+width*imageY] * hcoefs[filterX + filterY*MAX_FILTER_WIDTH];
                }
            }

            //truncate values smaller than zero and larger than 255
            output_img[x+width*y] = std::min(std::max(int(hfactor * sum + hbias), 0), 255);
        }
}


/**
* Program main
*/
int main(int argc, char **argv)
{
    unsigned char* h_inimg = NULL;
    unsigned char* h_outimg = NULL;
    unsigned char* h_refimg = NULL;
    unsigned int width, height;
    int hfilt_width = -1;
    float hfactor = 1.f, hbias = 0.f;
    float hcoefs[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
    bool use_shared = false;

    // load parameters of filter
    if (argc > 1)
        load_filter(argc,argv,&hfilt_width,&hfactor,&hbias,hcoefs,&use_shared);
    else {
        hfilt_width = 5;
        hfactor = 1.0f / 13.0f;
        hbias = 0.0f;
        parse_coefs(
            "0,0,1,0,0,"
            "0,1,1,1,0,"
            "1,1,1,1,1,"
            "0,1,1,1,0,"
            "0,0,1,0,0,",
            hfilt_width,hcoefs);
    }


    char* image_path = sdkFindFilePath(image_filename, argv[0]);
    if (image_path == NULL) {
        printf("Unable to source image file: %s
", image_filename);
        exit(-1);
    }
    // Load image from disk
    sdkLoadPGM(image_path, &h_inimg, &width, &height);
    h_outimg = (unsigned char*)malloc(width * height);

    printf("Starting convolution
");
    convolution2D(h_inimg,h_outimg,hcoefs,width,height,hfilt_width,hfactor,hbias,hcoefs,use_shared);

    printf("Validating...
");
    h_refimg = (unsigned char*)malloc(width * height);
    convolution_cpu(h_inimg,h_refimg,width,height,hfilt_width,hfactor,hbias,hcoefs);
    int err_cnt = 0;
    for (int r=0; r 4)
                {
                    printf("Terminating...
");
                    exit(1);
                }
            }
    if (0 == err_cnt)
        printf("OK
");

    // Save image
    sdkSavePGM(out_filename,h_outimg,width,height);

    free(h_inimg);
    free(h_outimg);
}

if i put line 191 into comments everything runs fine and dandy (with no data in kernel).

can anyone please point the proper way to deliver the data into the kernel?

Robert Crovella · Accepted Answer

First of all, you are doing an inadequate job of proper cuda error checking. You should check the return value of every CUDA API call.

If you had done so, you would have found that the "invalid argument" error has nothing to do with your kernel launch, but since that is the only place you are checking for errors, it is getting reported there.

The actual error is occurring on these lines:

cudaMalloc((void **) d_in, imgSize);
cudaMalloc((void **) d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);

and you can fix it by adding the necessary ampersands:

cudaMalloc((void **) &d_in, imgSize);
cudaMalloc((void **) &d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);

Once you fix that error, you will discover that your next error is a seg fault on a cudaMemcpy operation:

cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);

The root cause is here:

int imgSize=sizeof(float)*width*height;
            ^^^^^^^^^^^^^

Since your d_in is unsigned char and your input_img is unsigned char, I'm not sure why you think you should multiply the image size by sizeof(float). Anyway, changing that line to this:

int imgSize=width*height;

will fix the seg fault. Making those changes allows your code to run without any CUDA errors for me. Obviously the results are bogus since your kernels do nothing.

Kernel launch failed: invalid argument ,cuda runtime error

Answers (1)

Related Questions