Reputation: 1757

Resize image using nearest neighborhood with cuda

I am implementing a nearest neighborhood kernel function to resize the input image. But the result is wrong and I have no idea.

Here is the input image

the result is wrong.

I use opencv to read the input image.

cv::Mat image = cv::imread("/home/tumh/test.jpg");
unsigned char* data = image.data;
int outH, outW;
float *out_data_host = test(data, image.rows, image.cols, outH, outW);
cv::Mat out_image(outH, outW, CV_32FC3);
memcpy(out_image.data, out_data_host, outH * outW * 3 * sizeof(float));

float* test(unsigned char* in_data_host, const int &inH, const int &inW, int &outH, int &outW) {
// get the output size
int im_size_min = std::min(inW, inH);
int im_size_max = std::max(inW, inH);

float scale_factor = static_cast<float>(640) / im_size_min;
float im_scale_x = std::floor(inW * scale_factor / 64) * 64 / inW;
float im_scale_y = std::floor(inH * scale_factor / 64) * 64 / inH;

outW = inW * im_scale_x;
outH = inH * im_scale_y;

int channel = 3;

unsigned char* in_data_dev;
CUDA_CHECK(cudaMalloc(&in_data_dev,  sizeof(unsigned char) * channel * inH * inW));
CUDA_CHECK(cudaMemcpy(in_data_dev, in_data_host, 1 * sizeof(unsigned char) * channel * inH * inW, cudaMemcpyHostToDevice));

// image pre process
const float2 scale = make_float2( im_scale_x, im_scale_y);
float * out_buffer = NULL;
CUDA_CHECK(cudaMalloc(&out_buffer,  sizeof(float) * channel * outH * outW));
float *out_data_host = new float[sizeof(float) * channel * outH * outW];
const dim3 threads(32, 32);
const dim3 block(iDivUp(outW, threads.x), iDivUp(outW, threads.y));
gpuPreImageNet<<<block, threads>>>(scale, in_data_dev, inW, out_buffer, outW, outH);
CUDA_CHECK(cudaFree(in_data_dev));
CUDA_CHECK(cudaMemcpy(out_data_host, out_buffer, sizeof(float) * channel * outH * outW, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(out_buffer));
return out_data_host;
}

Here is the resize kernel function

__global__ void gpuPreImageNet( float2 scale, unsigned char* input, int iWidth, float* output, int oWidth, int oHeight )
{
 const int x = blockIdx.x * blockDim.x + threadIdx.x;
 const int y = blockIdx.y * blockDim.y + threadIdx.y;
 const int n = oWidth * oHeight;
 int channel = 3;

 if( x >= oWidth || y >= oHeight )
   return;

 const int dx = ((float)x * scale.x);
 const int dy = ((float)y * scale.y);

 const unsigned char* px  = input +  dy * iWidth * channel + dx * channel ;

 const float3 bgr = make_float3(*(px + 0),  *(px + 1), *(px + 2));

 output[channel * y * oWidth + channel * x + 0] = bgr.x;
 output[channel * y * oWidth + channel * x + 1] = bgr.y;
 output[channel * y * oWidth + channel * x + 2] = bgr.z;
}

Most of the implementation is from https://github.com/soulsheng/ResizeNN/blob/master/resizeCUDA/resizeNN.cu

Any idea?

Upvotes: 0

Answers (2)

Naman

Reputation: 54

It took me around two days to figure out a solution for this problem. Basically, I was building a GPU based image preprocessing pipeline for my project. Here's the custom Cuda Kernel. For Gray scale Image Resizing, change channel from 3 -> 1 and it should work.

__global__ void resize_kernel( real* pIn, real* pOut, int widthIn, int heightIn, int widthOut, int heightOut)
{
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    int channel = 3;

    if( i < heightOut && j < widthOut )
    {
        int iIn = i * heightIn / heightOut;
        int jIn = j * widthIn / widthOut;
        for(int c = 0; c < channel; c++)
            pOut[ (i*widthOut + j)*channel + c ] = pIn[ (iIn*widthIn + jIn)*channel + c ];
    }
}

Upvotes: 2

X3liF

Reputation: 1074

Maybe you are observing an uninitialized memory problem.

As i understand your code, out_data_host allocation is too big

new float[sizeof(float) * channel * outH * outW];

should be

new float[channel * outH * outW]

Then out_buffer is uninitialized, add a cudaMemset after the cudaMalloc line.

To clarify your code, since you already use OpenCV to load images, why don't you use opencv to resize your images ?

cv::resize // Host side method is probably better since you'll have less data copied through PCI-Express
// or
cv::cuda::resize

Upvotes: 3

Resize image using nearest neighborhood with cuda

Answers (2)

Related Questions