Reputation: 5019
template<typename T>
cv::Mat_<T> const bilinear_interpolation(cv::Mat_<T> const &src, cv::Size dsize,
                                         float dx, float dy)
{
    cv::Mat_<T> dst = dsize.area() == 0 ?
                      cv::Mat_<T>(src.rows * dy, src.cols * dx) :
                      cv::Mat_<T>(dsize);

    float const x_ratio = static_cast<float>(src.cols - 1) / dst.cols;
    float const y_ratio = static_cast<float>(src.rows - 1) / dst.rows;
    for(int row = 0; row != dst.rows; ++row)
    {
        int y = static_cast<int>(row * y_ratio);
        float const y_diff = (row * y_ratio) - y; //distance of the nearest pixel(y axis)
        float const y_diff_2 = 1 - y_diff;
        auto *dst_ptr = &dst(row, 0)[0];
        for(int col = 0; col != dst.cols; ++col)
        {
            int x = static_cast<int>(col * x_ratio);
            float const x_diff = (col * x_ratio) - x; //distance of the nearest pixel(x axis)
            float const x_diff_2 = 1 - x_diff;
            float const y2_cross_x2 = y_diff_2 * x_diff_2;
            float const y2_cross_x = y_diff_2 * x_diff;
            float const y_cross_x2 = y_diff * x_diff_2;
            float const y_cross_x = y_diff * x_diff;

            for(int channel = 0; channel != cv::DataType<T>::channels; ++channel)
            {
                *dst_ptr++ = y2_cross_x2 * src(y, x)[channel] +
                             y2_cross_x * src(y, x + 1)[channel] +
                             y_cross_x2 * src(y + 1, x)[channel] +
                             y_cross_x * src(y + 1, x + 1)[channel];
            }
        }
    }
    return dst;
}
This is an implementation of bilinear interpolation. I use it to enlarge a 512 x 512 image ("lena.png") to 2048 x 2048. My version takes 0.195 s to finish the job, but cv::resize (the non-GPU version) of OpenCV takes only 0.026 s, roughly 7.5x faster. I don't know what makes my program so slow. I would like to read the source of OpenCV's resize, but I can't find its implementation.
Do you have any idea why OpenCV's resize is so fast, or why my bilinear interpolation is so slow?
{
    timeEstimate<> time; // timeEstimate<> is a timing helper (definition not shown)
    cv::Mat_<cv::Vec3b> const src = input;
    bilinear_interpolation(src, cv::Size(), dx, dy);
    std::cout << "bilinear" << std::endl;
}

{
    timeEstimate<> time;
    cv::Mat output = input.clone();
    cv::resize(input, output, cv::Size(), dx, dy, cv::INTER_LINEAR);
    std::cout << "bilinear cv" << std::endl;
}
Compiler: MinGW 4.6.2
OS: Windows 7 64-bit
CPU: Intel® i3-2330M (2.2 GHz)
Upvotes: 6
Views: 4790
Reputation: 4808
I had the same question when adding bilinear upscaling to some CPU based graphics code recently.
Firstly I ran your code with the following config:
OS: Xubuntu 20 in a VM
Compiler: gcc 9.3.0
OpenCV version: 4.2.0
CPU: i3-6100U (2.3 GHz)
Source bitmap size: 512x512
Destination bitmap size: 2048x2048
I found your code took 92 ms while OpenCV took 4.2 ms, so the gap is even larger now than it was when you asked the question in 2012. I guess OpenCV has been optimized even further since then.
(At this point I switched to using Visual Studio 2013 in Windows, building for the x64 target).
Converting the code to use fixed-point arithmetic reduced the time to 30 ms. Fixed-point arithmetic helps because it keeps the data as integers: the input and output pixels are already integers, and converting them to float and back again is costly. I expect the speed-up would have been even greater if I'd stuck with GCC 9.3, since I generally find it generates faster code than VS 2013. Anyway, here's the code:
typedef union {
    unsigned c;
    struct { unsigned char b, g, r, a; };
} DfColour;

typedef struct _DfBitmap {
    int width, height;
    DfColour *pixels;
} DfBitmap;

void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // For every output pixel...
    for (int y = 0; y < dstH; y++) {
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;
        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < dstW; x++, dstPixel++) {
            // Perform bilinear interpolation on 2x2 src pixels.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            unsigned r = 0, g = 0, b = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[srcX];
            unsigned w = (weightX * weightY) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 1,0
            srcPixel++;
            w = (weightX2 * weightY) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 1,1
            srcPixel += src->width;
            w = (weightX2 * weightY2) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 0,1
            srcPixel--;
            w = (weightX * weightY2) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }
}
Switching to a better algorithm reduced the time to 19.5ms. As Andrey Kamaev's answer said, the better algorithm works by splitting the vertical and horizontal resizes into two separate passes. The destination bitmap is used as temporary storage space for the output of the first pass. The X traversal in the second pass is backwards to avoid overwriting the data it is about to need. Here's the code:
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Pass 1: vertical resize. Each destination row gets a vertically
    // interpolated copy of the source row, still at the source width.
    for (int y = 0; y < dstH; y++) {
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;
        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < src->width; x++, dstPixel++) {
            unsigned r = 0, g = 0, b = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            r += srcPixel->r * weightY;
            g += srcPixel->g * weightY;
            b += srcPixel->b * weightY;

            // Pixel 1,0
            srcPixel += src->width;
            r += srcPixel->r * weightY2;
            g += srcPixel->g * weightY2;
            b += srcPixel->b * weightY2;

            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }

    // Pass 2: horizontal resize, done in place on each destination row.
    // x runs backwards so the pixels still to be read aren't overwritten.
    for (int y = 0; y < dstH; y++) {
        DfColour *dstRow = &dst->pixels[y * dst->width];
        for (int x = dstW - 1; x; x--) {
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            unsigned r = 0, g = 0, b = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            r += srcPixel->r * weightX;
            g += srcPixel->g * weightX;
            b += srcPixel->b * weightX;

            // Pixel 0,1
            srcPixel++;
            r += srcPixel->r * weightX2;
            g += srcPixel->g * weightX2;
            b += srcPixel->b * weightX2;

            DfColour *dstPixel = &dstRow[x];
            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }
}
Using a simple, portable SIMD scheme reduced the time to 16.5 ms. The scheme doesn't use instruction set extensions like SSE/AVX. Instead it uses a hack that packs the red and blue channels into a single 32-bit integer so they can be operated on together. It isn't as fast as an AVX implementation, but it has the benefit of simplicity. Here's the code:
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Pass 1: vertical resize. Red and blue are blended together in 'rb'.
    for (int y = 0; y < dstH; y++) {
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;
        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < src->width; x++, dstPixel++) {
            unsigned rb = 0, g = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY;
            g += srcPixel->g * weightY;

            // Pixel 1,0
            srcPixel += src->width;
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            dstPixel->c = rb >> 8;
            dstPixel->g = g >> 8;
        }
    }

    // Pass 2: horizontal resize, in place, right to left.
    for (int y = 0; y < dstH; y++) {
        DfColour *dstRow = &dst->pixels[y * dst->width];
        for (int x = dstW - 1; x; x--) {
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            unsigned rb = 0, g = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            rb += (srcPixel->c & 0xff00ff) * weightX;
            g += srcPixel->g * weightX;

            // Pixel 0,1
            srcPixel++;
            rb += (srcPixel->c & 0xff00ff) * weightX2;
            g += srcPixel->g * weightX2;

            DfColour *dstPixel = &dstRow[x];
            dstPixel->c = rb >> 8;
            dstPixel->g = g >> 8;
        }
    }
}
It is possible to keep the vertical and horizontal blends as separate inner loops but merge the two row (Y) loops into one: each destination row is first filled by the vertical blend and then immediately resized horizontally in place. This improves cache coherency and makes the code a little simpler. Merging the row loops reduces the time to 14.6 ms. Here's the code:
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    for (int y = 0; y < dstH; y++) {
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;
        DfColour *dstRow = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        // Vertical blend for this row, at the source width.
        for (int x = 0; x < src->width; x++) {
            unsigned rb = 0, g = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY;
            g += srcPixel->g * weightY;

            // Pixel 1,0
            srcPixel += src->width;
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }

        // Horizontal blend for this row, in place, right to left.
        for (int x = dstW - 1; x; x--) {
            unsigned rb = 0, g = 0;
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            rb += (srcPixel->c & 0xff00ff) * weightX;
            g += srcPixel->g * weightX;

            // Pixel 0,1
            srcPixel++;
            rb += (srcPixel->c & 0xff00ff) * weightX2;
            g += srcPixel->g * weightX2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }
    }
}
The second inner loop can be simplified with a look-up table, since the values of srcX, weightX and weightX2 are the same for every scanline. Using a look-up table in the second inner loop reduces the runtime to 12.9 ms. Here's the code:
#include <stdint.h> // for uint8_t / uint16_t

struct SrcXandWeights {
    uint8_t weightX, weightX2;
    uint16_t srcX;
};

void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Allocate look-up table.
    static SrcXandWeights *lut = NULL;
    static int lutSize = 0;
    if (lutSize < dstW) {
        delete [] lut;
        lut = new SrcXandWeights [dstW];
        lutSize = dstW;
    }

    // Populate look-up table.
    for (int x = 0; x < dstW; x++) {
        int srcXAndWeight = (x * widthRatio) >> 8;
        lut[x].srcX = srcXAndWeight >> 8;
        lut[x].weightX2 = srcXAndWeight & 0xFF;
        lut[x].weightX = 255 - lut[x].weightX2;
    }

    for (int y = 0; y < dstH; y++) {
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;
        DfColour *dstRow = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < src->width; x++) {
            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            unsigned rb = (srcPixel->c & 0xff00ff) * weightY;
            unsigned g = srcPixel->g * weightY;

            // Pixel 1,0
            srcPixel += src->width;
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }

        for (int x = dstW - 1; x; x--) {
            SrcXandWeights *sw = lut + x;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[sw->srcX];
            unsigned rb = (srcPixel->c & 0xff00ff) * sw->weightX;
            unsigned g = srcPixel->g * sw->weightX;

            // Pixel 0,1
            srcPixel++;
            rb += (srcPixel->c & 0xff00ff) * sw->weightX2;
            g += srcPixel->g * sw->weightX2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }
    }
}
At this point the code is still single threaded. My CPU has two physical cores and 4 threads in total. OpenCV uses 2 threads on my machine. I expect converting the code to use 2 threads would reduce the time to about 7ms.
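A minimal sketch of what that might look like, assuming a hypothetical resize_rows() helper that contains the body of the outer y loop from the final version, restricted to rows [yBegin, yEnd):

#include <thread>

// Hypothetical helper: the body of the outer y loop from the final version,
// run only for y in [yBegin, yEnd). 'lut' is the populated look-up table.
void resize_rows(DfBitmap *src, DfBitmap *dst, float scale,
                 const SrcXandWeights *lut, int yBegin, int yEnd);

void bilinear_interpolation_mt(DfBitmap *src, DfBitmap *dst, float scale,
                               const SrcXandWeights *lut) {
    int dstH = scale * src->height;
    int mid = dstH / 2;
    // Each destination row only reads src and writes its own dst row, so the
    // two halves can be processed concurrently.
    std::thread top(resize_rows, src, dst, scale, lut, 0, mid);
    resize_rows(src, dst, scale, lut, mid, dstH); // bottom half on this thread
    top.join();
}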
I don't know what other tricks are needed to get down to 4ms, although converting to a real AVX SIMD implementation is probably necessary.
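For what it's worth, here is a minimal SSE2 sketch (my own, not OpenCV's code) of just the vertical blend for one row of packed 32-bit pixels, using the same 8-bit fixed-point weights (w0 + w1 == 256); an AVX2 version would be the same idea with 256-bit registers. It assumes the row width is a multiple of 4 pixels:

#include <emmintrin.h> // SSE2
#include <stdint.h>

// Blend two rows of packed 8-bit-per-channel pixels: out = (row0*w0 + row1*w1) >> 8.
// Assumes w0 + w1 == 256 and width is a multiple of 4 pixels.
void blend_rows_sse2(const uint32_t *row0, const uint32_t *row1, uint32_t *out,
                     int width, unsigned w0, unsigned w1) {
    __m128i vw0 = _mm_set1_epi16((short)w0);
    __m128i vw1 = _mm_set1_epi16((short)w1);
    __m128i zero = _mm_setzero_si128();
    for (int x = 0; x < width; x += 4) {
        __m128i a = _mm_loadu_si128((const __m128i *)(row0 + x));
        __m128i b = _mm_loadu_si128((const __m128i *)(row1 + x));
        // Widen 8-bit channels to 16 bits so the weighted sums don't overflow.
        __m128i aLo = _mm_unpacklo_epi8(a, zero), aHi = _mm_unpackhi_epi8(a, zero);
        __m128i bLo = _mm_unpacklo_epi8(b, zero), bHi = _mm_unpackhi_epi8(b, zero);
        __m128i lo = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(aLo, vw0),
                                                  _mm_mullo_epi16(bLo, vw1)), 8);
        __m128i hi = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(aHi, vw0),
                                                  _mm_mullo_epi16(bHi, vw1)), 8);
        // Narrow back to 8 bits per channel and store 4 pixels.
        _mm_storeu_si128((__m128i *)(out + x), _mm_packus_epi16(lo, hi));
    }
}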
Upvotes: 3
Reputation: 67
Perhaps a little late, but also check whether you are running your application in debug mode. OpenCV is a library and was most likely compiled for release, with compiler optimizations enabled, while your own code may not be.
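For example, with a command-line MinGW/GCC build like the one in the question, make sure optimizations are enabled when compiling the benchmark (the file and target names here are placeholders, and the pkg-config package name depends on your OpenCV version):

g++ -O2 -DNDEBUG main.cpp -o resize_bench `pkg-config --cflags --libs opencv`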
Upvotes: 0
Reputation: 30122
There are two main things making OpenCV's version faster:

1. OpenCV implements resize as a "separable" operation, i.e. it is done in two steps: the image is stretched horizontally and then vertically. This technique allows the resize to be done with fewer arithmetic operations (see the sketch below the list).
2. Hand-coded SSE optimization.
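Below is a minimal, unoptimized floating-point sketch of the separable idea (not OpenCV's actual implementation, which also uses fixed point and SSE). The horizontal pass builds a src.rows x dst.cols intermediate image, so each pass is a 2-tap blend; for a 512 -> 2048 upscale that works out to roughly 2.5 multiply-adds per channel per output pixel instead of the 4 needed by the direct 2-D formula:

#include <opencv2/core/core.hpp>

// Sketch only: horizontal pass into 'tmp', then vertical pass into 'dst'.
cv::Mat_<cv::Vec3f> resize_separable(cv::Mat_<cv::Vec3f> const &src, cv::Size dsize)
{
    float const x_ratio = static_cast<float>(src.cols - 1) / dsize.width;
    float const y_ratio = static_cast<float>(src.rows - 1) / dsize.height;

    // Pass 1: stretch horizontally (source height, destination width).
    cv::Mat_<cv::Vec3f> tmp(src.rows, dsize.width);
    for (int r = 0; r != src.rows; ++r)
    {
        for (int c = 0; c != dsize.width; ++c)
        {
            float const fx = c * x_ratio;
            int const x = static_cast<int>(fx);
            float const dx = fx - x;
            tmp(r, c) = src(r, x) * (1 - dx) + src(r, x + 1) * dx; // 2 taps
        }
    }

    // Pass 2: stretch vertically (destination height and width).
    cv::Mat_<cv::Vec3f> dst(dsize.height, dsize.width);
    for (int r = 0; r != dsize.height; ++r)
    {
        float const fy = r * y_ratio;
        int const y = static_cast<int>(fy);
        float const dy = fy - y;
        for (int c = 0; c != dsize.width; ++c)
            dst(r, c) = tmp(y, c) * (1 - dy) + tmp(y + 1, c) * dy; // 2 taps
    }
    return dst;
}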
Upvotes: 5