Reputation: 85
Currently working on C++ implementation of ToGreyscale method and I want to ask what is the most efficient way to transform "unsigned char* source" using custom RGB input params.
Below is a current idea, but maybe using a Vector would be better?
uint8_t* pixel = source;
for (int i = 0; i < sourceInfo.height; ++i) {
for (int j = 0; j < sourceInfo.width; ++j, pixel += pixelSize) {
float r = pixel[0];
float g = pixel[1];
float b = pixel[2];
// Do something with r, g, b
}
}
Upvotes: 1
Views: 1586
Reputation: 32084
The most efficient single threaded CPU implementation, is using manually optimized SIMD implementation.
SIMD extensions are specific for processor architecture.
For x86 there are SSE and AVX extensions, NEON for ARM, AltiVec for PowerPC...
In many cases the compiler is able to generate very efficient code that utilize the SIMD extension without any knowledge of the programmer (just by setting compiler flags).
There are also many cases where the compiler can't generate efficient code (many reasons for that).
When you need to get very high performance, it's recommended to implement it using C intrinsic functions.
Most of intrinsic instructions are converted directly to assembly instructions (instruction to instruction), without the need to know assembly.
There are many downsides of using intrinsic (compared to generic C implementation): Implementation is complicated to code and to maintain, and the code is platform specific and not portable.
A good reference for x86 intrinsics is Intel Intrinsics Guide.
The posted code uses SSE instruction set extension.
The implementation is very efficient, but not the top performance (using AVX2 for example may be faster, but less portable).
For better efficiency my code uses fixed point implementation.
In many cases fixed point is more efficient than floating point (but more difficult).
The most complicated part of the specific algorithm is reordering the RGB elements.
When RGB elements are ordered in triples r,g,b,r,g,b,r,g,b... you need to reorder them to rrrr... gggg... bbbb... in order of utilizing SIMD.
Naming conventions:
Don't be scared by the long weird variable names.
I am using this weird naming convention (it's my convention), because it helps me follow the code.
r7_r6_r5_r4_r3_r2_r1_r0
for example marks an XMM register with 8 uint16
elements.
The following implementation includes code with and without SSE intrinsics:
//Optimized implementation (use SSE intrinsics):
//----------------------------------------------
#include <intrin.h>
//Convert from RGBRGBRGB... to RRR..., GGG..., BBB...
//Input: Two XMM registers (24 uint8 elements) ordered RGBRGB...
//Output: Three XMM registers ordered RRR..., GGG... and BBB...
// Unpack the result from uint8 elements to uint16 elements.
static __inline void GatherRGBx8(const __m128i r5_b4_g4_r4_b3_g3_r3_b2_g2_r2_b1_g1_r1_b0_g0_r0,
const __m128i b7_g7_r7_b6_g6_r6_b5_g5,
__m128i &r7_r6_r5_r4_r3_r2_r1_r0,
__m128i &g7_g6_g5_g4_g3_g2_g1_g0,
__m128i &b7_b6_b5_b4_b3_b2_b1_b0)
{
//Shuffle mask for gathering 4 R elements, 4 G elements and 4 B elements (also set last 4 elements to duplication of first 4 elements).
const __m128i shuffle_mask = _mm_set_epi8(9,6,3,0, 11,8,5,2, 10,7,4,1, 9,6,3,0);
__m128i b7_g7_r7_b6_g6_r6_b5_g5_r5_b4_g4_r4 = _mm_alignr_epi8(b7_g7_r7_b6_g6_r6_b5_g5, r5_b4_g4_r4_b3_g3_r3_b2_g2_r2_b1_g1_r1_b0_g0_r0, 12);
//Gather 4 R elements, 4 G elements and 4 B elements.
//Remark: As I recall _mm_shuffle_epi8 instruction is not so efficient (I think execution is about 5 times longer than other shuffle instructions).
__m128i r3_r2_r1_r0_b3_b2_b1_b0_g3_g2_g1_g0_r3_r2_r1_r0 = _mm_shuffle_epi8(r5_b4_g4_r4_b3_g3_r3_b2_g2_r2_b1_g1_r1_b0_g0_r0, shuffle_mask);
__m128i r7_r6_r5_r4_b7_b6_b5_b4_g7_g6_g5_g4_r7_r6_r5_r4 = _mm_shuffle_epi8(b7_g7_r7_b6_g6_r6_b5_g5_r5_b4_g4_r4, shuffle_mask);
//Put 8 R elements in lower part.
__m128i b7_b6_b5_b4_g7_g6_g5_g4_r7_r6_r5_r4_r3_r2_r1_r0 = _mm_alignr_epi8(r7_r6_r5_r4_b7_b6_b5_b4_g7_g6_g5_g4_r7_r6_r5_r4, r3_r2_r1_r0_b3_b2_b1_b0_g3_g2_g1_g0_r3_r2_r1_r0, 12);
//Put 8 G elements in lower part.
__m128i g3_g2_g1_g0_r3_r2_r1_r0_zz_zz_zz_zz_zz_zz_zz_zz = _mm_slli_si128(r3_r2_r1_r0_b3_b2_b1_b0_g3_g2_g1_g0_r3_r2_r1_r0, 8);
__m128i zz_zz_zz_zz_r7_r6_r5_r4_b7_b6_b5_b4_g7_g6_g5_g4 = _mm_srli_si128(r7_r6_r5_r4_b7_b6_b5_b4_g7_g6_g5_g4_r7_r6_r5_r4, 4);
__m128i r7_r6_r5_r4_b7_b6_b5_b4_g7_g6_g5_g4_g3_g2_g1_g0 = _mm_alignr_epi8(zz_zz_zz_zz_r7_r6_r5_r4_b7_b6_b5_b4_g7_g6_g5_g4, g3_g2_g1_g0_r3_r2_r1_r0_zz_zz_zz_zz_zz_zz_zz_zz, 12);
//Put 8 B elements in lower part.
__m128i b3_b2_b1_b0_g3_g2_g1_g0_r3_r2_r1_r0_zz_zz_zz_zz = _mm_slli_si128(r3_r2_r1_r0_b3_b2_b1_b0_g3_g2_g1_g0_r3_r2_r1_r0, 4);
__m128i zz_zz_zz_zz_zz_zz_zz_zz_r7_r6_r5_r4_b7_b6_b5_b4 = _mm_srli_si128(r7_r6_r5_r4_b7_b6_b5_b4_g7_g6_g5_g4_r7_r6_r5_r4, 8);
__m128i zz_zz_zz_zz_r7_r6_r5_r4_b7_b6_b5_b4_b3_b2_b1_b0 = _mm_alignr_epi8(zz_zz_zz_zz_zz_zz_zz_zz_r7_r6_r5_r4_b7_b6_b5_b4, b3_b2_b1_b0_g3_g2_g1_g0_r3_r2_r1_r0_zz_zz_zz_zz, 12);
//Unpack uint8 elements to uint16 elements.
r7_r6_r5_r4_r3_r2_r1_r0 = _mm_cvtepu8_epi16(b7_b6_b5_b4_g7_g6_g5_g4_r7_r6_r5_r4_r3_r2_r1_r0);
g7_g6_g5_g4_g3_g2_g1_g0 = _mm_cvtepu8_epi16(r7_r6_r5_r4_b7_b6_b5_b4_g7_g6_g5_g4_g3_g2_g1_g0);
b7_b6_b5_b4_b3_b2_b1_b0 = _mm_cvtepu8_epi16(zz_zz_zz_zz_r7_r6_r5_r4_b7_b6_b5_b4_b3_b2_b1_b0);
}
//Calculate 8 Grayscale elements from 8 RGB elements.
//Y = 0.2989*R + 0.5870*G + 0.1140*B
//Conversion model used by MATLAB https://www.mathworks.com/help/matlab/ref/rgb2gray.html
static __inline __m128i Rgb2Yx8(__m128i r7_r6_r5_r4_r3_r2_r1_r0,
__m128i g7_g6_g5_g4_g3_g2_g1_g0,
__m128i b7_b6_b5_b4_b3_b2_b1_b0)
{
//Each coefficient is expanded by 2^15, and rounded to int16 (add 0.5 for rounding).
const __m128i r_coef = _mm_set1_epi16((short)(0.2989*32768.0 + 0.5)); //8 coefficients - R scale factor.
const __m128i g_coef = _mm_set1_epi16((short)(0.5870*32768.0 + 0.5)); //8 coefficients - G scale factor.
const __m128i b_coef = _mm_set1_epi16((short)(0.1140*32768.0 + 0.5)); //8 coefficients - B scale factor.
//Multiply input elements by 64 for improved accuracy.
r7_r6_r5_r4_r3_r2_r1_r0 = _mm_slli_epi16(r7_r6_r5_r4_r3_r2_r1_r0, 6);
g7_g6_g5_g4_g3_g2_g1_g0 = _mm_slli_epi16(g7_g6_g5_g4_g3_g2_g1_g0, 6);
b7_b6_b5_b4_b3_b2_b1_b0 = _mm_slli_epi16(b7_b6_b5_b4_b3_b2_b1_b0, 6);
//Use the special intrinsic _mm_mulhrs_epi16 that calculates round(r*r_coef/2^15).
//Calculate Y = 0.2989*R + 0.5870*G + 0.1140*B (use fixed point computations)
__m128i y7_y6_y5_y4_y3_y2_y1_y0 = _mm_add_epi16(_mm_add_epi16(
_mm_mulhrs_epi16(r7_r6_r5_r4_r3_r2_r1_r0, r_coef),
_mm_mulhrs_epi16(g7_g6_g5_g4_g3_g2_g1_g0, g_coef)),
_mm_mulhrs_epi16(b7_b6_b5_b4_b3_b2_b1_b0, b_coef));
//Divide result by 64.
y7_y6_y5_y4_y3_y2_y1_y0 = _mm_srli_epi16(y7_y6_y5_y4_y3_y2_y1_y0, 6);
return y7_y6_y5_y4_y3_y2_y1_y0;
}
//Convert single row from RGB to Grayscale (use SSE intrinsics).
//I0 points source row, and J0 points destination row.
//I0 -> rgbrgbrgbrgbrgbrgb...
//J0 -> yyyyyy
static void Rgb2GraySingleRow_useSSE(const unsigned char I0[],
const int image_width,
unsigned char J0[])
{
int x; //Index in J0.
int srcx; //Index in I0.
__m128i r7_r6_r5_r4_r3_r2_r1_r0;
__m128i g7_g6_g5_g4_g3_g2_g1_g0;
__m128i b7_b6_b5_b4_b3_b2_b1_b0;
srcx = 0;
//Process 8 pixels per iteration.
for (x = 0; x < image_width; x += 8)
{
//Load 8 elements of each color channel R,G,B from first row.
__m128i r5_b4_g4_r4_b3_g3_r3_b2_g2_r2_b1_g1_r1_b0_g0_r0 = _mm_loadu_si128((__m128i*)&I0[srcx]); //Unaligned load of 16 uint8 elements
__m128i b7_g7_r7_b6_g6_r6_b5_g5 = _mm_loadu_si128((__m128i*)&I0[srcx+16]); //Unaligned load of (only) 8 uint8 elements (lower half of XMM register).
//Separate RGB, and put together R elements, G elements and B elements (together in same XMM register).
//Result is also unpacked from uint8 to uint16 elements.
GatherRGBx8(r5_b4_g4_r4_b3_g3_r3_b2_g2_r2_b1_g1_r1_b0_g0_r0,
b7_g7_r7_b6_g6_r6_b5_g5,
r7_r6_r5_r4_r3_r2_r1_r0,
g7_g6_g5_g4_g3_g2_g1_g0,
b7_b6_b5_b4_b3_b2_b1_b0);
//Calculate 8 Y elements.
__m128i y7_y6_y5_y4_y3_y2_y1_y0 = Rgb2Yx8(r7_r6_r5_r4_r3_r2_r1_r0,
g7_g6_g5_g4_g3_g2_g1_g0,
b7_b6_b5_b4_b3_b2_b1_b0);
//Pack uint16 elements to 16 uint8 elements (put result in single XMM register). Only lower 8 uint8 elements are relevant.
__m128i j7_j6_j5_j4_j3_j2_j1_j0 = _mm_packus_epi16(y7_y6_y5_y4_y3_y2_y1_y0, y7_y6_y5_y4_y3_y2_y1_y0);
//Store 8 elements of Y in row Y0, and 8 elements of Y in row Y1.
_mm_storel_epi64((__m128i*)&J0[x], j7_j6_j5_j4_j3_j2_j1_j0);
srcx += 24; //Advance 24 source bytes per iteration.
}
}
//Convert image I from pixel ordered RGB to Grayscale format.
//Conversion formula: Y = 0.2989*R + 0.5870*G + 0.1140*B (Rec.ITU-R BT.601)
//Formula is based on MATLAB rgb2gray function: https://www.mathworks.com/help/matlab/ref/rgb2gray.html
//Implementation uses SSE intrinsics for performance optimization.
//Use fixed point computations for better performance.
//I - Input image in pixel ordered RGB format.
//image_width - Number of columns of I.
//image_height - Number of rows of I.
//J - Destination "image" in Grayscale format.
//I is pixel ordered RGB color format (size in bytes is image_width*image_height*3):
//RGBRGBRGBRGBRGBRGB
//RGBRGBRGBRGBRGBRGB
//RGBRGBRGBRGBRGBRGB
//RGBRGBRGBRGBRGBRGB
//
//J is in Grayscale format (size in bytes is image_width*image_height):
//YYYYYY
//YYYYYY
//YYYYYY
//YYYYYY
//
//Limitations:
//1. image_width must be a multiple of 8.
//2. I and J must be two separate arrays (in place computation is not supported).
//3. Rows of I and J are continues in memory (bytes stride is not supported, [but simple to add]).
//
//Comments:
//1. The conversion formula is incorrect, but it's a commonly used approximation.
//2. Code uses SSE 4.1 instruction set.
// Better performance can be archived using AVX2 implementation.
// (AVX2 is supported by Intel Core 4'th generation and above, and new AMD processors).
//3. The code is not the best SSE optimization:
// Uses unaligned load and store operations.
// Utilize only half XMM register in few cases.
// Instruction selection is probably sub-optimal.
void Rgb2Gray_useSSE(const unsigned char I[],
const int image_width,
const int image_height,
unsigned char J[])
{
//I0 points source image row.
const unsigned char *I0; //I0 -> rgbrgbrgbrgbrgbrgb...
//J0 points destination image row.
unsigned char *J0; //J0 -> YYYYYY
int y; //Row index
//Process one row per iteration.
for (y = 0; y < image_height; y ++)
{
I0 = &I[y*image_width*3]; //Input row width is image_width*3 bytes (each pixel is R,G,B).
J0 = &J[y*image_width]; //Output Y row width is image_width bytes (one Y element per pixel).
//Convert row I0 from RGB to Grayscale.
Rgb2GraySingleRow_useSSE(I0,
image_width,
J0);
}
}
//Convert single row from RGB to Grayscale (simple C code without intrinsics).
static void Rgb2GraySingleRow_Simple(const unsigned char I0[],
const int image_width,
unsigned char J0[])
{
int x; //index in J0.
int srcx; //Index in I0.
srcx = 0;
//Process 1 pixel per iteration.
for (x = 0; x < image_width; x++)
{
float r = (float)I0[srcx]; //Load red pixel and convert to float
float g = (float)I0[srcx+1]; //Green
float b = (float)I0[srcx+2]; //Blue
float gray = 0.2989f*r + 0.5870f*g + 0.1140f*b; //Convert to Grayscale (use BT.601 conversion coefficients).
J0[x] = (unsigned char)(gray + 0.5f); //Add 0.5 for rounding.
srcx += 3; //Advance 3 source bytes per iteration.
}
}
//Convert RGB to Grayscale using simple C code (without SIMD intrinsics).
//Use as reference (for time measurements).
void Rgb2Gray_Simple(const unsigned char I[],
const int image_width,
const int image_height,
unsigned char J[])
{
//I0 points source image row.
const unsigned char *I0; //I0 -> rgbrgbrgbrgbrgbrgb...
//J0 points destination image row.
unsigned char *J0; //J0 -> YYYYYY
int y; //Row index
//Process one row per iteration.
for (y = 0; y < image_height; y ++)
{
I0 = &I[y*image_width*3]; //Input row width is image_width*3 bytes (each pixel is R,G,B).
J0 = &J[y*image_width]; //Output Y row width is image_width bytes (one Y element per pixel).
//Convert row I0 from RGB to Grayscale.
Rgb2GraySingleRow_Simple(I0,
image_width,
J0);
}
}
In my machine, the manually optimized code is about 3 times faster.
Upvotes: 4
Reputation: 147
You can find medium value between RGB channels, then, assign the result to each channel.
uint8_t* pixel = source;
for (int i = 0; i < sourceInfo.height; ++i) {
for (int j = 0; j < sourceInfo.width; ++j, pixel += pixelSize) {
float grayscaleValue = 0;
for (int k = 0; k < 3; k++) {
grayscaleValue += pixel[k];
}
grayscaleValue /= 3;
for (int k = 0; k < 3; k++) {
pixel[k] = grayscaleValue;
}
}
}
Upvotes: 0