Reputation: 233
In C#, I'm storing RGB image data in a byte[] array ([r, g, b, r, g, b, ...]) and am attempting to convert it to grayscale. I'm implementing this grayscale conversion both in C# (using pointers) and in C++ (using SIMD instructions and P/Invoke) to compare performance gains when using C++ in C#.
The C# code works correctly and saves the image without issues, but when I use the C++ version, the saved grayscale image appears as random noise. Here is my main C# code:
static void Main(string[] args)
{
DllLoader.LoadLibrary("ImageProcessingLib.dll");
double totalElapsedMicrosecondsCpp = 0;
double totalElapsedMicrosecondsCS = 0;
// Load your image
Bitmap bitmap = new Bitmap("nature.jpeg");
// Convert the image to byte array
byte[] rgbBytes = ConvertBitmapToByteArray(bitmap);
byte[] rgbBytesCpp = ConvertBitmapToByteArray(bitmap);
int runs = 2;
for (int i = 0; i < runs; i++)
{
Stopwatch sw = Stopwatch.StartNew();
// Call the P/Invoke function for C++ implementation
fixed (byte* ptr = rgbBytesCpp)
{
DllLoader.ConvertRgbToGrayscale(ptr, rgbBytesCpp.Length);
}
sw.Stop();
totalElapsedMicrosecondsCpp += sw.Elapsed.TotalMilliseconds * 1000;
}
for (int i = 0; i < runs; i++)
{
Stopwatch sw = Stopwatch.StartNew();
// C# grayscale function
ConvertRgbToGrayscale(rgbBytes);
sw.Stop();
totalElapsedMicrosecondsCS += sw.Elapsed.TotalMilliseconds * 1000;
}
double averageElapsedMicrosecondsPInvoke = totalElapsedMicrosecondsCpp / runs;
double averageElapsedMicrosecondsCSharp = totalElapsedMicrosecondsCS / runs;
Console.WriteLine("Average P/Invoke Grayscale Time: {0} microseconds", averageElapsedMicrosecondsPInvoke);
Console.WriteLine("Average Native C# Grayscale Time: {0} microseconds", averageElapsedMicrosecondsCSharp);
SaveGrayscaleImage(rgbBytesCpp, bitmap.Width, bitmap.Height, "Cpp.jpg");
SaveGrayscaleImage(rgbBytes, bitmap.Width, bitmap.Height, "C#.jpg");
Console.ReadLine();
}
public unsafe class DllLoader
{
// Static constructor to load the DLL without invoking any functions from it
static DllLoader()
{
LoadLibrary("ImageProcessingLib.dll");
}
[DllImport("kernel32.dll", CharSet = CharSet.Auto)]
public static extern IntPtr LoadLibrary(string lpFileName);
// P/Invoke to call the C++ ConvertRgbToGrayscale function
[DllImport("ImageProcessingLib.dll", CallingConvention = CallingConvention.Cdecl)]
public static extern byte* ConvertRgbToGrayscale(byte* pImage, int length);
}
I used both SIMD and non-SIMD approaches in my C++ function, but the SIMD approach causes memory issues. Here’s the SIMD code:
#include <immintrin.h>
#include <cstdint>
extern "C" __declspec(dllexport) void ConvertRgbToGrayscaleSIMD(uint8_t* rgbArray, size_t length) {
// Ensure the array is aligned to 32-byte boundary (for AVX)
//__m256i* alignedArray = reinterpret_cast<__m256i*>(_aligned_malloc(length, 32));
// Copy data to aligned memory
//memcpy(alignedArray, rgbArray, length);
// Grayscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
const uint8_t coeffR = 77; // 0.3 * 256 ≈ 77
const uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
const uint8_t coeffB = 29; // 0.11 * 256 ≈ 29
// Load the grayscale coefficients into AVX registers (broadcast to 8 elements)
__m256i coeff_r = _mm256_set1_epi8(coeffR);
__m256i coeff_g = _mm256_set1_epi8(coeffG);
__m256i coeff_b = _mm256_set1_epi8(coeffB);
size_t i = 0;
// Process 8 pixels (24 bytes) at once
for (; i + 23 < length; i += 24) { // 8 pixels (24 bytes) per loop
// Load 24 bytes (8 pixels, RGBRGBRGB...)
__m256i rgb1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(rgbArray + i));
// Extract the R, G, B channels
__m256i r = _mm256_and_si256(rgb1, _mm256_set1_epi8(0xFF)); // R channel (bytes 0, 3, 6, 9, 12, 15, 18, 21)
__m256i g = _mm256_and_si256(_mm256_srli_epi32(rgb1, 8), _mm256_set1_epi8(0xFF)); // G channel (bytes 1, 4, 7, 10, 13, 16, 19, 22)
__m256i b = _mm256_and_si256(_mm256_srli_epi32(rgb1, 16), _mm256_set1_epi8(0xFF)); // B channel (bytes 2, 5, 8, 11, 14, 17, 20, 23)
// Calculate grayscale
__m256i gray_r = _mm256_mullo_epi16(r, coeff_r); // R * coeffR
__m256i gray_g = _mm256_mullo_epi16(g, coeff_g); // G * coeffG
__m256i gray_b = _mm256_mullo_epi16(b, coeff_b); // B * coeffB
// Add the values (R * coeffR + G * coeffG + B * coeffB)
__m256i gray = _mm256_add_epi8(
_mm256_add_epi8(gray_r, gray_g),
gray_b
);
// Right shift by 8 to normalize the grayscale values
gray = _mm256_srli_epi16(gray, 8);
// Duplicate grayscale values to R, G, B channels
__m256i gray_rgb = _mm256_packus_epi16(gray, gray);
// Store the resulting grayscale values back into the rgbArray
_mm256_storeu_si256(reinterpret_cast<__m256i*>(rgbArray + i), gray_rgb);
}
// Handle any leftover pixels that don't fit into full 8-pixel chunks
for (; i + 2 < length; i += 3) {
uint8_t r = rgbArray[i];
uint8_t g = rgbArray[i + 1];
uint8_t b = rgbArray[i + 2];
uint8_t gray = static_cast<uint8_t>((coeffR * r + coeffG * g + coeffB * b) >> 8);
rgbArray[i] = gray;
rgbArray[i + 1] = gray;
rgbArray[i + 2] = gray;
}
// Handle any leftover pixels that don't fit into full RGB triplets (i.e., length % 3 != 0)
size_t remainder = length % 3;
if (remainder > 0) {
for (size_t j = length - remainder; j < length; ++j) {
rgbArray[j] = rgbArray[j]; // No change
}
}
//memcpy(rgbArray, alignedArray, length);
//_aligned_free(alignedArray);
}
When I uncomment the aligned memory lines (_aligned_malloc and memcpy), the output image is correct, but it significantly slows down performance. I’d like to avoid this memory alignment while still using SIMD for better performance.
I am on .net framework 4.8 and my current performance results:
4k image RGB to grayscale conversion
C#: 18 ms (Working)
C++ P/Invoke Non SIMD : 13 ms (Working)
C++ P/Invoke SIMD : 7 ms (Random Noise Problem)
Question: Is there a way to perform SIMD grayscale conversion on this byte[] without needing aligned memory? Or, is there another efficient way to handle this that avoids the noise issue while maintaining performance?
Upvotes: 1
Views: 207
Reputation: 321
This is not a memory alignment problem, but an error in the SIMD algorithm.
For 24-bit to 8-bit grayscale conversion, this approach can be used: read 3 vectors at a time from the source bitmap, perform a 3-element group deinterleave operation, and obtain the R,G,B plane data. Subsequently, vectorized multiplication and addition are used to calculate the grayscale values. Finally, the vector that stores the grayscale values is interleaved with 3-element groups to be stored in the destination bitmap.
For example, the SSE instruction set uses 128-bit vector, where 1 vector is 16 bytes. Reading 3 vectors at a time from the source bitmap is reading 48 bytes, or 16 RGB pixels.
When running on my computer (AMD Ryzen 7 7840H), for a 4096x4096 bitmap, the scalar algorithm takes 17ms, while this vector algorithm takes only 3ms.
For deinterleaving 3-element groups, this can be done using instructions of the shuffle category. For example, for a 128 bit vector in the X86 architecture, you can use the _mm_shuffle_epi8 instruction in SSSE3, which corresponds to the Ssse3.Shuffle
method in .NET. The source code is as follows.
static readonly Vector128<byte> YGroup3Unzip_Shuffle_Byte_X_Part0 = Vector128.Create((sbyte)0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1).AsByte();
static readonly Vector128<byte> YGroup3Unzip_Shuffle_Byte_X_Part1 = Vector128.Create((sbyte)-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1).AsByte();
static readonly Vector128<byte> YGroup3Unzip_Shuffle_Byte_X_Part2 = Vector128.Create((sbyte)-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13).AsByte();
static readonly Vector128<byte> YGroup3Unzip_Shuffle_Byte_Y_Part0 = Vector128.Create((sbyte)1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1).AsByte();
static readonly Vector128<byte> YGroup3Unzip_Shuffle_Byte_Y_Part1 = Vector128.Create((sbyte)-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1).AsByte();
static readonly Vector128<byte> YGroup3Unzip_Shuffle_Byte_Y_Part2 = Vector128.Create((sbyte)-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14).AsByte();
static readonly Vector128<byte> YGroup3Unzip_Shuffle_Byte_Z_Part0 = Vector128.Create((sbyte)2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1).AsByte();
static readonly Vector128<byte> YGroup3Unzip_Shuffle_Byte_Z_Part1 = Vector128.Create((sbyte)-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1).AsByte();
static readonly Vector128<byte> YGroup3Unzip_Shuffle_Byte_Z_Part2 = Vector128.Create((sbyte)-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15).AsByte();
public static Vector128<byte> YGroup3Unzip(Vector128<byte> data0, Vector128<byte> data1, Vector128<byte> data2, out Vector128<byte> y, out Vector128<byte> z) {
var f0A = YGroup3Unzip_Shuffle_Byte_X_Part0;
var f0B = YGroup3Unzip_Shuffle_Byte_X_Part1;
var f0C = YGroup3Unzip_Shuffle_Byte_X_Part2;
var f1A = YGroup3Unzip_Shuffle_Byte_Y_Part0;
var f1B = YGroup3Unzip_Shuffle_Byte_Y_Part1;
var f1C = YGroup3Unzip_Shuffle_Byte_Y_Part2;
var f2A = YGroup3Unzip_Shuffle_Byte_Z_Part0;
var f2B = YGroup3Unzip_Shuffle_Byte_Z_Part1;
var f2C = YGroup3Unzip_Shuffle_Byte_Z_Part2;
var rt0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(data0, f0A), Ssse3.Shuffle(data1, f0B)), Ssse3.Shuffle(data2, f0C));
var rt1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(data0, f1A), Ssse3.Shuffle(data1, f1B)), Ssse3.Shuffle(data2, f1C));
var rt2 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(data0, f2A), Ssse3.Shuffle(data1, f2B)), Ssse3.Shuffle(data2, f2C));
y = rt1;
z = rt2;
return rt0;
}
To make it easier to write vector algorithms across platforms, I have developed the VectorTraits library, which already integrates the above algorithms. The library provides the method Vectors.YGroup3Unzip method. The method is cross-platform and uses the shuffle instructions of each platform.
_mm256_shuffle_epi8
and other instructions.vqvtbl1q_u8
instructions.i8x16.swizzle
instructions.This library also provides an inverse operation: the YGroup3Zip method. It can interleave R,G,B planar data into packed RGB pixel data.
With the YGroup3Unzip and YGroup3Zip methods, it is easy to write algorithm for converting color to grayscale. The grayscale coefficients have 8-bit precision, so you need to widen the 8-bit data to 16-bit and then calculate the multiplication and addition. Finally, narrow the 16 bit data to 8 bits. The source code is as follows.
public static unsafe void UseVectorsDoBatch(byte* pSrc, int strideSrc, int width, int height, byte* pDst, int strideDst) {
const int cbPixel = 3; // Bgr24
const int shiftPoint = 8;
const int mulPoint = 1 << shiftPoint; // 0x100
const ushort mulRed = (ushort)(0.299 * mulPoint + 0.5); // 77
const ushort mulGreen = (ushort)(0.587 * mulPoint + 0.5); // 150
const ushort mulBlue = mulPoint - mulRed - mulGreen; // 29
Vector<ushort> vmulRed = new Vector<ushort>(mulRed);
Vector<ushort> vmulGreen = new Vector<ushort>(mulGreen);
Vector<ushort> vmulBlue = new Vector<ushort>(mulBlue);
int vectorWidth = Vector<byte>.Count;
int maxX = width - vectorWidth;
byte* pRow = pSrc;
byte* qRow = pDst;
for (int i = 0; i < height; i++) {
Vector<byte>* pLast = (Vector<byte>*)(pRow + maxX * cbPixel); // Bgr24
Vector<byte>* qLast = (Vector<byte>*)(qRow + maxX * cbPixel); // Bgr24 store grayscale.
Vector<byte>* p = (Vector<byte>*)pRow;
Vector<byte>* q = (Vector<byte>*)qRow;
for (; ; ) {
Vector<byte> r, g, b, gray, gray0, gray1, gray2;
Vector<ushort> wr0, wr1, wg0, wg1, wb0, wb1;
// Load.
b = Vectors.YGroup3Unzip(p[0], p[1], p[2], out g, out r);
// widen(r) * mulRed + widen(g) * mulGreen + widen(b) * mulBlue
Vector.Widen(r, out wr0, out wr1);
Vector.Widen(g, out wg0, out wg1);
Vector.Widen(b, out wb0, out wb1);
wr0 = Vectors.Multiply(wr0, vmulRed);
wr1 = Vectors.Multiply(wr1, vmulRed);
wg0 = Vectors.Multiply(wg0, vmulGreen);
wg1 = Vectors.Multiply(wg1, vmulGreen);
wb0 = Vectors.Multiply(wb0, vmulBlue);
wb1 = Vectors.Multiply(wb1, vmulBlue);
wr0 = Vector.Add(wr0, wg0);
wr1 = Vector.Add(wr1, wg1);
wr0 = Vector.Add(wr0, wb0);
wr1 = Vector.Add(wr1, wb1);
// Shift right and narrow.
wr0 = Vectors.ShiftRightLogical_Const(wr0, shiftPoint);
wr1 = Vectors.ShiftRightLogical_Const(wr1, shiftPoint);
gray = Vector.Narrow(wr0, wr1);
// Store.
gray0 = Vectors.YGroup3Zip(gray, gray, gray, out gray1, out gray2);
q[0] = gray0;
q[1] = gray1;
q[2] = gray2;
// Next.
if (p >= pLast) break;
p += cbPixel;
q += cbPixel;
if (p > pLast) p = pLast; // The last block is also use vector.
if (q > qLast) q = qLast;
}
pRow += strideSrc;
qRow += strideDst;
}
}
The Vectors.ShiftRightLogical_Const
in the source code above is a method provided by the VectorTraits library. It replaces the Vector.ShiftRightLogical
method that was new in .NET 7.0
, and allows earlier versions of .NET to use logical right shift.
The Vectors.Multiply
is also a method provided by the VectorTraits library. It avoids the problem that unsigned types are sometimes no hardware accelerated.
Then write benchmark code for the algorithm.
[Benchmark]
public void UseVectors() {
UseVectorsDo(_sourceBitmapData, _destinationBitmapData, 0);
}
[Benchmark]
public void UseVectorsParallel() {
UseVectorsDo(_sourceBitmapData, _destinationBitmapData, 1);
}
public static unsafe void UseVectorsDo(BitmapData src, BitmapData dst, int parallelFactor = 0) {
int vectorWidth = Vector<byte>.Count;
int width = src.Width;
int height = src.Height;
if (width <= vectorWidth) {
ScalarDo(src, dst, parallelFactor);
return;
}
int strideSrc = src.Stride;
int strideDst = dst.Stride;
byte* pSrc = (byte*)src.Scan0.ToPointer();
byte* pDst = (byte*)dst.Scan0.ToPointer();
int processorCount = Environment.ProcessorCount;
int batchSize = 0;
if (parallelFactor > 1) {
batchSize = height / (processorCount * parallelFactor);
} else if (parallelFactor == 1) {
if (height >= processorCount) batchSize = 1;
}
bool allowParallel = (batchSize > 0) && (processorCount > 1);
if (allowParallel) {
int batchCount = (height + batchSize - 1) / batchSize; // ceil((double)length / batchSize)
Parallel.For(0, batchCount, i => {
int start = batchSize * i;
int len = batchSize;
if (start + len > height) len = height - start;
byte* pSrc2 = pSrc + start * strideSrc;
byte* pDst2 = pDst + start * strideDst;
UseVectorsDoBatch(pSrc2, strideSrc, width, len, pDst2, strideDst);
});
} else {
UseVectorsDoBatch(pSrc, strideSrc, width, height, pDst, strideDst);
}
}
The results of the benchmark on the X86 architecture are as follows.
BenchmarkDotNet v0.14.0, Windows 11 (10.0.22631.4460/23H2/2023Update/SunValley3)
AMD Ryzen 7 7840H w/ Radeon 780M Graphics, 1 CPU, 16 logical and 8 physical cores
.NET SDK 8.0.403
[Host] : .NET 8.0.10 (8.0.1024.46610), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
DefaultJob : .NET 8.0.10 (8.0.1024.46610), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Width | Mean | Error | StdDev | Ratio |
|--------------------- |------ |-------------:|-----------:|-----------:|------:|
| Scalar | 1024 | 1,128.81 us | 4.436 us | 3.932 us | 1.00 |
| ScalarParallel | 1024 | 157.96 us | 1.007 us | 0.942 us | 0.14 |
| UseVectors | 1024 | 123.79 us | 1.144 us | 1.014 us | 0.11 |
| UseVectorsParallel | 1024 | 26.05 us | 0.503 us | 0.471 us | 0.02 |
| | | | | | |
| Scalar | 2048 | 4,279.99 us | 37.658 us | 35.226 us | 1.00 |
| ScalarParallel | 2048 | 622.01 us | 3.989 us | 3.537 us | 0.15 |
| UseVectors | 2048 | 631.53 us | 6.741 us | 6.305 us | 0.15 |
| UseVectorsParallel | 2048 | 330.47 us | 5.479 us | 4.857 us | 0.08 |
| | | | | | |
| Scalar | 4096 | 17,252.90 us | 106.215 us | 99.353 us | 1.00 |
| ScalarParallel | 4096 | 3,743.78 us | 25.989 us | 24.310 us | 0.22 |
| UseVectors | 4096 | 3,273.92 us | 32.645 us | 30.537 us | 0.19 |
| UseVectorsParallel | 4096 | 3,746.83 us | 11.083 us | 9.255 us | 0.22 |
The same source code can be run on the Arm architecture. The benchmark results are as follows.
BenchmarkDotNet v0.14.0, macOS Sequoia 15.0.1 (24A348) [Darwin 24.0.0]
Apple M2, 1 CPU, 8 logical and 8 physical cores
.NET SDK 8.0.204
[Host] : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD
DefaultJob : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD
| Method | Width | Mean | Error | StdDev | Median | Ratio | RatioSD |
|--------------------- |------ |-------------:|-----------:|-----------:|-------------:|------:|--------:|
| Scalar | 1024 | 719.32 us | 0.215 us | 0.201 us | 719.34 us | 1.00 | 0.00 |
| ScalarParallel | 1024 | 157.38 us | 1.423 us | 1.111 us | 157.25 us | 0.22 | 0.00 |
| UseVectors | 1024 | 169.25 us | 0.538 us | 0.503 us | 169.40 us | 0.24 | 0.00 |
| UseVectorsParallel | 1024 | 57.81 us | 0.998 us | 2.149 us | 58.11 us | 0.08 | 0.00 |
| | | | | | | | |
| Scalar | 2048 | 2,963.48 us | 6.674 us | 5.211 us | 2,961.39 us | 1.00 | 0.00 |
| ScalarParallel | 2048 | 627.47 us | 11.680 us | 25.142 us | 616.63 us | 0.21 | 0.01 |
| UseVectors | 2048 | 716.27 us | 2.097 us | 1.961 us | 717.02 us | 0.24 | 0.00 |
| UseVectorsParallel | 2048 | 368.49 us | 7.320 us | 21.469 us | 378.95 us | 0.12 | 0.01 |
| | | | | | | | |
| Scalar | 4096 | 12,449.32 us | 177.868 us | 157.676 us | 12,508.13 us | 1.00 | 0.02 |
| ScalarParallel | 4096 | 2,510.22 us | 34.541 us | 30.620 us | 2,501.37 us | 0.20 | 0.00 |
| UseVectors | 4096 | 2,968.72 us | 20.503 us | 18.175 us | 2,965.71 us | 0.24 | 0.00 |
| UseVectorsParallel | 4096 | 1,728.46 us | 4.362 us | 4.080 us | 1,729.00 us | 0.14 | 0.00 |
The same source code can also be run on the .NET Framework
. The benchmark results are as follows.
BenchmarkDotNet v0.14.0, Windows 11 (10.0.22631.4460/23H2/2023Update/SunValley3)
AMD Ryzen 7 7840H w/ Radeon 780M Graphics, 1 CPU, 16 logical and 8 physical cores
[Host] : .NET Framework 4.8.1 (4.8.9282.0), X64 RyuJIT VectorSize=256
DefaultJob : .NET Framework 4.8.1 (4.8.9282.0), X64 RyuJIT VectorSize=256
| Method | Width | Mean | Error | StdDev | Ratio | RatioSD | Code Size |
|--------------------- |------ |------------:|----------:|----------:|------:|--------:|----------:|
| Scalar | 1024 | 1,144.3 us | 6.87 us | 6.43 us | 1.00 | 0.01 | 2,813 B |
| ScalarParallel | 1024 | 188.0 us | 2.84 us | 2.65 us | 0.16 | 0.00 | 2,816 B |
| UseVectors | 1024 | 3,761.0 us | 44.63 us | 41.75 us | 3.29 | 0.04 | NA |
| UseVectorsParallel | 1024 | 510.2 us | 7.41 us | 6.93 us | 0.45 | 0.01 | NA |
| | | | | | | | |
| Scalar | 2048 | 4,572.6 us | 16.74 us | 14.84 us | 1.00 | 0.00 | 2,813 B |
| ScalarParallel | 2048 | 704.0 us | 8.79 us | 8.22 us | 0.15 | 0.00 | 2,816 B |
| UseVectors | 2048 | 14,765.7 us | 168.90 us | 157.99 us | 3.23 | 0.03 | NA |
| UseVectorsParallel | 2048 | 1,946.6 us | 38.41 us | 39.44 us | 0.43 | 0.01 | NA |
| | | | | | | | |
| Scalar | 4096 | 18,254.0 us | 122.53 us | 114.61 us | 1.00 | 0.01 | 2,813 B |
| ScalarParallel | 4096 | 3,726.5 us | 25.17 us | 23.54 us | 0.20 | 0.00 | 2,816 B |
| UseVectors | 4096 | 59,189.0 us | 931.28 us | 871.12 us | 3.24 | 0.05 | NA |
| UseVectorsParallel | 4096 | 7,127.7 us | 138.79 us | 136.31 us | 0.39 | 0.01 | NA |
Due to the lack of intrinsic functions in the .NET Framework
, some functions can only be fall back to scalar algorithms. Its performance is inferior to scalar algorithms implemented with pointers
If you can only use .NET Framework
, maybe Parallel is better for you.
The full source code at Bgr24ToGrayBgr24Benchmark.cs
Upvotes: 1
Reputation: 21956
Your C++ SIMD implementation is completely wrong.
It’s relatively hard to efficiently process RGB24 pixels because all CPU registers have power of 2 size in bytes, i.e. when loading and storing data from memory, a register contains incomplete count of pixels.
For the same reason, no modern graphics libraries and hardware APIs support 3 bytes/pixel formats, instead they zero-pad each RGB pixel into 4 bytes.
Anyway, try the following version, it should hopefully do what you need. It assumes you’re building your C++ codes with VC++, other compilers don’t provide intrinsics for rep movsb
and rep stosb
instructions.
#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>
namespace
{
static const __m128i s_unpackTriplets = _mm_setr_epi8(
0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 );
// Load 24 bytes from memory, zero extending triplets from RGB into RGBA
// The alpha bytes will be zeros
inline __m256i loadRgb8( const uint8_t* rsi )
{
// Load 24 bytes into 2 SSE vectors, 16 and 8 bytes respectively
const __m128i low = _mm_loadu_si128( ( const __m128i* )rsi );
__m128i high = _mm_loadu_si64( rsi + 16 );
// Make the high vector contain exactly 4 triplets = 12 bytes
high = _mm_alignr_epi8( high, low, 12 );
// Combine into AVX2 vector
__m256i res = _mm256_setr_m128i( low, high );
// Hope the compiler inlines this function, and moves the vbroadcasti128 outside of the loop
const __m256i perm = _mm256_broadcastsi128_si256( s_unpackTriplets );
// Unpack RGB24 into RGB32
return _mm256_shuffle_epi8( res, perm );
}
// Greyscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
constexpr uint8_t coeffR = 77; // 0.3 * 256 ≈ 77
constexpr uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
constexpr uint8_t coeffB = 29; // 0.11 * 256 ≈ 29
// Compute vector of int32 lanes with r*coeffR + g*coeffG + b*coeffB
inline __m256i makeGreyscale( __m256i rgba )
{
const __m256i lowBytesMask = _mm256_set1_epi32( 0x00FF00FF );
__m256i rb = _mm256_and_si256( rgba, lowBytesMask );
__m256i g = _mm256_and_si256( _mm256_srli_epi16( rgba, 8 ), lowBytesMask );
// Scale red and blue channels, then add pairwise into int32 lanes
constexpr int mulRbScalar = ( ( (int)coeffB ) << 16 ) | coeffR;
const __m256i mulRb = _mm256_set1_epi32( mulRbScalar );
rb = _mm256_madd_epi16( rb, mulRb );
// Scale green channel
const __m256i mulGreen = _mm256_set1_epi32( coeffG );
g = _mm256_mullo_epi16( g, mulGreen );
// Compute the result in 32-bit lanes
return _mm256_add_epi32( rb, g );
}
static const __m256i s_packTriplets = _mm256_setr_epi8(
// Low half of the vector: e0 e0 e0 e1 e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0
1, 1, 1, 5, 5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1,
// High half of the vector: e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0 e0 e0 e0 e1
5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1, 1, 1, 1, 5 );
// Extract second byte from each int32 lane, triplicate these bytes, and store 24 bytes to memory
inline void storeRgb8( uint8_t* rdi, __m256i gs )
{
// Move bytes within 16 byte lanes
gs = _mm256_shuffle_epi8( gs, s_packTriplets );
// Split vector into halves
__m128i low = _mm256_castsi256_si128( gs );
const __m128i high = _mm256_extracti128_si256( gs, 1 );
// Insert high 4 bytes from high into low
low = _mm_blend_epi32( low, high, 0b1000 );
// Store 24 RGB bytes
_mm_storeu_si128( ( __m128i* )rdi, low );
_mm_storeu_si64( rdi + 16, high );
}
inline void computeGreyscale8( uint8_t* ptr )
{
__m256i v = loadRgb8( ptr );
v = makeGreyscale( v );
storeRgb8( ptr, v );
}
}
void ConvertRgbToGrayscaleSIMD( uint8_t* ptr, size_t length )
{
const size_t rem = length % 24;
uint8_t* const endAligned = ptr + ( length - rem );
for( ; ptr < endAligned; ptr += 24 )
computeGreyscale8( ptr );
if( rem != 0 )
{
// An easy way to handle remainder is using a local buffer of 24 bytes, reusing the implementation
// Unlike memcpy / memset which are function calls and are subject to ABI conventions,
// __movsb / __stosb don't destroy data in vector registers
uint8_t remSpan[ 24 ];
__movsb( remSpan, ptr, rem );
__stosb( &remSpan[ rem ], 0, 24 - rem );
computeGreyscale8( remSpan );
__movsb( ptr, remSpan, rem );
}
}
Upvotes: 2