Reputation: 1124
I have a function which downscales an 8-bit image by a factor of two. I have previously optimised the rgb32 case with SSE. Now I would like to do the same for the gray8 case.
At the core, there is a function taking two lines of pixel data, which works like this:
/**
* Calculates the average of two rows of gray8 pixels by averaging four pixels.
*/
void average2Rows(const uint8_t* row1, const uint8_t* row2, uint8_t* dst, int size)
{
    for (int i = 0; i < size - 1; i += 2)
        *(dst++) = ((row1[i] + row1[i+1] + row2[i] + row2[i+1]) / 4) & 0xFF;
}
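For context, a whole-image downscale calls this once per pair of source rows. A minimal driver sketch (the name downscaleGray8 and the stride parameters are hypothetical, not part of my actual code):
void downscaleGray8(const uint8_t* src, int srcStride,
                    uint8_t* dst, int dstStride,
                    int width, int height)
{
    for (int y = 0; y + 1 < height; y += 2)
        average2Rows(src + (size_t)y * srcStride,        // even source row
                     src + (size_t)(y + 1) * srcStride,  // odd source row
                     dst + (size_t)(y / 2) * dstStride,  // destination row
                     width);
}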
Now, I have come up with an SSE variant which is about three times faster, but it does involve a lot of shuffling and I think one might do better. Does anybody see what can be optimised here?
/* row1: 16 8-bit values A-P
 * row2: 16 8-bit values a-p
 * returns 8 16-bit values: (A+B+a+b)/4, (C+D+c+d)/4, ..., (O+P+o+p)/4
 */
__m128i avg16Bytes(const __m128i& row1, const __m128i& row2)
{
    static const __m128i zero = _mm_setzero_si128();

    __m128i ABCDEFGHIJKLMNOP = _mm_avg_epu8(row1, row2);

    __m128i ABCDEFGH = _mm_unpacklo_epi8(ABCDEFGHIJKLMNOP, zero);
    __m128i IJKLMNOP = _mm_unpackhi_epi8(ABCDEFGHIJKLMNOP, zero);

    __m128i AIBJCKDL = _mm_unpacklo_epi16(ABCDEFGH, IJKLMNOP);
    __m128i EMFNGOHP = _mm_unpackhi_epi16(ABCDEFGH, IJKLMNOP);

    __m128i AEIMBFJN = _mm_unpacklo_epi16(AIBJCKDL, EMFNGOHP);
    __m128i CGKODHLP = _mm_unpackhi_epi16(AIBJCKDL, EMFNGOHP);

    __m128i ACEGIKMO = _mm_unpacklo_epi16(AEIMBFJN, CGKODHLP);
    __m128i BDFHJLNP = _mm_unpackhi_epi16(AEIMBFJN, CGKODHLP);

    return _mm_avg_epu8(ACEGIKMO, BDFHJLNP);
}
/*
* Calculates the average of two rows of gray8 pixels by averaging four pixels.
*/
void average2Rows(const uint8_t* src1, const uint8_t* src2, uint8_t* dst, int size)
{
    for (int i = 0; i < size - 31; i += 32)
    {
        __m128i tl = _mm_loadu_si128((__m128i const*)(src1 + i));
        __m128i tr = _mm_loadu_si128((__m128i const*)(src1 + i + 16));
        __m128i bl = _mm_loadu_si128((__m128i const*)(src2 + i));
        __m128i br = _mm_loadu_si128((__m128i const*)(src2 + i + 16));

        __m128i l_avg = avg16Bytes(tl, bl);
        __m128i r_avg = avg16Bytes(tr, br);

        _mm_storeu_si128((__m128i *)(dst + (i / 2)), _mm_packus_epi16(l_avg, r_avg));
    }
}
EDIT: There is now a GitHub repository implementing the answers to this question. The fastest solution was provided by user Peter Cordes. See his answer below for details:
__m128i avg16Bytes(const __m128i& row1, const __m128i& row2)
{
    // Average row1 and row2 vertically:
    __m128i avg = _mm_avg_epu8(row1, row2);

    // Unpack and horizontal add:
    avg = _mm_maddubs_epi16(avg, _mm_set1_epi8(1));

    // Divide by 2:
    return _mm_srli_epi16(avg, 1);
}
It works like my original implementation, calculating ((a+b)/2 + (c+d)/2) / 2 rather than (a+b+c+d)/4, so it has the same off-by-one rounding error.
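To make the off-by-one concrete, here is a small scalar sketch (my illustration, not from the repository) that mimics the two rounding steps for one group of four pixels:
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint8_t a = 0, b = 0;   // one pavgb pair
    uint8_t c = 3, d = 0;   // the neighbouring pavgb pair

    uint8_t exact   = (a + b + c + d) / 4;   // 3/4 = 0

    uint8_t h1 = (a + b + 1) / 2;            // pavgb rounds up: 0
    uint8_t h2 = (c + d + 1) / 2;            // pavgb rounds up: 2
    uint8_t doubled = (h1 + h2) / 2;         // pmaddubsw + srli truncates: 1

    printf("exact = %u, double-rounded = %u\n", exact, doubled);
    return 0;
}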
Kudos to user Paul R for implementing a solution which is twice as fast as mine and exact:
__m128i avg16Bytes(const __m128i& row1, const __m128i& row2)
{
    // Unpack and horizontal add:
    __m128i row1_avg = _mm_maddubs_epi16(row1, _mm_set1_epi8(1));
    __m128i row2_avg = _mm_maddubs_epi16(row2, _mm_set1_epi8(1));

    // Vertical add:
    __m128i avg = _mm_add_epi16(row1_avg, row2_avg);

    // Divide by 4:
    return _mm_srli_epi16(avg, 2);
}
Upvotes: 2
Views: 1251
Reputation: 364160
If you're willing to accept double-rounding from using pavgb twice, you can go faster than Paul R's answer by doing the vertical averaging first with pavgb, cutting in half the amount of data that needs to be unpacked to 16-bit elements. (And allowing half the loads to fold into memory operands for pavgb, reducing front-end bottlenecks on some CPUs.)
For horizontal averaging, your best bet is probably still pmaddubsw with set1(1) and a shift by 1, then pack.
// SSSE3 version
// I used __restrict__ to give the compiler more flexibility in unrolling
void average2Rows_doubleround(const uint8_t* __restrict__ src1, const uint8_t* __restrict__ src2,
                              uint8_t* __restrict__ dst, size_t size)
{
    const __m128i vk1 = _mm_set1_epi8(1);
    size_t dstsize = size / 2;
    for (size_t i = 0; i < dstsize - 15; i += 16)
    {
        __m128i v0 = _mm_load_si128((const __m128i *)&src1[i*2]);
        __m128i v1 = _mm_load_si128((const __m128i *)&src1[i*2 + 16]);
        __m128i v2 = _mm_load_si128((const __m128i *)&src2[i*2]);
        __m128i v3 = _mm_load_si128((const __m128i *)&src2[i*2 + 16]);
        __m128i left  = _mm_avg_epu8(v0, v2);
        __m128i right = _mm_avg_epu8(v1, v3);

        __m128i w0 = _mm_maddubs_epi16(left, vk1);    // unpack and horizontal add
        __m128i w1 = _mm_maddubs_epi16(right, vk1);
        w0 = _mm_srli_epi16(w0, 1);                   // divide by 2
        w1 = _mm_srli_epi16(w1, 1);
        w0 = _mm_packus_epi16(w0, w1);                // pack
        _mm_storeu_si128((__m128i *)&dst[i], w0);
    }
}
The other option is _mm_srli_epi16(v, 8) to line up the odd elements with the even elements of every horizontal pair. But since there is no horizontal pack with truncation, you have to _mm_and_si128(v, _mm_set1_epi16(0x00FF)) both halves before you pack. It turns out to be slower than using SSSE3 pmaddubsw, especially without AVX where it takes extra MOVDQA instructions to copy registers.
void average2Rows_doubleround_SSE2(const uint8_t* __restrict__ src1, const uint8_t* __restrict__ src2,
                                   uint8_t* __restrict__ dst, size_t size)
{
    size /= 2;
    for (size_t i = 0; i < size - 15; i += 16)
    {
        __m128i v0 = _mm_load_si128((__m128i *)&src1[i*2]);
        __m128i v1 = _mm_load_si128((__m128i *)&src1[i*2 + 16]);
        __m128i v2 = _mm_load_si128((__m128i *)&src2[i*2]);
        __m128i v3 = _mm_load_si128((__m128i *)&src2[i*2 + 16]);
        __m128i left  = _mm_avg_epu8(v0, v2);
        __m128i right = _mm_avg_epu8(v1, v3);

        __m128i l_odd = _mm_srli_epi16(left, 8);      // line up horizontal pairs
        __m128i r_odd = _mm_srli_epi16(right, 8);
        __m128i l_avg = _mm_avg_epu8(left, l_odd);    // leaves garbage in the high halves
        __m128i r_avg = _mm_avg_epu8(right, r_odd);

        l_avg = _mm_and_si128(l_avg, _mm_set1_epi16(0x00FF));
        r_avg = _mm_and_si128(r_avg, _mm_set1_epi16(0x00FF));
        __m128i avg = _mm_packus_epi16(l_avg, r_avg); // pack
        _mm_storeu_si128((__m128i *)&dst[i], avg);
    }
}
With AVX512BW, there's _mm_cvtepi16_epi8, but IACA says it's 2 uops on Skylake-AVX512, and it only takes 1 input and produces a half-width output. According to IACA, the memory-destination form is 4 total unfused-domain uops (same as reg,reg + separate store). I had to use _mm_mask_cvtepi16_storeu_epi8(&dst[i+0], -1, l_avg); to get it, because gcc and clang fail to fold a separate _mm_store into a memory destination for vpmovwb. (There is no non-masked store intrinsic, because compilers are supposed to do that for you, like they do with folding _mm_load into memory operands for typical ALU instructions.)
It's probably only useful when narrowing to 1/4 or 1/8th (cvtepi64_epi8), not just in half. Or maybe useful to avoid needing a second shuffle to deal with the in-lane behaviour of _mm512_packus_epi16. With AVX2, after a _mm256_packus_epi16 on [D C] and [B A], you have [D B | C A], which you can fix with an AVX2 _mm256_permute4x64_epi64(__m256i a, const int imm8) to shuffle in 64-bit chunks. But with AVX512, you'd need a vector shuffle-control for the vpermq. packus + a fixup shuffle is probably still a better option, though.
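For example, the AVX2 pack + fixup might look like this (a sketch; w0 and w1 are hypothetical names for the two vectors of 16-bit results, covering the lower and upper 32 source bytes respectively):
#include <immintrin.h>

// AVX2 sketch: pack two vectors of 16-bit results back to bytes, then fix up the
// in-lane ordering of vpackuswb with one vpermq.
static inline __m256i pack_and_fixup(__m256i w0, __m256i w1)
{
    __m256i packed = _mm256_packus_epi16(w0, w1);                     // [D B | C A] order
    return _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); // -> [D C | B A]
}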
Once you do this, there aren't many vector instructions left in the loop, and there's a lot to gain from letting the compiler make tighter asm. Your loop is unfortunately difficult for compilers to do a good job with. (This also helps Paul R's solution, since he copied the compiler-unfriendly loop structure from the question.)
Use the loop-counter in a way that gcc/clang can optimize better, and use types that avoid re-doing sign extension every time through the loop.
With your current loop, gcc/clang actually do an arithmetic right-shift for i/2, instead of incrementing by 16 (instead of 32) and using scaled-index addressing modes for the loads. It seems they don't realize that i is always even.
(full code + asm on Matt Godbolt's compiler explorer):
.LBB1_2: ## clang's inner loop for int i, dst[i/2] version
movdqu xmm1, xmmword ptr [rdi + rcx]
movdqu xmm2, xmmword ptr [rdi + rcx + 16]
movdqu xmm3, xmmword ptr [rsi + rcx]
movdqu xmm4, xmmword ptr [rsi + rcx + 16]
pavgb xmm3, xmm1
pavgb xmm4, xmm2
pmaddubsw xmm3, xmm0
pmaddubsw xmm4, xmm0
psrlw xmm3, 1
psrlw xmm4, 1
packuswb xmm3, xmm4
mov eax, ecx # This whole block is wasted instructions!!!
shr eax, 31
add eax, ecx
sar eax # eax = ecx/2, with correct rounding even for negative `i`
cdqe # sign-extend EAX into RAX
movdqu xmmword ptr [rdx + rax], xmm3
add rcx, 32 # i += 32
cmp rcx, r8
jl .LBB1_2 # }while(i < size-31)
gcc7.1 isn't quite so bad (just mov/sar/movsx), but gcc5.x and 6.x do separate pointer-increments for src1 and src2, and also use a separate counter/index for the stores. (Totally braindead behaviour, especially since they still do it with -march=sandybridge. Indexed movdqu stores and non-indexed movdqu loads give you the maximum loop overhead.)
Anyway, using dstsize and multiplying i inside the loop instead of dividing it gives much better results. Different versions of gcc and clang reliably compile it into a single loop-counter that they use with a scaled-index addressing mode for the loads. You get code like:
movdqa xmm1, xmmword ptr [rdi + 2*rax]
movdqa xmm2, xmmword ptr [rdi + 2*rax + 16]
pavgb xmm1, xmmword ptr [rsi + 2*rax]
pavgb xmm2, xmmword ptr [rsi + 2*rax + 16] # saving instructions with aligned loads, see below
...
movdqu xmmword ptr [rdx + rax], xmm1
add rax, 16
cmp rax, rcx
jb .LBB0_2
I used size_t i to match size_t size, to make sure gcc didn't waste any instructions sign- or zero-extending it to the width of a pointer. (Zero-extension usually happens for free, though, so unsigned size and unsigned i might have been ok, and would have saved a couple of REX prefixes.)
You could still get rid of the cmp by counting an index up towards 0, which would speed the loop up a little bit more than what I've done. I'm not sure how easy it would be to get compilers to not be stupid and omit the cmp instruction if you do count up towards zero. Indexing from the end of an object is no problem, though (src1 += size;). It does complicate things if you want to use an unaligned-cleanup loop.
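Here's a sketch of that indexing-from-the-end idea (my untested illustration; it assumes dstsize is a multiple of 16 and 16-byte-aligned inputs as above, with any tail handled separately):
#include <stddef.h>
#include <tmmintrin.h>

// Sketch: offset the pointers past the end and count a signed index up towards zero,
// so the loop branch can use the flags from the add instead of a separate cmp.
void average2Rows_negidx(const uint8_t* __restrict__ src1, const uint8_t* __restrict__ src2,
                         uint8_t* __restrict__ dst, size_t size)
{
    const __m128i vk1 = _mm_set1_epi8(1);
    size_t dstsize = size / 2;          // assumed to be a multiple of 16
    src1 += size;                       // point one-past-the-end
    src2 += size;
    dst  += dstsize;
    for (ptrdiff_t i = -(ptrdiff_t)dstsize; i < 0; i += 16)   // counts up towards zero
    {
        __m128i v0 = _mm_load_si128((const __m128i *)&src1[i*2]);
        __m128i v1 = _mm_load_si128((const __m128i *)&src1[i*2 + 16]);
        __m128i v2 = _mm_load_si128((const __m128i *)&src2[i*2]);
        __m128i v3 = _mm_load_si128((const __m128i *)&src2[i*2 + 16]);
        __m128i w0 = _mm_srli_epi16(_mm_maddubs_epi16(_mm_avg_epu8(v0, v2), vk1), 1);
        __m128i w1 = _mm_srli_epi16(_mm_maddubs_epi16(_mm_avg_epu8(v1, v3), vk1), 1);
        _mm_storeu_si128((__m128i *)&dst[i], _mm_packus_epi16(w0, w1));
    }
}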
On my Skylake i7-6700k (max turbo 4.4GHz, but look at the clock-cycle counts rather than times), with g++7.1 this makes a difference of ~2.7 seconds for 100M reps of 1024 bytes vs. ~3.3 seconds.
Performance counter stats for './grayscale-dowscale-by-2.inline.gcc-skylake-noavx' (2 runs):
2731.607950 task-clock (msec) # 1.000 CPUs utilized ( +- 0.40% )
2 context-switches # 0.001 K/sec ( +- 20.00% )
0 cpu-migrations # 0.000 K/sec
88 page-faults:u # 0.032 K/sec ( +- 0.57% )
11,917,723,707 cycles # 4.363 GHz ( +- 0.07% )
42,006,654,015 instructions # 3.52 insn per cycle ( +- 0.00% )
41,908,837,143 uops_issued_any # 15342.186 M/sec ( +- 0.00% )
49,409,631,052 uops_executed_thread # 18088.112 M/sec ( +- 0.00% )
3,301,193,901 branches # 1208.517 M/sec ( +- 0.00% )
100,013,629 branch-misses # 3.03% of all branches ( +- 0.01% )
2.731715466 seconds time elapsed ( +- 0.40% )
vs. the same vectorization, but with int i and dst[i/2] creating higher loop overhead (more scalar instructions):
Performance counter stats for './grayscale-dowscale-by-2.loopoverhead-aligned-inline.gcc-skylake-noavx' (2 runs):
3314.335833 task-clock (msec) # 1.000 CPUs utilized ( +- 0.02% )
4 context-switches # 0.001 K/sec ( +- 14.29% )
0 cpu-migrations # 0.000 K/sec
88 page-faults:u # 0.026 K/sec ( +- 0.57% )
14,531,925,552 cycles # 4.385 GHz ( +- 0.06% )
51,607,478,414 instructions # 3.55 insn per cycle ( +- 0.00% )
51,109,303,460 uops_issued_any # 15420.677 M/sec ( +- 0.00% )
55,810,234,508 uops_executed_thread # 16839.040 M/sec ( +- 0.00% )
3,301,344,602 branches # 996.080 M/sec ( +- 0.00% )
100,025,451 branch-misses # 3.03% of all branches ( +- 0.00% )
3.314418952 seconds time elapsed ( +- 0.02% )
vs. Paul R's version (optimized for lower loop overhead): exact but slower
Performance counter stats for './grayscale-dowscale-by-2.paulr-inline.gcc-skylake-noavx' (2 runs):
3751.990587 task-clock (msec) # 1.000 CPUs utilized ( +- 0.03% )
3 context-switches # 0.001 K/sec
0 cpu-migrations # 0.000 K/sec
88 page-faults:u # 0.024 K/sec ( +- 0.56% )
16,323,525,446 cycles # 4.351 GHz ( +- 0.04% )
58,008,101,634 instructions # 3.55 insn per cycle ( +- 0.00% )
57,610,721,806 uops_issued_any # 15354.709 M/sec ( +- 0.00% )
55,505,321,456 uops_executed_thread # 14793.566 M/sec ( +- 0.00% )
3,301,456,435 branches # 879.921 M/sec ( +- 0.00% )
100,001,954 branch-misses # 3.03% of all branches ( +- 0.02% )
3.752086635 seconds time elapsed ( +- 0.03% )
vs. Paul R's original version with extra loop overhead:
Performance counter stats for './grayscale-dowscale-by-2.loopoverhead-paulr-inline.gcc-skylake-noavx' (2 runs):
4154.300887 task-clock (msec) # 1.000 CPUs utilized ( +- 0.01% )
3 context-switches # 0.001 K/sec
0 cpu-migrations # 0.000 K/sec
90 page-faults:u # 0.022 K/sec ( +- 1.68% )
18,174,791,383 cycles # 4.375 GHz ( +- 0.03% )
67,608,724,157 instructions # 3.72 insn per cycle ( +- 0.00% )
66,937,292,129 uops_issued_any # 16112.769 M/sec ( +- 0.00% )
61,875,610,759 uops_executed_thread # 14894.350 M/sec ( +- 0.00% )
3,301,571,922 branches # 794.736 M/sec ( +- 0.00% )
100,029,270 branch-misses # 3.03% of all branches ( +- 0.00% )
4.154441330 seconds time elapsed ( +- 0.01% )
Note that branch-misses is about the same as the repeat count: the inner loop mispredicts at the end every time. Unrolling to keep the loop iteration count under about 22 would make the pattern short enough for Skylake's branch predictors to predict the not-taken condition correctly most of the time. Branch mispredicts are the only reason we're not getting ~4.0 uops per cycle through the pipeline, so avoiding branch misses would raise the IPC from 3.5 to over 4.0 (cmp/jcc macro-fusion puts 2 instructions in one uop).
These branch-misses probably hurt even if you're bottlenecked on L2 cache bandwidth (instead of the front-end). I didn't test that, though: my testing just wraps a for() loop around the function call from Paul R's test harness, so everything's hot in L1D cache. 32 iterations of the inner loop is close to the worst-case here: low enough for frequent mispredicts, but not so low that branch prediction can pick up the pattern and avoid them.
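For illustration, an unroll-by-2 of the SSSE3 loop from average2Rows_doubleround above might look like this (a sketch, assuming dstsize is a multiple of 32 or that a cleanup loop handles the remainder):
// Sketch: 32 output bytes per iteration to halve the taken-branch count.
void average2Rows_unroll2(const uint8_t* __restrict__ src1, const uint8_t* __restrict__ src2,
                          uint8_t* __restrict__ dst, size_t size)
{
    const __m128i vk1 = _mm_set1_epi8(1);
    size_t dstsize = size / 2;
    for (size_t i = 0; i < dstsize - 31; i += 32)
    {
        // first 16 output bytes
        __m128i l0 = _mm_avg_epu8(_mm_load_si128((const __m128i *)&src1[i*2]),
                                  _mm_load_si128((const __m128i *)&src2[i*2]));
        __m128i r0 = _mm_avg_epu8(_mm_load_si128((const __m128i *)&src1[i*2 + 16]),
                                  _mm_load_si128((const __m128i *)&src2[i*2 + 16]));
        __m128i w0 = _mm_packus_epi16(_mm_srli_epi16(_mm_maddubs_epi16(l0, vk1), 1),
                                      _mm_srli_epi16(_mm_maddubs_epi16(r0, vk1), 1));
        _mm_storeu_si128((__m128i *)&dst[i], w0);

        // second 16 output bytes
        __m128i l1 = _mm_avg_epu8(_mm_load_si128((const __m128i *)&src1[i*2 + 32]),
                                  _mm_load_si128((const __m128i *)&src2[i*2 + 32]));
        __m128i r1 = _mm_avg_epu8(_mm_load_si128((const __m128i *)&src1[i*2 + 48]),
                                  _mm_load_si128((const __m128i *)&src2[i*2 + 48]));
        __m128i w1 = _mm_packus_epi16(_mm_srli_epi16(_mm_maddubs_epi16(l1, vk1), 1),
                                      _mm_srli_epi16(_mm_maddubs_epi16(r1, vk1), 1));
        _mm_storeu_si128((__m128i *)&dst[i + 16], w1);
    }
}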
My version should run in 3 cycles per iteration, bottlenecked only on the frontend, on Intel Sandybridge and later. (Nehalem will bottleneck on one load per clock.)
See http://agner.org/optimize/, and also the Q&A "Can x86's MOV really be 'free'? Why can't I reproduce this at all?", for more about fused-domain vs. unfused-domain uops and perf counters.
Update: clang unrolls it for you, at least when the size is a compile-time constant... Strangely, it unrolls even the non-inline version of the dst[i/2] function (with unknown size), but not the lower-loop-overhead version.
With clang++-4.0 -O3 -march=skylake -mno-avx, my version (unrolled by 2 by the compiler) runs in 9.61G cycles for 100M iters (2.2s): 35.6G uops issued (fused domain), 45.0G uops executed (unfused domain), near-zero branch-misses. Probably not bottlenecked on the front-end anymore, but AVX would still hurt.
Paul R's (also unrolled by 2) runs in 12.29G cycles for 100M iters (2.8s): 48.4G uops issued (fused domain), 51.4G uops executed (unfused domain), 50.1G instructions, for 4.08 IPC. It's probably still bottlenecked on the front-end (because it needs a couple of movdqa instructions to copy a register before destroying it). AVX would help for non-destructive vector instructions, even without AVX2 for wider integer vectors.
With careful coding, you should be able to do about this well for runtime-variable sizes.
Use aligned pointers and aligned loads, so the compiler can use pavgb with a memory operand instead of a separate unaligned-load instruction. This means fewer instructions and fewer uops for the front-end, which is a bottleneck for this loop.
This doesn't help Paul's version, because only the second operand for pmaddubsw can come from memory, and that's the one treated as signed bytes. If we used _mm_maddubs_epi16(_mm_set1_epi8(1), v0);, the 16-bit multiply result would be sign-extended instead of zero-extended, so 1+255 would come out to 0 instead of 256.
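A tiny demonstration of that operand-order pitfall (my illustration; bytes chosen so each horizontal pair is 1 and 255):
#include <stdio.h>
#include <tmmintrin.h>  // SSSE3

int main(void)
{
    __m128i v    = _mm_set1_epi16((short)0xFF01);   // byte pairs (1, 255) in every lane
    __m128i ones = _mm_set1_epi8(1);

    // Correct: v is the unsigned operand -> 1*1 + 255*1 = 256 per 16-bit lane.
    __m128i good = _mm_maddubs_epi16(v, ones);
    // Swapped: v is the signed operand -> 1*1 + 1*(-1) = 0 per 16-bit lane.
    __m128i bad  = _mm_maddubs_epi16(ones, v);

    printf("good = %d, bad = %d\n",
           _mm_extract_epi16(good, 0),   // prints 256
           _mm_extract_epi16(bad, 0));   // prints 0
    return 0;
}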
Folding a load requires alignment with SSE, but not with AVX. However, on Intel Haswell/Skylake, indexed addressing modes can only stay micro-fused with instructions which read-modify-write their destination register. vpavgb xmm0, xmm0, [rsi+rax*2] is un-laminated to 2 uops on Haswell/Skylake before it issues into the out-of-order part of the core, but pavgb xmm1, [rsi+rax*2] can stay micro-fused all the way through, so it issues as a single uop. The front-end issue bottleneck is 4 fused-domain uops per clock on mainstream x86 CPUs except Ryzen (i.e. not Atom/Silvermont). Folding half the loads into memory operands helps with that on all Intel CPUs except Sandybridge/Ivybridge, and on all AMD CPUs.
gcc and clang will fold the loads when inlining into a test function that uses alignas(32), even if you use _mm_loadu intrinsics. They know the data is aligned, and take advantage of that.
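For example, something like this in the test harness is enough (a sketch; buffer names and sizes are arbitrary):
#include <stdint.h>

// With the buffers aligned, gcc/clang can fold loads into pavgb memory operands when
// average2Rows_doubleround() (defined above) is inlined here, even if it had used
// _mm_loadu intrinsics.
alignas(32) static uint8_t src1[1024];
alignas(32) static uint8_t src2[1024];
alignas(32) static uint8_t dst[512];

void run_once()
{
    average2Rows_doubleround(src1, src2, dst, sizeof(src1));
}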
Weird fact: compiling the 128b-vectorized code with AVX code-gen enabled (-march=native) actually slows it down on Haswell/Skylake, because it would make all 4 loads issue as separate uops even when they're memory operands for vpavgb, and there aren't any movdqa register-copying instructions that AVX would avoid. (Usually AVX comes out ahead anyway, even for manually-vectorized code that still only uses 128b vectors, because of the benefit of 3-operand instructions not destroying one of their inputs.) In this case: 13.53G cycles ( +- 0.05% ) or 3094.195773 ms ( +- 0.20% ), up from 11.92G cycles in ~2.7 seconds. uops_issued = 48.508G, up from 41.908G. Instruction count and uops_executed counts are essentially the same.
OTOH, an actual 256b AVX2 version would run slightly less than twice as fast. Some unrolling to reduce the front-end bottleneck will definitely help. An AVX512 version might run close to 4x as fast on Skylake-AVX512 Xeons, but might bottleneck on ALU throughput, since SKX shuts down execution port 1 when there are any 512b uops in the RS waiting to execute, according to @Mysticial's testing. (That explains why pavgb zmm has 1 per clock throughput while pavgb ymm is 2 per clock.)
To have both input rows aligned, store your image data in a format with a row stride that's a multiple of 16, even if the actual image dimensions are odd. Your storage stride doesn't have to match your actual image dimensions.
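For example, the padded stride is just the width rounded up to a multiple of 16 (a sketch; paddedStride is a hypothetical helper name):
#include <stddef.h>

// Round a row width up to a multiple of 16 so every row starts 16-byte aligned
// (given a 16-byte-aligned base allocation).
static inline size_t paddedStride(size_t width)
{
    return (width + 15) & ~(size_t)15;
}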
If you can only align either the source or dest (e.g. because you're downscaling a region that starts at an odd column in the source image), you should probably still align your source pointers.
Intel's optimization manual recommends aligning the destination instead of the source, if you can't align both, but doing 4x as many loads as stores probably changes the balance.
To handle unaligned at the start/end, do a potentially-overlapping unaligned vector of pixels from the start and end. It's fine for stores to overlap other stores, and since dst is separate from src, you can redo a partially-overlapping vector.
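A sketch of that overlapping-tail idea for the end of a row (my illustration, assuming size >= 32; finishRowTail and its `done` parameter are hypothetical names):
#include <stdint.h>
#include <tmmintrin.h>

// After the main aligned loop has produced dst[0 .. done-1], finish the row with one
// final vector that ends exactly at the row end. It may partially overlap bytes already
// written, which is fine because dst doesn't alias src. Uses the double-rounding math.
static void finishRowTail(const uint8_t* src1, const uint8_t* src2, uint8_t* dst,
                          size_t size, size_t done /* dst bytes already written */)
{
    size_t dstsize = size / 2;
    if (done >= dstsize)
        return;
    size_t i = dstsize - 16;                     // last 16 output bytes (may overlap)
    __m128i v0 = _mm_loadu_si128((const __m128i *)&src1[i*2]);
    __m128i v1 = _mm_loadu_si128((const __m128i *)&src1[i*2 + 16]);
    __m128i v2 = _mm_loadu_si128((const __m128i *)&src2[i*2]);
    __m128i v3 = _mm_loadu_si128((const __m128i *)&src2[i*2 + 16]);
    __m128i w0 = _mm_srli_epi16(_mm_maddubs_epi16(_mm_avg_epu8(v0, v2), _mm_set1_epi8(1)), 1);
    __m128i w1 = _mm_srli_epi16(_mm_maddubs_epi16(_mm_avg_epu8(v1, v3), _mm_set1_epi8(1)), 1);
    _mm_storeu_si128((__m128i *)&dst[i], _mm_packus_epi16(w0, w1));
}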
In Paul's test main(), I just added alignas(32) in front of every array.
Since you compile one version with -march=native, you can easily detect AVX2 at compile time with #ifdef __AVX2__. There's no simple way to use exactly the same code for 128b and 256b manual vectorization. All the intrinsics have different names, so you typically need to copy everything even if there are no other differences.
(There are some C++ wrapper libraries for the intrinsics that use operator overloading and function overloading to let you write a templated version that uses the same logic on different widths of vector. e.g. Agner Fog's VCL is good, but since it's GPL licensed, you can't use it if your software is closed-source and you want to distribute a binary.)
To take advantage of AVX2 in your binary-distribution version, you'd have to do runtime detection/dispatching. In that case, you'd want to dispatch to versions of a function that loops over rows, so you don't have dispatch overhead inside your loop over rows. Or just let that version use SSSE3.
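A minimal dispatch sketch (my illustration; __builtin_cpu_supports is a GCC/clang builtin, and average2Rows_avx2 is a hypothetical AVX2 version built separately with -mavx2):
#include <stddef.h>
#include <stdint.h>

// Hypothetical AVX2 implementation, compiled in a separate translation unit:
void average2Rows_avx2(const uint8_t*, const uint8_t*, uint8_t*, size_t);

typedef void (*average2RowsFn)(const uint8_t*, const uint8_t*, uint8_t*, size_t);

static average2RowsFn chooseAverage2Rows()
{
    if (__builtin_cpu_supports("avx2"))
        return average2Rows_avx2;            // AVX2 path
    return average2Rows_doubleround;         // SSSE3 fallback (defined above)
}

void downscaleImage(const uint8_t* src, uint8_t* dst,
                    size_t width, size_t height, size_t srcStride, size_t dstStride)
{
    average2RowsFn fn = chooseAverage2Rows();    // dispatch once, outside the row loop
    for (size_t y = 0; y + 1 < height; y += 2)
        fn(src + y * srcStride, src + (y + 1) * srcStride,
           dst + (y / 2) * dstStride, width);
}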
Upvotes: 6
Reputation: 212949
Here is an implementation which uses fewer instructions. I haven't benchmarked it against your code though, so it may not be significantly faster:
void average2Rows(const uint8_t* src1, const uint8_t* src2, uint8_t* dst, int size)
{
    const __m128i vk1 = _mm_set1_epi8(1);

    for (int i = 0; i < size - 31; i += 32)
    {
        __m128i v0 = _mm_loadu_si128((__m128i *)&src1[i]);
        __m128i v1 = _mm_loadu_si128((__m128i *)&src1[i + 16]);
        __m128i v2 = _mm_loadu_si128((__m128i *)&src2[i]);
        __m128i v3 = _mm_loadu_si128((__m128i *)&src2[i + 16]);
        __m128i w0 = _mm_maddubs_epi16(v0, vk1);        // unpack and horizontal add
        __m128i w1 = _mm_maddubs_epi16(v1, vk1);
        __m128i w2 = _mm_maddubs_epi16(v2, vk1);
        __m128i w3 = _mm_maddubs_epi16(v3, vk1);
        w0 = _mm_add_epi16(w0, w2);                     // vertical add
        w1 = _mm_add_epi16(w1, w3);
        w0 = _mm_srli_epi16(w0, 2);                     // divide by 4
        w1 = _mm_srli_epi16(w1, 2);
        w0 = _mm_packus_epi16(w0, w1);                  // pack
        _mm_storeu_si128((__m128i *)&dst[i / 2], w0);
    }
}
Test harness:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <tmmintrin.h>

void average2Rows_ref(const uint8_t* row1, const uint8_t* row2, uint8_t* dst, int size)
{
    for (int i = 0; i < size - 1; i += 2)
    {
        dst[i / 2] = (row1[i] + row1[i + 1] + row2[i] + row2[i + 1]) / 4;
    }
}

void average2Rows(const uint8_t* src1, const uint8_t* src2, uint8_t* dst, int size)
{
    const __m128i vk1 = _mm_set1_epi8(1);

    for (int i = 0; i < size - 31; i += 32)
    {
        __m128i v0 = _mm_loadu_si128((__m128i *)&src1[i]);
        __m128i v1 = _mm_loadu_si128((__m128i *)&src1[i + 16]);
        __m128i v2 = _mm_loadu_si128((__m128i *)&src2[i]);
        __m128i v3 = _mm_loadu_si128((__m128i *)&src2[i + 16]);
        __m128i w0 = _mm_maddubs_epi16(v0, vk1);        // unpack and horizontal add
        __m128i w1 = _mm_maddubs_epi16(v1, vk1);
        __m128i w2 = _mm_maddubs_epi16(v2, vk1);
        __m128i w3 = _mm_maddubs_epi16(v3, vk1);
        w0 = _mm_add_epi16(w0, w2);                     // vertical add
        w1 = _mm_add_epi16(w1, w3);
        w0 = _mm_srli_epi16(w0, 2);                     // divide by 4
        w1 = _mm_srli_epi16(w1, 2);
        w0 = _mm_packus_epi16(w0, w1);                  // pack
        _mm_storeu_si128((__m128i *)&dst[i / 2], w0);
    }
}

int main()
{
    const int n = 1024;

    uint8_t src1[n];
    uint8_t src2[n];
    uint8_t dest_ref[n / 2];
    uint8_t dest_test[n / 2];

    for (int i = 0; i < n; ++i)
    {
        src1[i] = rand();
        src2[i] = rand();
    }

    for (int i = 0; i < n / 2; ++i)
    {
        dest_ref[i] = 0xaa;
        dest_test[i] = 0x55;
    }

    average2Rows_ref(src1, src2, dest_ref, n);
    average2Rows(src1, src2, dest_test, n);

    for (int i = 0; i < n / 2; ++i)
    {
        if (dest_test[i] != dest_ref[i])
        {
            printf("%u %u %u %u: ref = %u, test = %u\n",
                   src1[2 * i], src1[2 * i + 1], src2[2 * i], src2[2 * i + 1],
                   dest_ref[i], dest_test[i]);
        }
    }

    return 0;
}
Note that the output of the SIMD version exactly matches the output of the scalar reference code (no "off by one" rounding errors).
Upvotes: 3