Reputation: 6484
So, I have the following code:
uint32_t val;
if (swap) {
val = ((uint32_t)a & 0x0000ffff) | ((uint32_t)b << 16);
} else {
val = ((uint32_t)b & 0x0000ffff) | ((uint32_t)a << 16);
}
Is there a way to optimize it, and have the swap check somehow embedded in the statement?
Upvotes: 1
Views: 85
Reputation: 4370
In a similar vein to John Bollinger's answer, which avoids any branching, I came up with the following to try to reduce the number of operations performed, especially multiplications.
uint8_t shift_mask = (uint8_t) !swap * 16;
val = ((uint32_t)a << shift_mask) | ((uint32_t)b << (16 ^ shift_mask));
Neither compiler even uses a multiplication instruction: since the only multiplication here is by a power of two, each just uses a simple left shift to construct the value that will be used to shift either a or b.
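For context, the listings below were presumably produced from a small wrapper function along these lines (a sketch; the name cat and the uint16_t parameter types are assumptions inferred from the movzwl on the 16-bit registers, not given in the question):
#include <stdint.h>

/* Assumed shape of the function behind the <cat> listings below. */
uint32_t cat(uint16_t a, uint16_t b, int swap)
{
    /* original version: swap the halves with a branch */
    uint32_t val;
    if (swap)
        val = ((uint32_t)a & 0x0000ffff) | ((uint32_t)b << 16);
    else
        val = ((uint32_t)b & 0x0000ffff) | ((uint32_t)a << 16);
    return val;
    /* for the "new version" listings, the body is replaced
       by the shift_mask code above */
}
/* One way to reproduce the listings:
   gcc -O2 -c cat.c && objdump -d cat.o */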
Disassembly of original version with Clang -O2
0000000000000000 <cat>:
0: 85 d2 test %edx,%edx
2: 89 f0 mov %esi,%eax
4: 66 0f 45 c7 cmovne %di,%ax
8: 66 0f 45 fe cmovne %si,%di
c: 0f b7 c0 movzwl %ax,%eax
f: c1 e7 10 shl $0x10,%edi
12: 09 f8 or %edi,%eax
14: c3 retq
15: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
1c: 00 00 00 00
Disassembly of new version with Clang -O2
0000000000000000 <cat>:
0: 80 f2 01 xor $0x1,%dl
3: 0f b6 ca movzbl %dl,%ecx
6: c1 e1 04 shl $0x4,%ecx
9: d3 e7 shl %cl,%edi
b: 83 f1 10 xor $0x10,%ecx
e: d3 e6 shl %cl,%esi
10: 09 fe or %edi,%esi
12: 89 f0 mov %esi,%eax
14: c3 retq
15: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
1c: 00 00 00 00
Disassembly of original version with gcc -O2
0000000000000000 <cat>:
0: 84 d2 test %dl,%dl
2: 75 0c jne 10 <cat+0x10>
4: 89 f8 mov %edi,%eax
6: 0f b7 f6 movzwl %si,%esi
9: c1 e0 10 shl $0x10,%eax
c: 09 f0 or %esi,%eax
e: c3 retq
f: 90 nop
10: 89 f0 mov %esi,%eax
12: 0f b7 ff movzwl %di,%edi
15: c1 e0 10 shl $0x10,%eax
18: 09 f8 or %edi,%eax
1a: c3 retq
Disassembly of new version with gcc -O2
0000000000000000 <cat>:
0: 83 f2 01 xor $0x1,%edx
3: 0f b7 c6 movzwl %si,%eax
6: 0f b7 ff movzwl %di,%edi
9: c1 e2 04 shl $0x4,%edx
c: 89 d1 mov %edx,%ecx
e: 83 f1 10 xor $0x10,%ecx
11: d3 e0 shl %cl,%eax
13: 89 d1 mov %edx,%ecx
15: d3 e7 shl %cl,%edi
17: 09 f8 or %edi,%eax
19: c3 retq
EDIT:
As John Bollinger pointed out, this solution was written under the assumption that a and b were unsigned values, rendering the bit-masking redundant. If this approach is to be used with signed values narrower than 32 bits, then it would need modification:
uint8_t shift_mask = (uint8_t) !swap * 16;
val = ((uint32_t)(a & 0xFFFF) << shift_mask) | ((uint32_t)(b & 0xFFFF) << (16 ^ shift_mask));
I won't go too far into the disassembly of this version, but here's the clang output at -O2:
0000000000000000 <cat>:
0: 80 f2 01 xor $0x1,%dl
3: 0f b6 ca movzbl %dl,%ecx
6: c1 e1 04 shl $0x4,%ecx
9: 0f b7 d7 movzwl %di,%edx
c: d3 e2 shl %cl,%edx
e: 0f b7 c6 movzwl %si,%eax
11: 83 f1 10 xor $0x10,%ecx
14: d3 e0 shl %cl,%eax
16: 09 d0 or %edx,%eax
18: c3 retq
19: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
In response to P__J__ regarding performance versus his union solution, here is what clang emits at -O3 for the version of this code that is safe for dealing with signed types:
0000000000000000 <cat>:
0: 85 d2 test %edx,%edx
2: 89 f0 mov %esi,%eax
4: 66 0f 45 c7 cmovne %di,%ax
8: 66 0f 45 fe cmovne %si,%di
c: 0f b7 c0 movzwl %ax,%eax
f: c1 e7 10 shl $0x10,%edi
12: 09 f8 or %edi,%eax
14: c3 retq
15: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
1c: 00 00 00 00
It is a bit closer to the union solution in total instructions, but it does not use SHRD, which, according to this answer, takes 4 clock cycles on an Intel Skylake processor and occupies several execution units. I'd be mildly curious how they would each actually perform.
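One way to satisfy that curiosity is a crude throughput loop like the sketch below. Note the caveats: combine() is just a placeholder name to drop each variant into, and a tight loop like this hints at throughput only, not at latency inside real surrounding code (and the compiler may still vectorize it).
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Placeholder: substitute the shift-mask version, the union
   version, or the branchy original to compare them. */
static uint32_t combine(uint16_t a, uint16_t b, int swap)
{
    uint8_t shift_mask = (uint8_t) !swap * 16;
    return ((uint32_t)a << shift_mask) | ((uint32_t)b << (16 ^ shift_mask));
}

int main(void)
{
    uint32_t acc = 0;
    clock_t t0 = clock();
    for (uint32_t i = 0; i < 100000000u; i++)
        acc += combine((uint16_t)i, (uint16_t)(i >> 16), (int)(i & 1));
    double secs = (double)(clock() - t0) / CLOCKS_PER_SEC;
    /* printing acc keeps the loop from being optimized away */
    printf("acc=%" PRIu32 ", %.3f s\n", acc, secs);
    return 0;
}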
Upvotes: 1
Reputation: 180286
If the objective is to avoid a branch, then you can write this:
val = (((!!swap) * (uint32_t)a + (!swap) * (uint32_t)b) & 0x0000ffff)
    | (((!!swap) * (uint32_t)b + (!swap) * (uint32_t)a) << 16);
This uses the fact that !x evaluates to 0 whenever x is truthy and to 1 whenever x is falsey, and so also that !!x evaluates to 1 when x is truthy, even though x may not itself be 1. Multiplying by the result selects either a or b as appropriate.
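As a quick check of those identities (a trivial sketch):
#include <assert.h>

int main(void)
{
    int x = 42;          /* truthy, but not 1 */
    assert(!x == 0);     /* ! maps any truthy value to 0 */
    assert(!!x == 1);    /* !! normalizes any truthy value to exactly 1 */
    assert(!0 == 1);     /* and falsey values go the other way */
    assert(!!0 == 0);
    return 0;
}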
Note, however, that instead of one compare and branch you now have multiple logical and arithmetic operations. It is not at all clear that that would provide a performance improvement in practice.
Courtesy of @ChristianGibbons:
[Provided that a and b are guaranteed non-negative and less than 2^16,] you can simplify this approach substantially by removing the bitwise AND component and applying the multiplication to the shifts instead of to the arguments:
val = ((uint32_t) a << (16 * !swap)) | ((uint32_t)b << (16 * !!swap));
That stands a better chance of outperforming the original code (though it is still by no means certain to do so), but in that case a fairer comparison would be with a version of the original that relies on the same properties of the inputs:
uint32_t val;
if (swap) {
val = (uint32_t)a | ((uint32_t)b << 16);
} else {
val = (uint32_t)b | ((uint32_t)a << 16);
}
Upvotes: 2
Reputation: 67546
There is not too much to optimize.
Here you have two versions:
typedef union
{
uint16_t u16[2];
uint32_t u32;
}D32_t;
uint32_t foo(uint32_t a, uint32_t b, int swap)
{
D32_t da = {.u32 = a}, db = {.u32 = b}, val;
if(swap)
{
val.u16[0] = da.u16[1];
val.u16[1] = db.u16[0];
}
else
{
val.u16[0] = db.u16[1];
val.u16[1] = da.u16[0];
}
return val.u32;
}
uint32_t foo2(uint32_t a, uint32_t b, int swap)
{
uint32_t val;
if (swap)
{
val = ((uint32_t)a & 0x0000ffff) | ((uint32_t)b << 16);
}
else
{
val = ((uint32_t)b & 0x0000ffff) | ((uint32_t)a << 16);
}
return val;
}
The generated code is almost the same.
clang:
foo: # @foo
mov eax, edi
test edx, edx
mov ecx, esi
cmove ecx, edi
cmove eax, esi
shrd eax, ecx, 16
ret
foo2: # @foo2
movzx ecx, si
movzx eax, di
shl edi, 16
or edi, ecx
shl esi, 16
or eax, esi
test edx, edx
cmove eax, edi
ret
gcc:
foo:
test edx, edx
je .L2
shr edi, 16
mov eax, esi
mov edx, edi
sal eax, 16
mov ax, dx
ret
.L2:
shr esi, 16
mov eax, edi
mov edx, esi
sal eax, 16
mov ax, dx
ret
foo2:
test edx, edx
je .L6
movzx eax, di
sal esi, 16
or eax, esi
ret
.L6:
movzx eax, si
sal edi, 16
or eax, edi
ret
As you can see, clang likes unions and gcc prefers shifts.
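One caveat worth noting: the union version reads the u16[] halves back from a u32, so its result depends on byte order; the listings above correspond to a little-endian target, where u16[0] is the low half. A quick sanity check (a sketch, repeating the D32_t from above so it stands alone):
#include <stdint.h>
#include <stdio.h>

typedef union      /* same D32_t as above */
{
    uint16_t u16[2];
    uint32_t u32;
} D32_t;

int main(void)
{
    D32_t d = {.u32 = 0x11112222u};
    /* little-endian prints "2222 1111"; big-endian prints "1111 2222" */
    printf("%04x %04x\n", d.u16[0], d.u16[1]);
    return 0;
}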
Upvotes: 1
Reputation: 133929
Compile with -O3. GCC and Clang have slightly different strategies for 64-bit processors: GCC generates code with a branch, whereas Clang runs both branches and then uses a conditional move. Both GCC and Clang generate a "zero-extend short to int" instruction instead of an and. Using ?: didn't change the generated code in either case.
The Clang version does seem more efficient.
All in all, both would generate the same code if you didn't need the swap.
Upvotes: 0
Reputation: 453
val = swap ? ((uint32_t)a & 0x0000ffff) | ((uint32_t)b << 16)
           : ((uint32_t)b & 0x0000ffff) | ((uint32_t)a << 16);
This will achieve the "embedding" you ask for. However, I don't recommend it, as it hurts readability and provides no runtime optimization.
Upvotes: 0