Reputation: 123
This question provides kind of an alternative way for the issue described in Inefficient Loop Unrolling. But not, this question is absolute independent of the other one.
We have a primitive switch/case statement (part of duffs device if you want so) Godbold Test where the cases fall-through on purpose. -> it has a macro there, to specify what the maximum amount of case/statements can be.
The code produced is then something like this
inline __attribute__((__always_inline__)) void to_test(size_t from, size_t to) {
int num_invocations = 0;
{
const int __start_idx__ = (int)to - (int)20;
const size_t __switch_val__ = (size_t)20 - ((size_t)to - (size_t)from);
switch (__switch_val__) {
default:
__builtin_unreachable();
case 0: {
const int __iter_idx__ = __start_idx__ + 0;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 1: {
const int __iter_idx__ = __start_idx__ + 1;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 2: {
const int __iter_idx__ = __start_idx__ + 2;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 3: {
const int __iter_idx__ = __start_idx__ + 3;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 4: {
const int __iter_idx__ = __start_idx__ + 4;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 5: {
const int __iter_idx__ = __start_idx__ + 5;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 6: {
const int __iter_idx__ = __start_idx__ + 6;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 7: {
const int __iter_idx__ = __start_idx__ + 7;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 8: {
const int __iter_idx__ = __start_idx__ + 8;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 9: {
const int __iter_idx__ = __start_idx__ + 9;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 10: {
const int __iter_idx__ = __start_idx__ + 10;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 11: {
const int __iter_idx__ = __start_idx__ + 11;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 12: {
const int __iter_idx__ = __start_idx__ + 12;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 13: {
const int __iter_idx__ = __start_idx__ + 13;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 14: {
const int __iter_idx__ = __start_idx__ + 14;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 15: {
const int __iter_idx__ = __start_idx__ + 15;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 16: {
const int __iter_idx__ = __start_idx__ + 16;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 17: {
const int __iter_idx__ = __start_idx__ + 17;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 18: {
const int __iter_idx__ = __start_idx__ + 18;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
case 19: {
const int __iter_idx__ = __start_idx__ + 19;
{
printf("my index is %d\n", __iter_idx__);
++num_invocations;
}
}
}
}
printf("Num Invocations = %d\n", num_invocations);
}
void run_test(size_t end) {
to_test(0, end);
}
int main(int argc, char **argv) {
run_test(argc + 7);
return 0;
}
What happens is, if to_test(0,5) is called, it will print
my index is 0
my index is 1
my index is 2
my index is 3
my index is 4
So just a simple for loop, but it will directly jump to the correct location and execute the neccecary print statements. For this, only 1 jump statement should be required. (Jump table)
Disclaimer Please don't come with "printf" is more expensive,... This does absolutely not matter. Its only about the code the compiler produces. If you think it matters, please ignore this question as you have no idea.
Main Issue The compilers produces very suboptimal code. This is at least true for GCC (14.2). Clang (19.1.0) and icx (2024.0.0) seem to be more reasonable. (compiler params -Ofast)
Gcc produces something like this (small extract)
.L11:
lea esi, [rbx-8]
mov edi, OFFSET FLAT:.LC0
xor eax, eax
call printf
jmp .L10
.L35:
mov r14d, 9
.L12:
lea esi, [rbx-9]
mov edi, OFFSET FLAT:.LC0
xor eax, eax
call printf
jmp .L11
.L34:
mov r14d, 10
.L13:
lea esi, [rbx-10]
mov edi, OFFSET FLAT:.LC0
xor eax, eax
call printf
jmp .L12
.L33:
mov r14d, 11
.L14:
lea esi, [rbx-11]
mov edi, OFFSET FLAT:.LC0
xor eax, eax
call printf
jmp .L13
So it basically starts at the bottom of the code. And constantly jumps further back instead of running it in one go. From L14 it jumps to L13 then further back to L12,... Producing a jump fiesta.
Clang/ICX does it this way
mov rbx, rdi
mov rax, rdi
dec rax
lea rcx, [rip + .LJTI0_0]
movsxd rax, dword ptr [rcx + 4*rax]
add rax, rcx
jmp rax
.LBB0_1:
mov ebp, 1
jmp .LBB0_39
.LBB0_17:
mov ebp, 17
jmp .LBB0_23
.LBB0_15:
mov ebp, 15
jmp .LBB0_25
.LBB0_12:
mov ebp, 12
jmp .LBB0_28
.....
.LBB0_21:
lea esi, [rbx - 19]
lea rdi, [rip + .L.str]
xor eax, eax
call printf@PLT
.LBB0_22:
lea esi, [rbx - 18]
lea rdi, [rip + .L.str]
xor eax, eax
call printf@PLT
.LBB0_23:
lea esi, [rbx - 17]
lea rdi, [rip + .L.str]
xor eax, eax
call printf@PLT
.LBB0_24:
lea esi, [rbx - 16]
lea rdi, [rip + .L.str]
xor eax, eax
call printf@PLT
So both clang and icx produce a jump table to immediatelly resolve the index. From there they will then jump to the appropriate location.
Final question What is the reason for GCC to iterate the code the opposite way. Are there some other optimization parameters which forces GCC to do it in a more reasonable manner. Or am i mistaken and GCC is the optimal way?
Upvotes: 2
Views: 105