C Switch/Case compiler produces jump fiesta (GCC very suboptimal)

Question

This question provides kind of an alternative way for the issue described in Inefficient Loop Unrolling. But not, this question is absolute independent of the other one.

We have a primitive switch/case statement (part of duffs device if you want so) Godbold Test where the cases fall-through on purpose. -> it has a macro there, to specify what the maximum amount of case/statements can be.

The code produced is then something like this

inline __attribute__((__always_inline__)) void to_test(size_t from, size_t to) {
    int num_invocations = 0;
    {
        const int __start_idx__ = (int)to - (int)20;
        const size_t __switch_val__ = (size_t)20 - ((size_t)to - (size_t)from);
        switch (__switch_val__) {
            default:
                __builtin_unreachable();
            case 0: {
                const int __iter_idx__ = __start_idx__ + 0;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 1: {
                const int __iter_idx__ = __start_idx__ + 1;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 2: {
                const int __iter_idx__ = __start_idx__ + 2;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 3: {
                const int __iter_idx__ = __start_idx__ + 3;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 4: {
                const int __iter_idx__ = __start_idx__ + 4;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 5: {
                const int __iter_idx__ = __start_idx__ + 5;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 6: {
                const int __iter_idx__ = __start_idx__ + 6;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 7: {
                const int __iter_idx__ = __start_idx__ + 7;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 8: {
                const int __iter_idx__ = __start_idx__ + 8;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 9: {
                const int __iter_idx__ = __start_idx__ + 9;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 10: {
                const int __iter_idx__ = __start_idx__ + 10;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 11: {
                const int __iter_idx__ = __start_idx__ + 11;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 12: {
                const int __iter_idx__ = __start_idx__ + 12;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 13: {
                const int __iter_idx__ = __start_idx__ + 13;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 14: {
                const int __iter_idx__ = __start_idx__ + 14;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 15: {
                const int __iter_idx__ = __start_idx__ + 15;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 16: {
                const int __iter_idx__ = __start_idx__ + 16;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 17: {
                const int __iter_idx__ = __start_idx__ + 17;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 18: {
                const int __iter_idx__ = __start_idx__ + 18;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
            case 19: {
                const int __iter_idx__ = __start_idx__ + 19;
                {
                    printf("my index is %d
", __iter_idx__);
                    ++num_invocations;
                }
            }
        }
    }

    printf("Num Invocations = %d
", num_invocations);
}

void run_test(size_t end) { 
    to_test(0, end); 
}

int main(int argc, char **argv) {
    run_test(argc + 7);
    return 0;
}

What happens is, if to_test(0,5) is called, it will print

my index is 0
my index is 1
my index is 2
my index is 3
my index is 4

So just a simple for loop, but it will directly jump to the correct location and execute the neccecary print statements. For this, only 1 jump statement should be required. (Jump table)

Disclaimer Please don't come with "printf" is more expensive,... This does absolutely not matter. Its only about the code the compiler produces. If you think it matters, please ignore this question as you have no idea.

Main Issue The compilers produces very suboptimal code. This is at least true for GCC (14.2). Clang (19.1.0) and icx (2024.0.0) seem to be more reasonable. (compiler params -Ofast)

Gcc produces something like this (small extract)

.L11:
        lea     esi, [rbx-8]
        mov     edi, OFFSET FLAT:.LC0
        xor     eax, eax
        call    printf
        jmp     .L10
.L35:
        mov     r14d, 9
.L12:
        lea     esi, [rbx-9]
        mov     edi, OFFSET FLAT:.LC0
        xor     eax, eax
        call    printf
        jmp     .L11
.L34:
        mov     r14d, 10
.L13:
        lea     esi, [rbx-10]
        mov     edi, OFFSET FLAT:.LC0
        xor     eax, eax
        call    printf
        jmp     .L12
.L33:
        mov     r14d, 11
.L14:
        lea     esi, [rbx-11]
        mov     edi, OFFSET FLAT:.LC0
        xor     eax, eax
        call    printf
        jmp     .L13

So it basically starts at the bottom of the code. And constantly jumps further back instead of running it in one go. From L14 it jumps to L13 then further back to L12,... Producing a jump fiesta.

Clang/ICX does it this way

        mov     rbx, rdi
        mov     rax, rdi
        dec     rax
        lea     rcx, [rip + .LJTI0_0]
        movsxd  rax, dword ptr [rcx + 4*rax]
        add     rax, rcx
        jmp     rax
.LBB0_1:
        mov     ebp, 1
        jmp     .LBB0_39
.LBB0_17:
        mov     ebp, 17
        jmp     .LBB0_23
.LBB0_15:
        mov     ebp, 15
        jmp     .LBB0_25
.LBB0_12:
        mov     ebp, 12
        jmp     .LBB0_28

.....

.LBB0_21:
        lea     esi, [rbx - 19]
        lea     rdi, [rip + .L.str]
        xor     eax, eax
        call    printf@PLT
.LBB0_22:
        lea     esi, [rbx - 18]
        lea     rdi, [rip + .L.str]
        xor     eax, eax
        call    printf@PLT
.LBB0_23:
        lea     esi, [rbx - 17]
        lea     rdi, [rip + .L.str]
        xor     eax, eax
        call    printf@PLT
.LBB0_24:
        lea     esi, [rbx - 16]
        lea     rdi, [rip + .L.str]
        xor     eax, eax
        call    printf@PLT

So both clang and icx produce a jump table to immediatelly resolve the index. From there they will then jump to the appropriate location.

Final question What is the reason for GCC to iterate the code the opposite way. Are there some other optimization parameters which forces GCC to do it in a more reasonable manner. Or am i mistaken and GCC is the optimal way?

C Switch/Case compiler produces jump fiesta (GCC very suboptimal)

Answers (0)

Related Questions