Reputation: 463
I've got this piece of code
#include <cstdlib>
#include <time.h>
int sum () {
srand (time(NULL));
unsigned long extra = rand() % 10;
int sum = 0;
// #pragma nounroll. <<<< This makes no difference
for (int i = 0; i < 16 + extra; ++i) {
sum += i;
}
return sum;
}
and with -O3
, clang optimized it to the following, which blew my mind. (Note how there's no branches whatsoever)
I really don't understand how the correctness of such optimization could be proved.
Specifically, the use of two seemingly magic numbers (which, btw, don't change between compiles) seems mystifying. Furthermore, I guess you call these "random" but not in spirit of the rand()
, no?
sum(): # @sum()
push rax
xor edi, edi
call time
mov edi, eax
call srand
call rand
cdqe
imul rcx, rax, 1717986919. # <<<< magic number
mov rdx, rcx
shr rdx, 63
sar rcx, 34
add ecx, edx
add ecx, ecx
lea ecx, [rcx + 4*rcx]
mov edx, eax
sub edx, ecx
neg ecx
add eax, ecx
add eax, 16
lea rcx, [rax - 1]
movabs rsi, 8589934590 # <<< magic number
add rsi, rax
imul rsi, rcx
shr rsi
lea eax, [rsi + rdx]
add eax, 15
pop rcx
ret
For posterity, gcc produced the following
sum():
sub rsp, 8
xor edi, edi
call time
mov rdi, rax
call srand
call rand
mov esi, 1
movsx rdx, eax
mov ecx, eax
imul rdx, rdx, 1717986919
sar ecx, 31
sar rdx, 34
sub edx, ecx
lea ecx, [rdx+rdx*4]
add ecx, ecx
sub eax, ecx
mov edx, eax
add eax, 16
movsx rcx, eax
cmp edx, -16
cmovne rsi, rcx
cmp eax, 18
jbe .L6
mov rdx, rsi
movdqa xmm1, XMMWORD PTR .LC0[rip]
pxor xmm0, xmm0
xor eax, eax
movdqa xmm3, XMMWORD PTR .LC1[rip]
shr rdx, 2
.L3:
movdqa xmm2, xmm1
add eax, 1
paddd xmm1, xmm3
paddd xmm0, xmm2
cmp eax, edx
jne .L3
movdqa xmm1, xmm0
mov rdi, rsi
psrldq xmm1, 8
and rdi, -4
paddd xmm0, xmm1
movsx rdx, edi
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd eax, xmm0
cmp rsi, rdi
je .L1
.L5:
add eax, edx
add rdx, 1
cmp rcx, rdx
ja .L5
.L1:
add rsp, 8
ret
.L6:
xor edx, edx
xor eax, eax
jmp .L5
.LC0:
.long 0
.long 1
.long 2
.long 3
.LC1:
.long 4
.long 4
.long 4
.long 4
Upvotes: 3
Views: 598
Reputation: 133919
The code does call rand
, which is sufficient. The return value will be held in the rax register. If you **divide 2³² by 1717986919 you get 2.499999999126885 which is quite close to 10 / 4... the constant is used, with shifts, to calculate the % 10
without having to use the expensive idiv
opcode.
After that, the result is just a sum of first n terms of arithmetic series for 1 + 2 + 3 ... + n, i.e. n(n + 1) / 2. The second magic number is related to this calculation.
Upvotes: 2