Reputation: 2044
I am learning about Rust and asm, and using Godbolt for this.
I have a program that looks like:
pub fn test() -> i32 {
    let a = 1;
    let b = 2;
    let c = 3;
    a + b + c
}
And I would expect the output to look something like:
example::test:
subq $16, %rsp
movl $1, (%rsp)
movl $2, 4(%rsp)
movl $3, 8(%rsp)
movl (%rsp), %eax
addl 4(%rsp), %eax
addl 8(%rsp), %eax
addq $16, %rsp
retq
But I actually get:
example::test:
mov eax, 6
ret
This is useless when trying to demonstrate stack allocation, addition, etc.
I am using the compiler flags: -Z mir-opt-level=0 -C opt-level=0 -C overflow-checks=off
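(The roughly equivalent local invocation would be something like rustc +nightly --emit=asm,llvm-ir,mir -Z mir-opt-level=0 -C opt-level=0 -C overflow-checks=off example.rs, since the -Z flag needs a nightly toolchain.)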
So the MIR isn't optimising away the additions. The MIR output is:
// WARNING: This output format is intended for human consumers only
// and is subject to change without notice. Knock yourself out.
fn test() -> i32 {
    let mut _0: i32; // return place in scope 0 at /app/example.rs:2:18: 2:21
    let _1: i32; // in scope 0 at /app/example.rs:3:9: 3:10
    let mut _4: i32; // in scope 0 at /app/example.rs:6:5: 6:10
    let mut _5: i32; // in scope 0 at /app/example.rs:6:5: 6:6
    let mut _6: i32; // in scope 0 at /app/example.rs:6:9: 6:10
    let mut _7: i32; // in scope 0 at /app/example.rs:6:13: 6:14
    scope 1 {
        debug a => _1; // in scope 1 at /app/example.rs:3:9: 3:10
        let _2: i32; // in scope 1 at /app/example.rs:4:9: 4:10
        scope 2 {
            debug b => _2; // in scope 2 at /app/example.rs:4:9: 4:10
            let _3: i32; // in scope 2 at /app/example.rs:5:9: 5:10
            scope 3 {
                debug c => _3; // in scope 3 at /app/example.rs:5:9: 5:10
            }
        }
    }

    bb0: {
        StorageLive(_1); // scope 0 at /app/example.rs:3:9: 3:10
        _1 = const 1_i32; // scope 0 at /app/example.rs:3:13: 3:14
        StorageLive(_2); // scope 1 at /app/example.rs:4:9: 4:10
        _2 = const 2_i32; // scope 1 at /app/example.rs:4:13: 4:14
        StorageLive(_3); // scope 2 at /app/example.rs:5:9: 5:10
        _3 = const 3_i32; // scope 2 at /app/example.rs:5:13: 5:14
        StorageLive(_4); // scope 3 at /app/example.rs:6:5: 6:10
        StorageLive(_5); // scope 3 at /app/example.rs:6:5: 6:6
        _5 = _1; // scope 3 at /app/example.rs:6:5: 6:6
        StorageLive(_6); // scope 3 at /app/example.rs:6:9: 6:10
        _6 = _2; // scope 3 at /app/example.rs:6:9: 6:10
        _4 = Add(move _5, move _6); // scope 3 at /app/example.rs:6:5: 6:10
        StorageDead(_6); // scope 3 at /app/example.rs:6:9: 6:10
        StorageDead(_5); // scope 3 at /app/example.rs:6:9: 6:10
        StorageLive(_7); // scope 3 at /app/example.rs:6:13: 6:14
        _7 = _3; // scope 3 at /app/example.rs:6:13: 6:14
        _0 = Add(move _4, move _7); // scope 3 at /app/example.rs:6:5: 6:14
        StorageDead(_7); // scope 3 at /app/example.rs:6:13: 6:14
        StorageDead(_4); // scope 3 at /app/example.rs:6:13: 6:14
        StorageDead(_3); // scope 2 at /app/example.rs:7:1: 7:2
        StorageDead(_2); // scope 1 at /app/example.rs:7:1: 7:2
        StorageDead(_1); // scope 0 at /app/example.rs:7:1: 7:2
        return; // scope 0 at /app/example.rs:7:2: 7:2
    }
}
And the LLVM IR output is:
define i32 @_ZN7example4test17h2e9277ab15e59fbdE() unnamed_addr #0 !dbg !5 {
start:
ret i32 6, !dbg !10
}
attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
So it is at the MIR->LLVM stage that the additions are optimised out.
How can I prevent this?
Thanks!
Note
If I use a tuple, the optimisation doesn't happen, e.g.
pub fn test() -> i32 {
    let a = (1, 2, 3);
    a.0 + a.1 + a.2
}
becomes:
example::test:
subq $16, %rsp
movl $1, (%rsp)
movl $2, 4(%rsp)
movl $3, 8(%rsp)
movl (%rsp), %eax
addl 4(%rsp), %eax
addl 8(%rsp), %eax
addq $16, %rsp
retq
Upvotes: 1
Views: 972
Reputation: 365217
Pass references to them to some other function (or inline asm; there's a sketch of that further down) to force them to have memory addresses. One hack for declaring a function without defining it is an extern "C" block.
extern "C" {
fn ext(x: &i32); // void ext(const int *x);
}
pub fn test(a: i32, b: i32) -> i32 {
let c = 3;
unsafe{ ext(&b); }
//dummy(&c, &a); // alternative, declare as non-inline an use std::hint::black_box
a + b + c
}
With -C opt-level=0 -C overflow-checks=off on Godbolt, the compiler spills both args to memory around the function call.
example::test:
push rax // align the stack and reserve 8 bytes
mov dword ptr [rsp], edi
mov dword ptr [rsp + 4], esi
lea rdi, [rsp + 4] // &b
call qword ptr [rip + ext@GOTPCREL] // function call, -fno-plt style
mov eax, dword ptr [rsp] // reload a and b
add eax, dword ptr [rsp + 4]
add eax, 3 // constant-propagation for c
pop rcx // dealloc stack space with a dummy pop
ret
Without disabling optimization, LLVM as expected saves/restores a call-preserved register to keep a across the function call.
example::test:
push rbx // save a call-preserved reg
sub rsp, 16
mov ebx, edi // use it to hold a
mov dword ptr [rsp + 12], esi // spill b
lea rdi, [rsp + 12] // and pass a pointer to it
call qword ptr [rip + ext@GOTPCREL]
mov eax, dword ptr [rsp + 12]
add eax, ebx
add eax, 3
add rsp, 16 // epilogue
pop rbx
ret
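For the inline-asm option mentioned at the top, something along these lines should also work: an empty asm! template that takes a pointer to b forces the compiler to keep b in memory, because it has to assume the asm might read through the pointer. (A minimal sketch using the stable std::arch::asm! macro, not taken from the Godbolt links above.)
use std::arch::asm;

pub fn test(a: i32, b: i32) -> i32 {
    let c = 3;
    // Empty-ish asm template that receives the address of `b` in a register.
    // The compiler can't see what the asm does with that pointer, so `b`
    // has to live in memory with its current value at this point.
    unsafe {
        asm!("/* {0} */", in(reg) &b as *const i32, options(nostack, preserves_flags));
    }
    a + b + c
}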
Or to block constant-folding specifically, use function args instead of constants. As in: How to remove "noise" from GCC/clang assembly output?
pub fn test2(a: i32, b: i32) -> i32 {
    let c = 3;
    a + b + c
}
But even with -C opt-level=0 -C overflow-checks=off on Godbolt, rustc still doesn't spill/reload to stack space like clang -O0 would.
example::test2:
mov eax, edi
add eax, esi
add eax, 3
ret
(opt-level=3 of course uses an LEA instead of MOV+ADD, but still uses a separate add of the constant 3 to optimize for latency instead of throughput on CPUs like Skylake, where a 3-component LEA has 3-cycle latency instead of 1. Unlike Alder Lake, where lea eax, [rsi+rdi+3] is 1-cycle latency, and would be 2 with a scaled index. Or 2 cycles for that on Zen or Alder Lake E-cores, so break-even with separate LEA/ADD but fewer uops. https://uops.info/)
#[inline(never)]
This was suggested on How to declare a function without implementing it? as a way to get a non-inline function call. We can use std::hint::black_box, as suggested by @Finomnis, to actually use the args and force the caller to materialize a value in memory when it passes a reference.
Uncomment it on the Godbolt link above to try it out.
#![feature(bench_black_box)]

pub fn test(a: i32, b: i32) -> i32 {
    let c = 3;
    dummy(&c, &a);
    a + b + c
}

#[inline(never)]
pub extern fn dummy(_a: &i32, _b: &i32) {
    //use std::sync::atomic::*;
    //compiler_fence(Ordering::Release); // make the function non-empty even without args
    std::hint::black_box(_a);
    std::hint::black_box(_b);
}
Upvotes: 1
Reputation: 22728
There is the black_box hint that prevents the computation from happening at compile time.
Note that it is only available on nightly, at the time of writing.
#![feature(bench_black_box)]

pub fn test() -> i32 {
    let a = std::hint::black_box(1);
    let b = std::hint::black_box(2);
    let c = std::hint::black_box(3);
    a + b + c
}
example::test:
sub rsp, 12
mov dword ptr [rsp], 1
mov rax, rsp
mov eax, dword ptr [rsp]
mov dword ptr [rsp + 4], 2
lea rcx, [rsp + 4]
add eax, dword ptr [rsp + 4]
mov dword ptr [rsp + 8], 3
lea rcx, [rsp + 8]
add eax, dword ptr [rsp + 8]
add rsp, 12
ret
Compiled with Rust nightly and -C opt-level=3.
https://rust.godbolt.org/z/rMWhao11W
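(std::hint::black_box has since been stabilized in Rust 1.66, so on a current stable toolchain the same code works without the feature attribute:)
// Same idea on stable Rust (1.66+): no #![feature(bench_black_box)] needed.
pub fn test() -> i32 {
    let a = std::hint::black_box(1);
    let b = std::hint::black_box(2);
    let c = std::hint::black_box(3);
    a + b + c
}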
Upvotes: 2