WiSaGaN
WiSaGaN

Reputation: 48127

Can Rust optimise away the bit-wise copy during move of an object someday?

Consider the snippet

struct Foo {
    dummy: [u8; 65536],
}

fn bar(foo: Foo) {
    println!("{:p}", &foo)
}

fn main() {
    let o = Foo { dummy: [42u8; 65536] };
    println!("{:p}", &o);
    bar(o);
}

A typical result of the program is

0x7fffc1239890
0x7fffc1229890

where the addresses are different.

Apparently, the large array dummy has been copied, as expected in the compiler's move implementation. Unfortunately, this can have non-trivial performance impact, as dummy is a very large array. This impact can force people to choose passing argument by reference instead, even when the function actually "consumes" the argument conceptually.

Since Foo does not derive Copy, object o is moved. Since Rust forbids the access of moved object, what is preventing bar to "reuse" the original object o, forcing the compiler to generate a potentially expensive bit-wise copy? Is there a fundamental difficulty, or will we see the compiler someday optimise away this bit-wise copy?

Upvotes: 30

Views: 4448

Answers (2)

m4tx
m4tx

Reputation: 4531

State as of 2024; the state of the issue could've been different in 2016, when the question was asked

There is actually nothing preventing the compiler from optimizing away array copies like this. The problem is that it seems to be very cautious about passing values into things like println!, since they actually always receive references to these values and could potentially depend on the exact address of these values for some reason. This is exactly the case in your example: you even explicitly ask for the addresses, so the compiler is smart enough not to optimize these away — after all, you asked for a copy.

Let's have a look at slightly simplified example from your question:

fn bar(foo: [u8; 65536]) {
    println!("{}", foo[123]);
}

pub fn main() {
    let o = [42u8; 65536];
    println!("{}", o[123]);
    bar(o);
}

You can run it through the Compiler Explorer with -C opt-level=2 and see the following assembly code:

example::main::hd2bfa2df25bfe7d7:
        push    r15
        push    r14
        push    r12
        push    rbx
        mov     r11, rsp
        sub     r11, 131072
.LBB0_1:
        sub     rsp, 4096
        mov     qword ptr [rsp], 0
        cmp     rsp, r11
        jne     .LBB0_1
        sub     rsp, 72
        lea     rbx, [rsp + 65608]
        mov     edx, 65536
        mov     rdi, rbx
        mov     esi, 42
        call    qword ptr [rip + memset@GOTPCREL]
        lea     rax, [rsp + 65731]
        mov     qword ptr [rsp + 8], rax
        mov     r14, qword ptr [rip + core::fmt::num::imp::<impl core::fmt::Display for u8>::fmt::ha81407c30cb780ca@GOTPCREL]
        mov     qword ptr [rsp + 16], r14
        lea     r15, [rip + .L__unnamed_1]
        mov     qword ptr [rsp + 72], r15
        mov     qword ptr [rsp + 80], 2
        mov     qword ptr [rsp + 104], 0
        lea     rax, [rsp + 8]
        mov     qword ptr [rsp + 88], rax
        mov     qword ptr [rsp + 96], 1
        mov     r12, qword ptr [rip + std::io::stdio::_print::hd6837e34a66547dd@GOTPCREL]
        lea     rdi, [rsp + 72]
        call    r12
        lea     rdi, [rsp + 72]
        mov     edx, 65536
        mov     rsi, rbx
        call    qword ptr [rip + memcpy@GOTPCREL]
        lea     rax, [rsp + 195]
        mov     qword ptr [rsp + 56], rax
        mov     qword ptr [rsp + 64], r14
        mov     qword ptr [rsp + 8], r15
        mov     qword ptr [rsp + 16], 2
        mov     qword ptr [rsp + 40], 0
        lea     rax, [rsp + 56]
        mov     qword ptr [rsp + 24], rax
        mov     qword ptr [rsp + 32], 1
        lea     rdi, [rsp + 8]
        call    r12
        add     rsp, 131144
        pop     rbx
        pop     r12
        pop     r14
        pop     r15
        ret

Most of the code isn't really relevant here, but the important bit is that it's calling memset (to create a 65536-byte long array with all the values set to 42) and then memcpy before calling bar(). If you see closely, you'll notice bar has even been inlined here, and yet the array is still being copied.

Let's now change the example once again to get rid of any direct uses of the array in the println! statement:

fn bar(foo: [u8; 65536]) {
    let val = foo[123];
    println!("{}", val);
}

pub fn main() {
    let o = [42u8; 65536];
    let val = o[123];
    println!("{}", val);
    bar(o);
}

Compiler Explorer

And now the assembly code looks completely different:

example::main::hd2bfa2df25bfe7d7:
        push    r15
        push    r14
        push    r12
        push    rbx
        sub     rsp, 72
        mov     byte ptr [rsp + 6], 42
        lea     rax, [rsp + 6]
        mov     qword ptr [rsp + 8], rax
        mov     rbx, qword ptr [rip + core::fmt::num::imp::<impl core::fmt::Display for u8>::fmt::ha81407c30cb780ca@GOTPCREL]
        mov     qword ptr [rsp + 16], rbx
        lea     r14, [rip + .L__unnamed_1]
        mov     qword ptr [rsp + 24], r14
        mov     qword ptr [rsp + 32], 2
        mov     qword ptr [rsp + 56], 0
        lea     r15, [rsp + 8]
        mov     qword ptr [rsp + 40], r15
        mov     qword ptr [rsp + 48], 1
        mov     r12, qword ptr [rip + std::io::stdio::_print::hd6837e34a66547dd@GOTPCREL]
        lea     rdi, [rsp + 24]
        call    r12
        mov     byte ptr [rsp + 7], 42
        lea     rax, [rsp + 7]
        mov     qword ptr [rsp + 8], rax
        mov     qword ptr [rsp + 16], rbx
        mov     qword ptr [rsp + 24], r14
        mov     qword ptr [rsp + 32], 2
        mov     qword ptr [rsp + 56], 0
        mov     qword ptr [rsp + 40], r15
        mov     qword ptr [rsp + 48], 1
        lea     rdi, [rsp + 24]
        call    r12
        add     rsp, 72
        pop     rbx
        pop     r12
        pop     r14
        pop     r15
        ret

No traces of memset, memcpy, or other big-chunk-of-memory manipulation. The values have been computed in compile time and passed directly into two calls of std::io::stdio::_print(). Of course, this example is oversimplified, but the point is that the compiler can optimize such stuff away.

The last example:

#[inline(never)]
fn bar(foo: [u8; 65536]) {
    let val = foo[123];
    println!("{}", val);
}

pub fn main() {
    let o = [42u8; 65536];
    bar(o);
}

Compiler Explorer

Notice the #[inline(never)] at fn bar()! This compiles into the following assembly code:

example::bar::hd9b3b9e4606cd696:
        sub     rsp, 72
        movzx   eax, byte ptr [rdi + 123]
        mov     byte ptr [rsp + 7], al
        lea     rax, [rsp + 7]
        mov     qword ptr [rsp + 8], rax
        mov     rax, qword ptr [rip + core::fmt::num::imp::<impl core::fmt::Display for u8>::fmt::ha81407c30cb780ca@GOTPCREL]
        mov     qword ptr [rsp + 16], rax
        lea     rax, [rip + .L__unnamed_1]
        mov     qword ptr [rsp + 24], rax
        mov     qword ptr [rsp + 32], 2
        mov     qword ptr [rsp + 56], 0
        lea     rax, [rsp + 8]
        mov     qword ptr [rsp + 40], rax
        mov     qword ptr [rsp + 48], 1
        lea     rdi, [rsp + 24]
        call    qword ptr [rip + std::io::stdio::_print::hd6837e34a66547dd@GOTPCREL]
        add     rsp, 72
        ret

example::main::hd2bfa2df25bfe7d7:
        push    rbx
        mov     r11, rsp
        sub     r11, 65536
.LBB1_1:
        sub     rsp, 4096
        mov     qword ptr [rsp], 0
        cmp     rsp, r11
        jne     .LBB1_1
        mov     rbx, rsp
        mov     edx, 65536
        mov     rdi, rbx
        mov     esi, 42
        call    qword ptr [rip + memset@GOTPCREL]
        mov     rdi, rbx
        call    example::bar::hd9b3b9e4606cd696
        add     rsp, 65536
        pop     rbx
        ret

Here you can see that even though the main function creates the actual array, it's not being copied into bar(). Instead, its address is just kept in the rbx register and then just passed as a pointer in rdi to the bar function. Zero copies are made, we're all good!

Upvotes: 2

Matthieu M.
Matthieu M.

Reputation: 300409

Given that in Rust (unlike C or C++) the address of a value is not considered to matter, there is nothing in terms of language that prevents the elision of the copy.

However, today rustc does not optimize anything: all optimizations are delegated to LLVM, and it seems you have hit a limitation of the LLVM optimizer here (it's unclear whether this limitation is due to LLVM being close to C's semantics or is just an omission).

So, there are two avenues of improving code generation for this:

  • teaching LLVM to perform this optimization (if possible)
  • teaching rustc to perform this optimization (optimization passes are coming to rustc now that it has MIR)

but for now you might simply want to avoid such large objects from being allocated on the stack, you can Box it for example.

Upvotes: 27

Related Questions