Reputation: 6751
I was wondering whether it is worth to aid the compiler with templates to unroll a simple loop. I prepared the following test:
#include <cstdlib>
#include <utility>
#include <array>
class TNode
{
public:
void Assemble();
void Assemble(TNode const *);
};
class T
{
private:
std::array<TNode *,3u> NodePtr;
private:
template <std::size_t,std::size_t>
void foo() const;
template <std::size_t... ij>
void foo(std::index_sequence<ij...>) const
{ (foo<ij%3u,ij/3u>(),...); }
public:
void foo() const
{ return foo(std::make_index_sequence<3u*3u>{}); }
void bar() const;
};
template <std::size_t i,std::size_t j>
inline void T::foo() const
{
if constexpr (i==j)
NodePtr[i]->Assemble();
else
NodePtr[i]->Assemble(NodePtr[j]);
}
inline void T::bar() const
{
for (std::size_t i= 0u; i<3u; ++i)
for (std::size_t j= 0u; j<3u; ++j)
if (i==j)
NodePtr[i]->Assemble();
else
NodePtr[i]->Assemble(NodePtr[j]);
}
void foo()
{
T x;
x.foo();
}
void bar()
{
T x;
x.bar();
}
I first tried with G++ with -O3 -funroll-loops
enabled and I got (https://godbolt.org/z/_Wyvl8):
foo():
push r12
push rbp
push rbx
sub rsp, 32
mov r12, QWORD PTR [rsp]
mov rdi, r12
call TNode::Assemble()
mov rbp, QWORD PTR [rsp+8]
mov rsi, r12
mov rdi, rbp
call TNode::Assemble(TNode const*)
mov rbx, QWORD PTR [rsp+16]
mov rsi, r12
mov rdi, rbx
call TNode::Assemble(TNode const*)
mov rsi, rbp
mov rdi, r12
call TNode::Assemble(TNode const*)
mov rdi, rbp
call TNode::Assemble()
mov rsi, rbp
mov rdi, rbx
call TNode::Assemble(TNode const*)
mov rsi, rbx
mov rdi, r12
call TNode::Assemble(TNode const*)
mov rdi, rbp
mov rsi, rbx
call TNode::Assemble(TNode const*)
add rsp, 32
mov rdi, rbx
pop rbx
pop rbp
pop r12
jmp TNode::Assemble()
bar():
push r13
push r12
push rbp
xor ebp, ebp
push rbx
sub rsp, 40
.L9:
mov r13, QWORD PTR [rsp+rbp*8]
xor ebx, ebx
lea r12, [rbp+1]
.L5:
cmp rbp, rbx
je .L15
mov rsi, QWORD PTR [rsp+rbx*8]
mov rdi, r13
add rbx, 1
call TNode::Assemble(TNode const*)
cmp rbx, 3
jne .L5
mov rbp, r12
cmp r12, 3
jne .L9
.L16:
add rsp, 40
pop rbx
pop rbp
pop r12
pop r13
ret
.L15:
mov rdi, r13
mov rbx, r12
call TNode::Assemble()
cmp r12, 3
jne .L5
mov rbp, r12
cmp r12, 3
jne .L9
jmp .L16
I can’t read assembly, but I seem to understand that the templated version does unroll the loop, while bar
has loops and branches.
Then I tried with Clang++ (https://godbolt.org/z/VCNb65) and I got a very different picture:
foo(): # @foo()
push rax
call TNode::Assemble()
call TNode::Assemble(TNode const*)
call TNode::Assemble(TNode const*)
call TNode::Assemble(TNode const*)
call TNode::Assemble()
call TNode::Assemble(TNode const*)
call TNode::Assemble(TNode const*)
call TNode::Assemble(TNode const*)
pop rax
jmp TNode::Assemble() # TAILCALL
bar(): # @bar()
push rax
call TNode::Assemble()
call TNode::Assemble(TNode const*)
call TNode::Assemble(TNode const*)
call TNode::Assemble(TNode const*)
call TNode::Assemble()
call TNode::Assemble(TNode const*)
call TNode::Assemble(TNode const*)
call TNode::Assemble(TNode const*)
pop rax
jmp TNode::Assemble() # TAILCALL
What happened here? How can the resulting assembly be so terse?
Upvotes: 5
Views: 486
Reputation: 26342
NodePtr
is not initialized, and when you use it, it is UB. So the optimizer can do whatever it wants: here it decides to omit assignments to the register esi/rsi
, which is used to pass an argument to TNode::Assemble(TNode const*)
, and to edi/rdi
, which holds an object pointer (this
). As a result, you see only a bunch of call
instructions.
Try to value-initialize x
(this will zero-initialize NodePtr
),
T x{};
and you'll get much more meaningful assembly.
Clang seems to be better at loop unrolling. See, e.g., this answer. It is up to you to decide whether loops are worth unrolling. For small loops, probably, they are. But you should measure.
Upvotes: 3