Damon
Damon

Reputation: 70126

Why is GCC std::atomic increment generating inefficient non-atomic assembly?

I've been using gcc's Intel-compatible builtins (like __sync_fetch_and_add) for quite some time, using my own atomic template. The "__sync" functions are now officially considered "legacy".

C++11 supports std::atomic<> and its descendants, so it seems reasonable to use that instead: it makes my code standard-compliant, and the compiler will produce the best code either way, in a platform-independent manner. That is almost too good to be true.
Incidentally, I'd only have to text-replace atomic with std::atomic, too. There's a lot in std::atomic (re: memory models) that I don't really need, but default parameters take care of that.

Now for the bad news. As it turns out, the generated code is, from what I can tell, ... utter crap, and not even atomic at all. Even a minimal example that increments a single atomic variable and outputs it has no fewer than 5 non-inlined function calls to ___atomic_flag_for_address, ___atomic_flag_wait_explicit, and _atomic_flag_clear_explicit (fully optimized), and on the other hand, there is not a single atomic instruction in the generated executable.

What gives? There is of course always the possibility of a compiler bug, but with the huge number of reviewers and users, such rather drastic things are generally unlikely to go unnoticed. Which means, this is probably not a bug, but intended behaviour.

What is the "rationale" behind so many function calls, and how is atomicity implemented without atomicity?

As-simple-as-it-can-get example:

#include <atomic>

int main()
{
    std::atomic_int a(5);
    ++a;
    __builtin_printf("%d", (int)a);
    return 0;
}

produces the following .s:

movl    $5, 28(%esp)     #, a._M_i
movl    %eax, (%esp)     # tmp64,
call    ___atomic_flag_for_address   #
movl    $5, 4(%esp)  #,
movl    %eax, %ebx   #, __g
movl    %eax, (%esp)     # __g,
call    ___atomic_flag_wait_explicit     #
movl    %ebx, (%esp)     # __g,
addl    $1, 28(%esp)     #, MEM[(__i_type *)&a]
movl    $5, 4(%esp)  #,
call    _atomic_flag_clear_explicit  #
movl    %ebx, (%esp)     # __g,
movl    $5, 4(%esp)  #,
call    ___atomic_flag_wait_explicit     #
movl    28(%esp), %esi   # MEM[(const __i_type *)&a], __r
movl    %ebx, (%esp)     # __g,
movl    $5, 4(%esp)  #,
call    _atomic_flag_clear_explicit  #
movl    $LC0, (%esp)     #,
movl    %esi, 4(%esp)    # __r,
call    _printf  #
(...)
.def    ___atomic_flag_for_address; .scl    2;  .type   32; .endef
.def    ___atomic_flag_wait_explicit;   .scl    2;  .type   32; .endef
.def    _atomic_flag_clear_explicit;    .scl    2;  .type   32; .endef

... and the mentioned functions look e.g. like this in objdump:

004013c4 <__atomic_flag_for_address>:
mov    0x4(%esp),%edx
mov    %edx,%ecx
shr    $0x2,%ecx
mov    %edx,%eax
shl    $0x4,%eax
add    %ecx,%eax
add    %edx,%eax
mov    %eax,%ecx
shr    $0x7,%ecx
mov    %eax,%edx
shl    $0x5,%edx
add    %ecx,%edx
add    %edx,%eax
mov    %eax,%edx
shr    $0x11,%edx
add    %edx,%eax
and    $0xf,%eax
add    $0x405020,%eax
ret    

The others are somewhat simpler, but I can't find a single instruction that would really be atomic (other than some spurious xchg instructions, which are atomic on x86, but these seem to be mere NOP padding, since each is an xchg %ax,%ax following a ret).

I'm absolutely not sure what such a rather complicated function is needed for, and how it's meant to make anything atomic.

Upvotes: 12

Views: 3491

Answers (2)

chill
chill

Reputation: 16888

It is an inadequate compiler build.

Check your c++config.h; it should look like this, but it doesn't:

/* Define if builtin atomic operations for bool are supported on this host. */
#define _GLIBCXX_ATOMIC_BUILTINS_1 1

/* Define if builtin atomic operations for short are supported on this host.
   */
#define _GLIBCXX_ATOMIC_BUILTINS_2 1

/* Define if builtin atomic operations for int are supported on this host. */
#define _GLIBCXX_ATOMIC_BUILTINS_4 1

/* Define if builtin atomic operations for long long are supported on this
   host. */
#define _GLIBCXX_ATOMIC_BUILTINS_8 1

These macros are defined or not depending on configure tests, which check host machine support for __sync_XXX functions. These tests are in libstdc++-v3/acinclude.m4, AC_DEFUN([GLIBCXX_ENABLE_ATOMIC_BUILTINS] ....

On your installation, it's evident from the MEM[(__i_type *)&a] put in the assembly file by -fverbose-asm that the compiler uses macros from atomic_0.h, for example:

#define _ATOMIC_LOAD_(__a, __x)                        \
  ({typedef __typeof__(_ATOMIC_MEMBER_) __i_type;                          \
    __i_type* __p = &_ATOMIC_MEMBER_;                      \
    __atomic_flag_base* __g = __atomic_flag_for_address(__p);          \
    __atomic_flag_wait_explicit(__g, __x);                 \
    __i_type __r = *__p;                           \
    atomic_flag_clear_explicit(__g, __x);                      \
    __r; })

With a properly built compiler, with your example program, c++ -m32 -std=c++0x -S -O2 -march=core2 -fverbose-asm should produce something like this:

movl    $5, 28(%esp)    #, a.D.5442._M_i
lock addl   $1, 28(%esp)    #,
mfence
movl    28(%esp), %eax  # MEM[(const struct __atomic_base *)&a].D.5442._M_i, __ret
mfence
movl    $.LC0, (%esp)   #,
movl    %eax, 4(%esp)   # __ret,
call    printf  #

Upvotes: 14

Jan Hudec
Jan Hudec

Reputation: 76256

There are two implementations. One that uses the __sync primitives and one that does not. Plus a mixture of the two that only uses some of those primitives. Which is selected depends on macros _GLIBCXX_ATOMIC_BUILTINS_1, _GLIBCXX_ATOMIC_BUILTINS_2, _GLIBCXX_ATOMIC_BUILTINS_4 and _GLIBCXX_ATOMIC_BUILTINS_8.

At least the first one is needed for the mixed implementation; all are needed for the fully atomic one. It seems that whether they are defined depends on the target machine (they may not be defined for -march=i386 and should be defined for -march=i686).

Upvotes: 3

Related Questions