David
David

Reputation: 10162

Slow std::string concatenation on windows

I have a program that needs to concatenate lots of strings together (to be more precise integers converted to strings). On my Ubuntu machine (running g++ 7.3.0) the code runs in 1.5 seconds. But the code needs to be run on Windows as well (running g++ 6.3.0 using MinGW), where it takes 15 seconds to complete. Furthermore, the Ubuntu setup runs on a much slower Laptop using an i7-4712MQ CPU @ 2.30GHz, whereas the Windows machine runs on an i7-7700K CPU @ 4.20GHz.

The code to reproduce the times is shown below. I compile the code with g++ tester.cpp -O2 -o tester (or tester.exe for windows)

#include <iostream>
#include <chrono>

// Benchmark driver: appends the decimal text of `a` plus a space to one
// string n times and prints the elapsed wall-clock time in seconds.
// NOTE(review): std::string and std::to_string are used, but only <iostream>
// and <chrono> are included above; <string> is presumably pulled in
// transitively - consider including it explicitly.
int main(int argc, char const *argv[]) {

    auto started = std::chrono::high_resolution_clock::now();
    std::string str = "";
    const int n = 10000000;
    // Every iteration appends two characters ("1 "), so 2 * n is the final length.
    str.reserve(2 * n);
    int a = 1;

    for (int i = 0; i < n; ++i) {
        // Builds a temporary via std::to_string, a second one via operator+,
        // and only then appends to str; this is the work being timed.
        str += std::to_string(a) + " ";
    }

    auto done = std::chrono::high_resolution_clock::now();
    // Milliseconds as an integer count, then converted to fractional seconds.
    double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
    std::cout << "Done in " << secs << "\n";
    return 0;
}

Any idea where the large performance gap might come from?

The disassemblies look like this:

Ubuntu:

.file   "tester.cpp"


.text
    # ------------------------------------------------------------------
    # Compiler-generated (GCC, AT&T syntax, x86-64 ELF). The mangled name
    # decodes to
    #   std::__cxx11::basic_string<char>::_M_construct<const char*>(
    #       const char*, const char*, std::forward_iterator_tag)
    # in a GCC ".isra" clone.
    # In:   %rdi = string object, %rsi = first, %rdx = last
    # Does: copies [first, last) into the string's buffer, stores the
    #       length at offset 8 and writes the terminating NUL. Ranges
    #       longer than 15 bytes first obtain a buffer from _M_create.
    # Uses a stack-protector canary read from %fs:40.
    # ------------------------------------------------------------------
    .align 2
    .p2align 4,,15
    .type   _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, @function
_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19:
.LFB2389:
    .cfi_startproc
    pushq   %r12
    .cfi_def_cfa_offset 16
    .cfi_offset 12, -16
    pushq   %rbp
    .cfi_def_cfa_offset 24
    .cfi_offset 6, -24
    # r12 = first, rbx = last - first (byte count), rbp = string object
    movq    %rsi, %r12
    pushq   %rbx
    .cfi_def_cfa_offset 32
    .cfi_offset 3, -32
    movq    %rdx, %rbx
    movq    %rdi, %rbp
    subq    %rsi, %rbx
    subq    $16, %rsp
    .cfi_def_cfa_offset 48
    # Install the stack canary at 8(%rsp).
    movq    %fs:40, %rax
    movq    %rax, 8(%rsp)
    xorl    %eax, %eax
    # Lengths above 15 take the _M_create path at .L12.
    cmpq    $15, %rbx
    movq    %rbx, (%rsp)
    ja  .L12
    # Short range: reuse the buffer already pointed to by offset 0.
    movq    (%rdi), %rdx
    cmpq    $1, %rbx
    movq    %rdx, %rax
    jne .L4
    # Exactly one byte: copy it inline instead of calling memcpy.
    movzbl  (%rsi), %eax
    movb    %al, (%rdx)
    movq    (%rdi), %rdx
.L5:
    # Common tail: store length, NUL-terminate at buffer[length].
    movq    (%rsp), %rax
    movq    %rax, 8(%rbp)
    movb    $0, (%rdx,%rax)
    # Verify the canary before returning.
    movq    8(%rsp), %rax
    xorq    %fs:40, %rax
    jne .L13
    addq    $16, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 32
    popq    %rbx
    .cfi_def_cfa_offset 24
    popq    %rbp
    .cfi_def_cfa_offset 16
    popq    %r12
    .cfi_def_cfa_offset 8
    ret
.L12:
    .cfi_restore_state
    # Long range: _M_create(this, &length_slot, 0) returns the new buffer;
    # the value left in the slot is stored at offset 16 (presumably the
    # capacity - confirm against the libstdc++ headers).
    xorl    %edx, %edx
    movq    %rsp, %rsi
    call    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm@PLT
    movq    (%rsp), %rdx
    movq    %rax, 0(%rbp)
    movq    %rdx, 16(%rbp)
.L3:
    # memcpy(buffer in %rax, first, byte count)
    movq    %rbx, %rdx
    movq    %r12, %rsi
    movq    %rax, %rdi
    call    memcpy@PLT
    movq    0(%rbp), %rdx
    jmp .L5
.L4:
    # Length is neither 1 nor > 15: skip the copy when it is zero.
    testq   %rbx, %rbx
    je  .L5
    jmp .L3
.L13:
    call    __stack_chk_fail@PLT
    .cfi_endproc
.LFE2389:
    .size   _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, .-_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
    # The char* instantiation is emitted as an alias of the const char* body.
    .set    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23,_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
    # ------------------------------------------------------------------
    # Compiler-generated. The mangled name decodes to
    #   __gnu_cxx::__to_xstring<std::string, char>(
    #       int (*)(char*, unsigned long, const char*, __va_list_tag*),
    #       unsigned long, const char*, ...)
    # In:   %rdi = result string slot, %rsi = formatter function pointer,
    #       %rdx = buffer size, %rcx = format string, %r8/%r9/%xmm0-7 =
    #       variadic arguments, %al = non-zero when vector registers are used
    # Out:  %rax = result string slot (same pointer as passed in %rdi)
    # Does: reserves a stack buffer, calls the formatter into it, then
    #       constructs the result string from the bytes written.
    # ------------------------------------------------------------------
    .section    .text._ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,"axG",@progbits,_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,comdat
    .p2align 4,,15
    .weak   _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
    .type   _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, @function
_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z:
.LFB1953:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    # Shuffle the incoming arguments into position for the indirect call:
    # r10 = formatter, rsi = buffer size, rdx = format string.
    movq    %rsi, %r10
    movq    %rdx, %rsi
    movq    %rcx, %rdx
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    pushq   %r12
    pushq   %rbx
    .cfi_offset 12, -24
    .cfi_offset 3, -32
    movq    %rdi, %r12
    subq    $208, %rsp
    # Spill the variadic register arguments; the xmm spills are skipped
    # when %al is zero.
    testb   %al, %al
    movq    %r8, -160(%rbp)
    movq    %r9, -152(%rbp)
    je  .L15
    movaps  %xmm0, -144(%rbp)
    movaps  %xmm1, -128(%rbp)
    movaps  %xmm2, -112(%rbp)
    movaps  %xmm3, -96(%rbp)
    movaps  %xmm4, -80(%rbp)
    movaps  %xmm5, -64(%rbp)
    movaps  %xmm6, -48(%rbp)
    movaps  %xmm7, -32(%rbp)
.L15:
    # Install the stack canary at -200(%rbp).
    movq    %fs:40, %rax
    movq    %rax, -200(%rbp)
    xorl    %eax, %eax
    # Reserve (size + 30) & -16 bytes of stack for the output buffer and
    # fill in the va_list at -224(%rbp): the two 32-bit offsets (32, 48),
    # the stack-argument pointer 16(%rbp) and the spill-area pointer
    # -192(%rbp). %rcx carries the va_list address to the formatter.
    leaq    30(%rsi), %rax
    leaq    -224(%rbp), %rcx
    andq    $-16, %rax
    movl    $32, -224(%rbp)
    movl    $48, -220(%rbp)
    subq    %rax, %rsp
    leaq    16(%rbp), %rax
    leaq    15(%rsp), %rbx
    movq    %rax, -216(%rbp)
    leaq    -192(%rbp), %rax
    # rbx = 16-byte aligned start of the output buffer
    andq    $-16, %rbx
    movq    %rbx, %rdi
    movq    %rax, -208(%rbp)
    # formatter(buffer, size, format, va_list); returns the length in %eax
    call    *%r10
    # Point the result at its own in-object buffer (result + 16), then
    # construct it from [buffer, buffer + length).
    leaq    16(%r12), %rdx
    movq    %r12, %rdi
    movq    %rbx, %rsi
    movq    %rdx, (%r12)
    movslq  %eax, %rdx
    addq    %rbx, %rdx
    call    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23
    # Verify the canary; return the result slot in %rax.
    movq    -200(%rbp), %rdi
    xorq    %fs:40, %rdi
    movq    %r12, %rax
    jne .L18
    leaq    -16(%rbp), %rsp
    popq    %rbx
    popq    %r12
    popq    %rbp
    .cfi_remember_state
    .cfi_def_cfa 7, 8
    ret
.L18:
    .cfi_restore_state
    call    __stack_chk_fail@PLT
    .cfi_endproc
.LFE1953:
    .size   _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, .-_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC0:
    .string ""
.LC1:
    .string "%d"
.LC2:
    .string "basic_string::append"
.LC3:
    .string " "
.LC5:
    .string "Done in "
.LC6:
    .string "\n"
    # ------------------------------------------------------------------
    # Compiler-generated main() of tester.cpp (x86-64 ELF, -O2).
    # Stack objects (offsets from %rsp after the prologue):
    #   0    start timestamp            8    address of the temporary at 80
    #   16   str                        48   temporary returned by to_string
    #   80   temporary fed to str +=    120  stack canary
    # Loop registers: ebx = iterations left, r13 = &str, rbp = &temp(48),
    #   r15 = in-object buffer of temp(48), r12 = in-object buffer of
    #   temp(80), r14 = 0x7fffffffffffffff (limit checked before appending).
    # The .LEHBn/.LEHEn labels delimit the call ranges listed in the
    # exception table .LLSDA1871.
    # ------------------------------------------------------------------
    .section    .text.startup,"ax",@progbits
    .p2align 4,,15
    .globl  main
    .type   main, @function
main:
.LFB1871:
    .cfi_startproc
    .cfi_personality 0x9b,DW.ref.__gxx_personality_v0
    .cfi_lsda 0x1b,.LLSDA1871
    pushq   %r15
    .cfi_def_cfa_offset 16
    .cfi_offset 15, -16
    pushq   %r14
    .cfi_def_cfa_offset 24
    .cfi_offset 14, -24
    pushq   %r13
    .cfi_def_cfa_offset 32
    .cfi_offset 13, -32
    pushq   %r12
    .cfi_def_cfa_offset 40
    .cfi_offset 12, -40
    pushq   %rbp
    .cfi_def_cfa_offset 48
    .cfi_offset 6, -48
    pushq   %rbx
    .cfi_def_cfa_offset 56
    .cfi_offset 3, -56
    subq    $136, %rsp
    .cfi_def_cfa_offset 192
    leaq    16(%rsp), %r13
    movq    %fs:40, %rax
    movq    %rax, 120(%rsp)
    xorl    %eax, %eax
    # started = clock::now()
    call    _ZNSt6chrono3_V212system_clock3nowEv@PLT
    # Construct str from the empty literal .LC0 (first == last).
    leaq    .LC0(%rip), %rdx
    movq    %rax, (%rsp)
    leaq    16(%r13), %rax
    movq    %r13, %rdi
    movq    %rdx, %rsi
    movq    %rax, 16(%rsp)
.LEHB0:
    call    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.LEHE0:
    # str.reserve(20000000)
    movl    $20000000, %esi
    movq    %r13, %rdi
.LEHB1:
    call    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEm@PLT
.LEHE1:
    # Loop setup; the counter starts at 10000000 and counts down.
    leaq    48(%rsp), %rbp
    leaq    80(%rsp), %rax
    movl    $10000000, %ebx
    movabsq $9223372036854775807, %r14
    leaq    96(%rsp), %r12
    movq    %rax, 8(%rsp)
    leaq    16(%rbp), %r15
    jmp .L25
    .p2align 4,,10
    .p2align 3
.L21:
    # Source temporary owns a separate buffer: take over its pointer and
    # the word at offset 16.
    movq    %rcx, 80(%rsp)
    movq    16(%rax), %rcx
    movq    %rcx, 96(%rsp)
.L22:
    # Finish moving into temp(80): copy the length, then reset the source
    # to an empty string that points at its own in-object buffer.
    movq    8(%rax), %rcx
    movb    $0, 16(%rax)
    movq    %r13, %rdi
    movq    %rcx, 88(%rsp)
    movq    %rdx, (%rax)
    movq    $0, 8(%rax)
    # str.append(temp(80).data, temp(80).length)
    movq    80(%rsp), %rsi
    movq    88(%rsp), %rdx
.LEHB2:
    call    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm@PLT
.LEHE2:
    # Destroy both temporaries: operator delete is called only when the
    # data pointer is not the in-object buffer.
    movq    80(%rsp), %rdi
    cmpq    %r12, %rdi
    je  .L23
    call    _ZdlPv@PLT
.L23:
    movq    48(%rsp), %rdi
    cmpq    %r15, %rdi
    je  .L24
    call    _ZdlPv@PLT
.L24:
    subl    $1, %ebx
    je  .L40
.L25:
    # Loop body entry: temp(48) = __to_xstring(vsnprintf, 16, "%d", 1).
    # %eax = 0 tells the variadic callee that no vector registers are used.
    movq    vsnprintf@GOTPCREL(%rip), %rsi
    leaq    .LC1(%rip), %rcx
    movl    $1, %r8d
    movl    $16, %edx
    movq    %rbp, %rdi
    xorl    %eax, %eax
.LEHB3:
    call    _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.LEHE3:
    # Length already at the limit: throw length_error("basic_string::append").
    cmpq    %r14, 56(%rsp)
    je  .L41
    # temp(48).append(" ", 1); %rax returns the string that was appended to.
    leaq    .LC3(%rip), %rsi
    movl    $1, %edx
    movq    %rbp, %rdi
.LEHB4:
    call    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm@PLT
.LEHE4:
    # Move the result into temp(80). If the source data pointer equals its
    # in-object buffer (source + 16), copy those 16 bytes with one SSE
    # load/store pair; otherwise steal the pointer at .L21.
    movq    %r12, 80(%rsp)
    movq    (%rax), %rcx
    leaq    16(%rax), %rdx
    cmpq    %rdx, %rcx
    jne .L21
    movdqu  16(%rax), %xmm0
    movaps  %xmm0, 96(%rsp)
    jmp .L22
    .p2align 4,,10
    .p2align 3
.L40:
    # Loop finished: done = clock::now(); elapsed = done - started.
    call    _ZNSt6chrono3_V212system_clock3nowEv@PLT
    subq    (%rsp), %rax
    # Signed division of the elapsed count by 1000000 via multiply-high by
    # 4835703278458516699 (about 2^82 / 10^6) and an arithmetic shift.
    movabsq $4835703278458516699, %rdx
    leaq    .LC5(%rip), %rsi
    pxor    %xmm0, %xmm0
    leaq    _ZSt4cout(%rip), %rdi
    movq    %rax, %rcx
    imulq   %rdx
    sarq    $63, %rcx
    sarq    $18, %rdx
    subq    %rcx, %rdx
    cvtsi2sdq   %rdx, %xmm0
    movl    $8, %edx
    # secs = quotient / .LC4 (the double constant 1000.0)
    divsd   .LC4(%rip), %xmm0
    movsd   %xmm0, (%rsp)
.LEHB5:
    # std::cout << "Done in " << secs << "\n"
    call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l@PLT
    movsd   (%rsp), %xmm0
    leaq    _ZSt4cout(%rip), %rdi
    call    _ZNSo9_M_insertIdEERSoT_@PLT
    leaq    .LC6(%rip), %rsi
    movq    %rax, %rdi
    call    _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc@PLT
.LEHE5:
    # Destroy str (free only if its data is not the in-object buffer).
    movq    16(%rsp), %rdi
    addq    $16, %r13
    cmpq    %r13, %rdi
    je  .L26
    call    _ZdlPv@PLT
.L26:
    # return 0, after verifying the stack canary
    xorl    %eax, %eax
    movq    120(%rsp), %rbx
    xorq    %fs:40, %rbx
    jne .L42
    addq    $136, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 56
    popq    %rbx
    .cfi_def_cfa_offset 48
    popq    %rbp
    .cfi_def_cfa_offset 40
    popq    %r12
    .cfi_def_cfa_offset 32
    popq    %r13
    .cfi_def_cfa_offset 24
    popq    %r14
    .cfi_def_cfa_offset 16
    popq    %r15
    .cfi_def_cfa_offset 8
    ret
.L41:
    .cfi_restore_state
    leaq    .LC2(%rip), %rdi
.LEHB6:
    call    _ZSt20__throw_length_errorPKc@PLT
.LEHE6:
    # Exception landing pads (targets named in .LLSDA1871). Each saves the
    # exception object from %rax, frees whichever strings are live, and
    # resumes unwinding.
.L35:
    movq    %rax, %rbx
.L29:
    # free temp(48) if it owns a separate buffer
    movq    48(%rsp), %rdi
    addq    $16, %rbp
    cmpq    %rbp, %rdi
    je  .L31
    call    _ZdlPv@PLT
.L31:
    # free str if it owns a separate buffer
    movq    16(%rsp), %rdi
    addq    $16, %r13
    cmpq    %r13, %rdi
    je  .L32
    call    _ZdlPv@PLT
.L32:
    movq    %rbx, %rdi
.LEHB7:
    call    _Unwind_Resume@PLT
.LEHE7:
.L34:
    movq    %rax, %rbx
    jmp .L31
.L36:
    # free temp(80) if it owns a separate buffer, then continue at .L29
    movq    8(%rsp), %rdx
    movq    80(%rsp), %rdi
    movq    %rax, %rbx
    addq    $16, %rdx
    cmpq    %rdx, %rdi
    je  .L29
    call    _ZdlPv@PLT
    jmp .L29
.L42:
    call    __stack_chk_fail@PLT
    .cfi_endproc
.LFE1871:
    .globl  __gxx_personality_v0
    .section    .gcc_except_table,"a",@progbits
.LLSDA1871:
    .byte   0xff
    .byte   0xff
    .byte   0x1
    .uleb128 .LLSDACSE1871-.LLSDACSB1871
.LLSDACSB1871:
    .uleb128 .LEHB0-.LFB1871
    .uleb128 .LEHE0-.LEHB0
    .uleb128 0
    .uleb128 0
    .uleb128 .LEHB1-.LFB1871
    .uleb128 .LEHE1-.LEHB1
    .uleb128 .L34-.LFB1871
    .uleb128 0
    .uleb128 .LEHB2-.LFB1871
    .uleb128 .LEHE2-.LEHB2
    .uleb128 .L36-.LFB1871
    .uleb128 0
    .uleb128 .LEHB3-.LFB1871
    .uleb128 .LEHE3-.LEHB3
    .uleb128 .L34-.LFB1871
    .uleb128 0
    .uleb128 .LEHB4-.LFB1871
    .uleb128 .LEHE4-.LEHB4
    .uleb128 .L35-.LFB1871
    .uleb128 0
    .uleb128 .LEHB5-.LFB1871
    .uleb128 .LEHE5-.LEHB5
    .uleb128 .L34-.LFB1871
    .uleb128 0
    .uleb128 .LEHB6-.LFB1871
    .uleb128 .LEHE6-.LEHB6
    .uleb128 .L35-.LFB1871
    .uleb128 0
    .uleb128 .LEHB7-.LFB1871
    .uleb128 .LEHE7-.LEHB7
    .uleb128 0
    .uleb128 0
.LLSDACSE1871:
    .section    .text.startup
    .size   main, .-main
    # Compiler-generated static initialiser for this translation unit
    # (registered through .init_array below). Constructs the file-local
    # std::ios_base::Init object _ZStL8__ioinit, then tail-calls
    # __cxa_atexit(Init destructor, &_ZStL8__ioinit, &__dso_handle).
    .p2align 4,,15
    .type   _GLOBAL__sub_I_main, @function
_GLOBAL__sub_I_main:
.LFB2369:
    .cfi_startproc
    leaq    _ZStL8__ioinit(%rip), %rdi
    # The 8-byte adjustment keeps %rsp 16-byte aligned at the call.
    subq    $8, %rsp
    .cfi_def_cfa_offset 16
    call    _ZNSt8ios_base4InitC1Ev@PLT
    movq    _ZNSt8ios_base4InitD1Ev@GOTPCREL(%rip), %rdi
    leaq    __dso_handle(%rip), %rdx
    leaq    _ZStL8__ioinit(%rip), %rsi
    addq    $8, %rsp
    .cfi_def_cfa_offset 8
    # Tail call: __cxa_atexit returns directly to this function's caller.
    jmp __cxa_atexit@PLT
    .cfi_endproc
.LFE2369:
    .size   _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
    .section    .init_array,"aw"
    .align 8
    .quad   _GLOBAL__sub_I_main
    .local  _ZStL8__ioinit
    .comm   _ZStL8__ioinit,1,1
    .section    .rodata.cst8,"aM",@progbits,8
    .align 8
.LC4:
    .long   0
    .long   1083129856
    .hidden DW.ref.__gxx_personality_v0
    .weak   DW.ref.__gxx_personality_v0
    .section    .data.DW.ref.__gxx_personality_v0,"awG",@progbits,DW.ref.__gxx_personality_v0,comdat
    .align 8
    .type   DW.ref.__gxx_personality_v0, @object
    .size   DW.ref.__gxx_personality_v0, 8
DW.ref.__gxx_personality_v0:
    .quad   __gxx_personality_v0
    .hidden __dso_handle
    .ident  "GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0"
    .section    .note.GNU-stack,"",@progbits

Windows:

.file   "tester.cpp"
    .text
    # Compiler-generated exit handler (32-bit MinGW listing). It is the
    # function __GLOBAL__sub_I_main passes to atexit: loads the address of
    # the file-local ios_base::Init object into %ecx and tail-jumps to the
    # ios_base::Init destructor.
    .p2align 4,,15
    .def    ___tcf_0;   .scl    3;  .type   32; .endef
___tcf_0:
LFB2556:
    .cfi_startproc
    movl    $__ZStL8__ioinit, %ecx
    jmp __ZNSt8ios_base4InitD1Ev
    .cfi_endproc
LFE2556:
    .section .rdata,"dr"
    .align 4
LC0:
    .ascii "basic_string::_M_construct null not valid\0"
    .text
    # ------------------------------------------------------------------
    # Compiler-generated (32-bit MinGW). The mangled name decodes to
    #   std::__cxx11::basic_string<char>::_M_construct<char*>(
    #       char*, char*, std::forward_iterator_tag)
    # in a GCC ".isra" clone.
    # In:   %ecx = string object; first and last are on the stack
    #       (48(%esp) and 52(%esp) after the prologue).
    # Does: throws logic_error for a null first with non-null last, then
    #       copies [first, last) into the string's buffer, stores the
    #       length at offset 4 and writes the terminating NUL.
    # The callee removes its two stack arguments (ret $8).
    # ------------------------------------------------------------------
    .align 2
    .p2align 4,,15
    .def    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29;    .scl    3;  .type   32; .endef
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29:
LFB2587:
    .cfi_startproc
    pushl   %edi
    .cfi_def_cfa_offset 8
    .cfi_offset 7, -8
    pushl   %esi
    .cfi_def_cfa_offset 12
    .cfi_offset 6, -12
    movl    %ecx, %esi
    pushl   %ebx
    .cfi_def_cfa_offset 16
    .cfi_offset 3, -16
    subl    $32, %esp
    .cfi_def_cfa_offset 48
    # edi = first, ebx = last, esi = string object
    movl    48(%esp), %edi
    movl    52(%esp), %ebx
    # Reject first == null combined with last != null.
    testl   %edi, %edi
    jne L5
    testl   %ebx, %ebx
    je  L5
    movl    $LC0, (%esp)
    call    __ZSt19__throw_logic_errorPKc
    .p2align 4,,10
L5:
    # ebx = byte count; counts above 15 take the _M_create path at L22.
    subl    %edi, %ebx
    cmpl    $15, %ebx
    movl    %ebx, 28(%esp)
    ja  L22
    # Short range: reuse the buffer already pointed to by offset 0.
    movl    (%esi), %edx
    cmpl    $1, %ebx
    movl    %edx, %eax
    je  L23
    # Zero bytes fall through to L8; anything else goes to memcpy at L6.
    testl   %ebx, %ebx
    jne L6
L8:
    # Common tail: store length, NUL-terminate at buffer[length].
    movl    28(%esp), %eax
    movl    %eax, 4(%esi)
    movb    $0, (%edx,%eax)
    addl    $32, %esp
    .cfi_remember_state
    .cfi_def_cfa_offset 16
    popl    %ebx
    .cfi_restore 3
    .cfi_def_cfa_offset 12
    popl    %esi
    .cfi_restore 6
    .cfi_def_cfa_offset 8
    popl    %edi
    .cfi_restore 7
    .cfi_def_cfa_offset 4
    ret $8
    .p2align 4,,10
L22:
    .cfi_restore_state
    # Long range: _M_create(this, &length_slot, 0) returns the new buffer.
    leal    28(%esp), %eax
    movl    $0, 4(%esp)
    movl    %esi, %ecx
    movl    %eax, (%esp)
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj
    .cfi_def_cfa_offset 40
    # Re-reserve the 8 bytes of argument space the callee popped.
    subl    $8, %esp
    .cfi_def_cfa_offset 48
    # Store the buffer pointer; the value left in the slot goes to offset 8
    # (presumably the capacity - confirm against the libstdc++ headers).
    movl    %eax, (%esi)
    movl    28(%esp), %edx
    movl    %edx, 8(%esi)
L6:
    # memcpy(buffer in %eax, first, byte count)
    movl    %ebx, 8(%esp)
    movl    %edi, 4(%esp)
    movl    %eax, (%esp)
    call    _memcpy
    movl    (%esi), %edx
    jmp L8
    .p2align 4,,10
L23:
    # Exactly one byte: copy it inline instead of calling memcpy.
    movzbl  (%edi), %eax
    movb    %al, (%edx)
    movl    (%esi), %edx
    jmp L8
    .cfi_endproc
LFE2587:
    # The const char* instantiation is emitted as an alias of this body.
    .def    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21;   .scl    3;  .type   32; .endef
    .set    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21,__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
    # ------------------------------------------------------------------
    # Compiler-generated (32-bit MinGW). The mangled name decodes to
    #   __gnu_cxx::__to_xstring<std::string, char>(
    #       int (*)(char*, unsigned int, const char*, char*),
    #       unsigned int, const char*, ...)
    # Stack arguments relative to %ebp: 8 = result string slot,
    #   12 = formatter function pointer, 16 = buffer size,
    #   20 = format string, 24 onwards = variadic arguments.
    # Out:  %eax = result string slot.
    # Does: reserves a stack buffer, calls the formatter into it, then
    #       constructs the result string from the bytes written.
    # ------------------------------------------------------------------
    .section    .text$_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z,"x"
    .linkonce discard
    .p2align 4,,15
    .globl  __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
    .def    __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z;    .scl    2;  .type   32; .endef
__ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z:
LFB2177:
    .cfi_startproc
    pushl   %ebp
    .cfi_def_cfa_offset 8
    .cfi_offset 5, -8
    movl    %esp, %ebp
    .cfi_def_cfa_register 5
    pushl   %esi
    pushl   %ebx
    subl    $16, %esp
    .cfi_offset 6, -12
    .cfi_offset 3, -16
    # edx = buffer size, esi = result string slot
    movl    16(%ebp), %edx
    movl    8(%ebp), %esi
    # Reserve (size + 30) & -16 bytes of stack; ___chkstk_ms is called
    # with the byte count in %eax before %esp is lowered.
    leal    30(%edx), %eax
    andl    $-16, %eax
    call    ___chkstk_ms
    subl    %eax, %esp
    # Outgoing arguments for the formatter: (buffer, size, format,
    # address of the first variadic argument at 24(%ebp)).
    leal    24(%ebp), %eax
    leal    31(%esp), %ebx
    movl    %edx, 4(%esp)
    movl    %eax, 12(%esp)
    movl    20(%ebp), %eax
    # ebx = 16-byte aligned output buffer, above the outgoing-argument slots
    andl    $-16, %ebx
    movl    %ebx, (%esp)
    movl    %eax, 8(%esp)
    call    *12(%ebp)
    # Point the result at its own in-object buffer (result + 8), then
    # construct it from [buffer, buffer + returned length).
    leal    8(%esi), %edx
    addl    %ebx, %eax
    movl    %esi, %ecx
    movl    %edx, (%esi)
    movl    %eax, 4(%esp)
    movl    %ebx, (%esp)
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
    subl    $8, %esp
    leal    -8(%ebp), %esp
    movl    %esi, %eax
    popl    %ebx
    .cfi_restore 3
    popl    %esi
    .cfi_restore 6
    popl    %ebp
    .cfi_restore 5
    .cfi_def_cfa 4, 4
    ret
    .cfi_endproc
LFE2177:
    .def    ___main;    .scl    2;  .type   32; .endef
    .section .rdata,"dr"
LC1:
    .ascii "\0"
LC2:
    .ascii "%d\0"
LC3:
    .ascii "basic_string::append\0"
LC4:
    .ascii " \0"
    .def    ___divdi3;  .scl    2;  .type   32; .endef
LC6:
    .ascii "Done in \0"
LC7:
    .ascii "\12\0"
    # ------------------------------------------------------------------
    # Compiler-generated main() of tester.cpp (32-bit MinGW, -O2).
    # Stack objects (offsets from %ebp):
    #   -136/-132  start timestamp (64-bit, low/high halves)
    #   -96        str
    #   -72        temporary returned by to_string
    #   -48        temporary fed to str +=
    #   -128..-116 scratch words
    # Loop registers: edi = iterations left, esi = &temp(-72),
    #   ebx = in-object buffer of temp(-48).
    # Member functions receive the object pointer in %ecx and pop their
    # own stack arguments, hence the "subl $8, %esp" after such calls.
    # The LEHBn/LEHEn labels delimit the call ranges listed in the
    # exception table LLSDA2111.
    # ------------------------------------------------------------------
    .section    .text.startup,"x"
    .p2align 4,,15
    .globl  _main
    .def    _main;  .scl    2;  .type   32; .endef
_main:
LFB2111:
    .cfi_startproc
    .cfi_personality 0,___gxx_personality_v0
    .cfi_lsda 0,LLSDA2111
    # Realign the stack to 16 bytes, keeping the original %esp reachable
    # through %ecx.
    leal    4(%esp), %ecx
    .cfi_def_cfa 1, 0
    andl    $-16, %esp
    pushl   -4(%ecx)
    pushl   %ebp
    .cfi_escape 0x10,0x5,0x2,0x75,0
    movl    %esp, %ebp
    pushl   %edi
    pushl   %esi
    pushl   %ebx
    pushl   %ecx
    .cfi_escape 0xf,0x3,0x75,0x70,0x6
    .cfi_escape 0x10,0x7,0x2,0x75,0x7c
    .cfi_escape 0x10,0x6,0x2,0x75,0x78
    .cfi_escape 0x10,0x3,0x2,0x75,0x74
    subl    $152, %esp
    # NOTE(review): ___main is presumably the runtime hook that runs global
    # constructors on this target - confirm.
    call    ___main
    # started = clock::now(); the 64-bit result comes back in %edx:%eax.
    call    __ZNSt6chrono3_V212system_clock3nowEv
    # Construct str from the empty literal LC1 (first == last).
    leal    -96(%ebp), %ecx
    movl    %eax, -136(%ebp)
    leal    -88(%ebp), %eax
    movl    $LC1, 4(%esp)
    movl    $LC1, (%esp)
    movl    %edx, -132(%ebp)
    movl    %eax, -96(%ebp)
LEHB0:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21
LEHE0:
    # str.reserve(20000000)
    leal    -96(%ebp), %ecx
    subl    $8, %esp
    movl    $20000000, (%esp)
LEHB1:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj
LEHE1:
    # Loop setup; the counter starts at 10000000 and counts down.
    subl    $4, %esp
    movl    $10000000, %edi
    leal    -72(%ebp), %esi
    leal    -40(%ebp), %ebx
    jmp L32
    .p2align 4,,10
L28:
    # Source temporary owns a separate buffer: take over its pointer and
    # the word at offset 8.
    movl    %ecx, -48(%ebp)
    movl    8(%eax), %ecx
    movl    %ecx, -40(%ebp)
L29:
    # Finish moving into temp(-48): copy the length, then reset the source
    # to an empty string that points at its own in-object buffer.
    movl    4(%eax), %ecx
    movb    $0, 8(%eax)
    movl    %ecx, -44(%ebp)
    movl    %edx, (%eax)
    leal    -96(%ebp), %ecx
    movl    $0, 4(%eax)
    # str.append(temp(-48).data, temp(-48).length)
    movl    -44(%ebp), %eax
    movl    %eax, 4(%esp)
    movl    -48(%ebp), %eax
    movl    %eax, (%esp)
LEHB2:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE2:
    # Destroy both temporaries: operator delete is called only when the
    # data pointer is not the in-object buffer.
    movl    -48(%ebp), %eax
    subl    $8, %esp
    cmpl    %ebx, %eax
    je  L30
    movl    %eax, (%esp)
    call    __ZdlPv
L30:
    movl    -72(%ebp), %eax
    leal    -64(%ebp), %edx
    cmpl    %edx, %eax
    je  L31
    movl    %eax, (%esp)
    call    __ZdlPv
L31:
    subl    $1, %edi
    je  L46
L32:
    # Loop body entry: temp(-72) = __to_xstring(_vsnprintf, 16, "%d", 1),
    # with all arguments passed on the stack.
    movl    $1, 16(%esp)
    movl    $LC2, 12(%esp)
    movl    $16, 8(%esp)
    movl    $_vsnprintf, 4(%esp)
    movl    %esi, (%esp)
LEHB3:
    call    __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
LEHE3:
    # Length already at the limit: throw length_error("basic_string::append").
    cmpl    $2147483647, -68(%ebp)
    je  L47
    # temp(-72).append(" ", 1); %eax returns the string that was appended to.
    movl    $1, 4(%esp)
    movl    $LC4, (%esp)
    movl    %esi, %ecx
LEHB4:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE4:
    # Move the result into temp(-48). If the source data pointer equals its
    # in-object buffer (source + 8), copy those 16 bytes; otherwise steal
    # the pointer at L28.
    movl    %ebx, -48(%ebp)
    movl    (%eax), %ecx
    leal    8(%eax), %edx
    subl    $8, %esp
    cmpl    %edx, %ecx
    jne L28
    # 16-byte copy done as four 32-bit loads/stores, three of them bounced
    # through scratch stack slots (no SSE registers are used here).
    movl    12(%eax), %ecx
    movl    %ecx, -120(%ebp)
    movl    16(%eax), %ecx
    movl    %ecx, -124(%ebp)
    movl    20(%eax), %ecx
    movl    %ecx, -128(%ebp)
    movl    8(%eax), %ecx
    movl    %ecx, -40(%ebp)
    movl    -120(%ebp), %ecx
    movl    %ecx, -36(%ebp)
    movl    -124(%ebp), %ecx
    movl    %ecx, -32(%ebp)
    movl    -128(%ebp), %ecx
    movl    %ecx, -28(%ebp)
    jmp L29
    .p2align 4,,10
L46:
    # Loop finished: done = clock::now(); 64-bit subtract of the start time.
    call    __ZNSt6chrono3_V212system_clock3nowEv
    subl    -136(%ebp), %eax
    movl    $1000000, 8(%esp)
    sbbl    -132(%ebp), %edx
    movl    $0, 12(%esp)
    movl    %eax, (%esp)
    movl    %edx, 4(%esp)
    # 64-bit division of the elapsed count by 1000000 via the ___divdi3 helper.
    call    ___divdi3
    movl    %eax, -120(%ebp)
    movl    %edx, -116(%ebp)
    # Load the quotient on the x87 stack and divide by LC5 (float 1000.0).
    fildq   -120(%ebp)
    movl    $8, 8(%esp)
    movl    $LC6, 4(%esp)
    movl    $__ZSt4cout, (%esp)
    fdivs   LC5
    fstpl   -120(%ebp)
LEHB5:
    # std::cout << "Done in " << secs << "\n"
    call    __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
    fldl    -120(%ebp)
    movl    $__ZSt4cout, %ecx
    fstpl   (%esp)
    call    __ZNSo9_M_insertIdEERSoT_
    subl    $8, %esp
    movl    $LC7, 4(%esp)
    movl    %eax, (%esp)
    call    __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
LEHE5:
    # Destroy str (free only if its data is not the in-object buffer).
    movl    -96(%ebp), %eax
    leal    -88(%ebp), %edi
    cmpl    %edi, %eax
    je  L43
    movl    %eax, (%esp)
    call    __ZdlPv
L43:
    # return 0; undo the realigning prologue.
    leal    -16(%ebp), %esp
    xorl    %eax, %eax
    popl    %ecx
    .cfi_remember_state
    .cfi_restore 1
    .cfi_def_cfa 1, 0
    popl    %ebx
    .cfi_restore 3
    popl    %esi
    .cfi_restore 6
    popl    %edi
    .cfi_restore 7
    popl    %ebp
    .cfi_restore 5
    leal    -4(%ecx), %esp
    .cfi_def_cfa 4, 4
    ret
L47:
    .cfi_restore_state
    movl    $LC3, (%esp)
LEHB6:
    call    __ZSt20__throw_length_errorPKc
LEHE6:
    # Exception landing pads (targets named in LLSDA2111). Each saves the
    # exception object from %eax, frees whichever strings are live, and
    # resumes unwinding.
L41:
    movl    %eax, %ebx
L36:
    # free temp(-72) if it owns a separate buffer
    movl    -72(%ebp), %eax
    leal    -64(%ebp), %edx
    cmpl    %edx, %eax
    je  L38
    movl    %eax, (%esp)
    call    __ZdlPv
L38:
    # free str if it owns a separate buffer
    movl    -96(%ebp), %eax
    leal    -88(%ebp), %edi
    cmpl    %edi, %eax
    je  L39
    movl    %eax, (%esp)
    call    __ZdlPv
L39:
    movl    %ebx, (%esp)
LEHB7:
    call    __Unwind_Resume
LEHE7:
L42:
    # free temp(-48) if it owns a separate buffer, then continue at L36
    movl    %eax, %esi
    movl    -48(%ebp), %eax
    cmpl    %ebx, %eax
    je  L35
    movl    %eax, (%esp)
    call    __ZdlPv
L35:
    movl    %esi, %ebx
    jmp L36
L40:
    movl    %eax, %ebx
    jmp L38
    .cfi_endproc
LFE2111:
    .def    ___gxx_personality_v0;  .scl    2;  .type   32; .endef
    .section    .gcc_except_table,"w"
LLSDA2111:
    .byte   0xff
    .byte   0xff
    .byte   0x1
    .uleb128 LLSDACSE2111-LLSDACSB2111
LLSDACSB2111:
    .uleb128 LEHB0-LFB2111
    .uleb128 LEHE0-LEHB0
    .uleb128 0
    .uleb128 0
    .uleb128 LEHB1-LFB2111
    .uleb128 LEHE1-LEHB1
    .uleb128 L40-LFB2111
    .uleb128 0
    .uleb128 LEHB2-LFB2111
    .uleb128 LEHE2-LEHB2
    .uleb128 L42-LFB2111
    .uleb128 0
    .uleb128 LEHB3-LFB2111
    .uleb128 LEHE3-LEHB3
    .uleb128 L40-LFB2111
    .uleb128 0
    .uleb128 LEHB4-LFB2111
    .uleb128 LEHE4-LEHB4
    .uleb128 L41-LFB2111
    .uleb128 0
    .uleb128 LEHB5-LFB2111
    .uleb128 LEHE5-LEHB5
    .uleb128 L40-LFB2111
    .uleb128 0
    .uleb128 LEHB6-LFB2111
    .uleb128 LEHE6-LEHB6
    .uleb128 L41-LFB2111
    .uleb128 0
    .uleb128 LEHB7-LFB2111
    .uleb128 LEHE7-LEHB7
    .uleb128 0
    .uleb128 0
LLSDACSE2111:
    .section    .text.startup,"x"
    # Compiler-generated static initialiser for this translation unit
    # (its address is placed in .ctors below). Constructs the file-local
    # std::ios_base::Init object, passing its address in %ecx, and
    # registers ___tcf_0 with atexit.
    .p2align 4,,15
    .def    __GLOBAL__sub_I_main;   .scl    3;  .type   32; .endef
__GLOBAL__sub_I_main:
LFB2557:
    .cfi_startproc
    subl    $28, %esp
    .cfi_def_cfa_offset 32
    movl    $__ZStL8__ioinit, %ecx
    call    __ZNSt8ios_base4InitC1Ev
    movl    $___tcf_0, (%esp)
    call    _atexit
    addl    $28, %esp
    .cfi_def_cfa_offset 4
    ret
    .cfi_endproc
LFE2557:
    .section    .ctors,"w"
    .align 4
    .long   __GLOBAL__sub_I_main
.lcomm __ZStL8__ioinit,1,1
    .section .rdata,"dr"
    .align 4
LC5:
    .long   1148846080
    .ident  "GCC: (MinGW.org GCC-6.3.0-1) 6.3.0"
    .def    __ZNSt8ios_base4InitD1Ev;   .scl    2;  .type   32; .endef
    .def    __ZSt19__throw_logic_errorPKc;  .scl    2;  .type   32; .endef
    .def    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj;   .scl    2;  .type   32; .endef
    .def    _memcpy;    .scl    2;  .type   32; .endef
    .def    __ZNSt6chrono3_V212system_clock3nowEv;  .scl    2;  .type   32; .endef
    .def    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj;   .scl    2;  .type   32; .endef
    .def    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj;  .scl    2;  .type   32; .endef
    .def    __ZdlPv;    .scl    2;  .type   32; .endef
    .def    _vsnprintf; .scl    2;  .type   32; .endef
    .def    __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i; .scl    2;  .type   32; .endef
    .def    __ZNSo9_M_insertIdEERSoT_;  .scl    2;  .type   32; .endef
    .def    __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc;   .scl    2;  .type   32; .endef
    .def    __ZSt20__throw_length_errorPKc; .scl    2;  .type   32; .endef
    .def    __Unwind_Resume;    .scl    2;  .type   32; .endef
    .def    __ZNSt8ios_base4InitC1Ev;   .scl    2;  .type   32; .endef
    .def    _atexit;    .scl    2;  .type   32; .endef

Upvotes: 4

Views: 408

Answers (5)

Amit G.
Amit G.

Reputation: 2674

(Just for the proportions) Windows Release target vs. Debug target on Visual Studio C++: By default, the Debug target is compiled without optimization, while the Release target is compiled with /O2 optimization, with /Oi ("Enable Intrinsic Functions"), & with /GL ("Whole Program Optimization"). Your code, on my workstation, Debug x64 vs Release x64:

Debug: 70 sec.

Release: 0.27 sec.

You build with MinGW (which I am not familiar with). But from a fast search, there is a talk about Debug/Release mode ...and MinGW has equivalent /O2 optimization, /Oi ("Enable Intrinsic Functions"), and /Og ("Enable Global Optimization") flags, it seems.

-

Compile with these 3 flags (x64 target), & compare with the VS Release x64 benchmark. Anyway, this is MS default compile optimization for a Release target.

-

Test Environment: HP 8100, Windows 10 Pro 64 bit, CPU i7 870, 16 GB DDR3 RAM, Visual Studio 2017, Targets: Debug x64 / Release x64

Upvotes: 0

Darklighter
Darklighter

Reputation: 2192

The mingw.org implementation just seems to be much more inefficient than linux, Visual Studio or mingw-w64.org.

>g++ --version
g++ (MinGW.org GCC-6.3.0-1) 6.3.0

Done in 24.808

enter image description here

>g++ --version
g++ (i686-posix-dwarf-rev2, Built by MinGW-W64 project) 6.3.0

Done in 0.679

enter image description here

Upvotes: 3

Victor Gubin
Victor Gubin

Reputation: 2937

Tested with MSYS2 MinGW64:

g++ --version
g++.exe (Rev2, Built by MSYS2 project) 7.3.0

g++.exe -Wall -O3 -mtune=native -fno-exceptions -fno-rtti -c main.cpp -o main.o
g++.exe  -o test.exe main.o -s  

Done in 0.547

Env: Windows 10 x64, CPU: Intel Core i5-6300U, 2.4 GHz, RAM: 16 GB DDR4

In any case, MinGW uses msvcrt.dll instead of GNU libc (the one bundled with Windows, not the Universal CRT, the Visual Studio CRT, etc.), so in my experience the speed gap may come from the C standard library.

P.S. with some changes (same compiler flags)

#include <iostream>
#include <chrono>

#ifdef _WIN32
#include <windows.h>
// Returns the system page size in bytes, as reported by GetSystemInfo().
static std::size_t page_size() noexcept {
    ::SYSTEM_INFO si;
    ::GetSystemInfo(&si);
    return si.dwPageSize;
}
#else
#include <sys/types.h>
#include <unistd.h>
// Returns the system page size in bytes, as reported by
// sysconf(_SC_PAGESIZE).
//
// sysconf() returns -1 on failure. Casting that directly to std::size_t
// would produce a huge value and corrupt any alignment mask derived from
// it, so a non-positive result falls back to a conventional 4 KiB page.
static std::size_t page_size() noexcept {
    constexpr std::size_t kFallbackPageSize = 4096;
    const long reported = ::sysconf(_SC_PAGESIZE);
    return reported > 0 ? static_cast<std::size_t>(reported)
                        : kFallbackPageSize;
}
#endif // _WIN32


// Variant of the benchmark that converts the number to text once, outside
// the loop, and appends it with append()/push_back() so that no temporary
// strings are created per iteration. Prints the elapsed time in seconds.
// NOTE(review): std::string / std::to_string are used without an explicit
// #include <string> in this snippet.
int main(int argc, char const *argv[]) {

    auto started = std::chrono::high_resolution_clock::now();
    const std::size_t n = 10000000;
    // align size to page boundary
    // al is a low-bit mask (page size - 1); adding it and clearing those
    // bits rounds 2 * n up to the next multiple of the page size.
    const std::size_t al = page_size() - 1;
    const std::size_t buff_size = ( (n << 1) + al) & ~al;
    std::string str;
    str.reserve(buff_size);

    // Formatted once here instead of once per loop iteration.
    const std::string to_append( std::to_string(1) );

    for (std::size_t i = 0; i < n; ++i) {
        str.append( to_append );
        str.push_back(' ');
    }

    auto done = std::chrono::high_resolution_clock::now();
    // Milliseconds as an integer count, then converted to fractional seconds.
    double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
    std::cout << "Done in " << secs << "\n";
    return 0;
}

Done in 0.046

Asm output for main function:

# ----------------------------------------------------------------------
# Compiler-generated main() of the modified benchmark (x86-64 MinGW-w64,
# Microsoft x64 calling convention: arguments in rcx, rdx, r8, r9, fifth
# argument at 32(%rsp)).
# Stack objects (offsets from %rsp):
#   64  str (data pointer 64, length 72, in-object buffer/capacity at 80)
#   96  first the SYSTEM_INFO filled by GetSystemInfo, afterwards the
#       to_append string (data pointer 96, length 104)
#   56  scratch slot for the computed seconds value
# Loop registers: rsi = iterations left, rbp = &str, r12 = in-object
#   buffer of str, r13 = &to_append, r14 = start timestamp.
# ----------------------------------------------------------------------
main:
    pushq   %r14
    .seh_pushreg    %r14
    pushq   %r13
    .seh_pushreg    %r13
    pushq   %r12
    .seh_pushreg    %r12
    pushq   %rbp
    .seh_pushreg    %rbp
    pushq   %rdi
    .seh_pushreg    %rdi
    pushq   %rsi
    .seh_pushreg    %rsi
    pushq   %rbx
    .seh_pushreg    %rbx
    subq    $144, %rsp
    .seh_stackalloc 144
    .seh_endprologue
    movl    $10000000, %esi
    call    __main
    leaq    96(%rsp), %r13
    leaq    64(%rsp), %rbp
    # started = clock::now()
    call    _ZNSt6chrono3_V212system_clock3nowEv
    movq    %r13, %rcx
    leaq    16(%rbp), %r12
    movq    %rax, %r14
    # page_size() inlined: GetSystemInfo(&si), then read the 32-bit field
    # at offset 4 of the structure.
    call    *__imp_GetSystemInfo(%rip)
    movl    100(%rsp), %eax
    movq    %rbp, %rcx
    # Default-construct str: pointer to the in-object buffer, length 0, NUL.
    movq    %r12, 64(%rsp)
    movq    $0, 72(%rsp)
    # rdx = (20000000 + page - 1) & -page, i.e. 2 * n rounded up to a
    # multiple of the page size.
    leaq    19999999(%rax), %rdx
    negq    %rax
    movb    $0, 80(%rsp)
    andq    %rax, %rdx
    call    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEy
    # to_append = __to_xstring(vsnprintf wrapper, 16, .LC0, 1); .LC0 is
    # presumably the "%d" format string (its definition is not shown).
    movl    $1, 32(%rsp)
    movq    %r13, %rcx
    leaq    .LC0(%rip), %r9
    movl    $16, %r8d
    leaq    _ZL9vsnprintfPcyPKcS_(%rip), %rdx
    call    _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_yPKS8_PcEySB_z
    jmp .L14
    .p2align 4,,10
.L16:
    # Fast path of push_back: room is available, store ' ' at data[len].
    movb    $32, (%rdx,%rbx)
.L26:
    # Commit the new length (len + 1) and write the terminating NUL.
    movq    64(%rsp), %rax
    movq    %rdi, 72(%rsp)
    movb    $0, 1(%rax,%rbx)
    subq    $1, %rsi
    je  .L27
.L14:
    # Loop body entry: str.append(to_append.data, to_append.length)
    movq    96(%rsp), %rdx
    movq    104(%rsp), %r8
    movq    %rbp, %rcx
    call    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcy
    # Inlined push_back(' '): rbx = current length, rdi = length + 1,
    # rax = capacity (15 when the in-object buffer is in use, otherwise the
    # value stored at offset 16 of str).
    movq    72(%rsp), %rbx
    movq    64(%rsp), %rdx
    movl    $15, %eax
    leaq    1(%rbx), %rdi
    cmpq    %r12, %rdx
    je  .L15
    movq    80(%rsp), %rax
.L15:
    cmpq    %rax, %rdi
    jbe .L16
    # Slow path: no room left, call _M_mutate(str, len, 0, nullptr, 1)
    # before storing the character.
    xorl    %r9d, %r9d
    xorl    %r8d, %r8d
    movq    %rbx, %rdx
    movq    %rbp, %rcx
    movq    $1, 32(%rsp)
    call    _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_mutateEyyPKcy
    movq    64(%rsp), %rax
    movb    $32, (%rax,%rbx)
    jmp .L26
    .p2align 4,,10
.L27:
    # Loop finished: done = clock::now(); elapsed = done - started.
    call    _ZNSt6chrono3_V212system_clock3nowEv
    pxor    %xmm1, %xmm1
    movl    $8, %r8d
    # Signed division of the elapsed count by 1000000 via multiply-high by
    # 4835703278458516699 (about 2^82 / 10^6) and an arithmetic shift.
    movabsq $4835703278458516699, %rdx
    subq    %r14, %rax
    # r13 now addresses to_append's in-object buffer for the cleanup check.
    addq    $16, %r13
    movq    %rax, %rcx
    imulq   %rdx
    sarq    $63, %rcx
    sarq    $18, %rdx
    subq    %rcx, %rdx
    movq    .refptr._ZSt4cout(%rip), %rcx
    cvtsi2sdq   %rdx, %xmm1
    leaq    .LC2(%rip), %rdx
    # secs = quotient / .LC1 (constant defined outside this excerpt)
    divsd   .LC1(%rip), %xmm1
    movsd   %xmm1, 56(%rsp)
    # Stream insertions: an 8-character literal (.LC2), the double, then .LC3.
    call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_x
    movsd   56(%rsp), %xmm1
    movq    .refptr._ZSt4cout(%rip), %rcx
    call    _ZNSo9_M_insertIdEERSoT_
    leaq    .LC3(%rip), %rdx
    movq    %rax, %rcx
    call    _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
    # Destroy to_append and str: operator delete is called only when the
    # data pointer is not the in-object buffer.
    movq    96(%rsp), %rcx
    cmpq    %r13, %rcx
    je  .L19
    call    _ZdlPv
.L19:
    movq    64(%rsp), %rcx
    addq    $16, %rbp
    cmpq    %rbp, %rcx
    je  .L20
    call    _ZdlPv
.L20:
    # return 0
    xorl    %eax, %eax
    addq    $144, %rsp
    popq    %rbx
    popq    %rsi
    popq    %rdi
    popq    %rbp
    popq    %r12
    popq    %r13
    popq    %r14
    ret
    .seh_endproc
    .p2align 4,,15
    .def    _GLOBAL__sub_I_main;    .scl    3;  .type   32; .endef
    .seh_proc   _GLOBAL__sub_I_main

Upvotes: 1

joe_chip
joe_chip

Reputation: 2558

Quick look at disassembly shows that Windows version uses movl (i. e. long word, 32 bit move) and Linux version uses movq (quad word, 64 bit) and SSE registers xmm.

My bet is that on Linux, you compile for x86-64, while on Windows you target 32 bit x86.

x86-64 includes SSE2 extension, while x86 does not, so MinGW defaults to no-SSE mode.

If that's the case, building with 64 bit toolchain on Windows should result in comparable performance. Alternatively, you might enable SSE for 32 bit builds (-msse2 compiler flag, if I remember correctly).

Upvotes: 3

borisbn
borisbn

Reputation: 5054

I tried your code on my Windows machine with MinGW 4.8.0 and got ~20 seconds. When I changed the string concatenation to std::stringstream I got 0.5 seconds:

...
std::stringstream ss;

for (int i = 0; i < n; ++i) {
    //str += std::to_string(a) + " ";
    ss << a << " ";
}
str = ss.str();
...

Upvotes: -1

Related Questions