Identical functions have different performance, why?

Question

I made a program to test performance of different functions, that do the same in different ways. However, some of them generate the almost exact same machine code except for different addresses and when i measure the execution time one of them is faster than the other. This happens consistently in almost every run and i have no idea why. Strange is also that when i remove some other test functions, the difference dispersal. I wrote the code for another answer on StackOverflow and want to understand why.

I run on Debian 10 Buster, AMD64 and use GCC. I compile it with -O3. Below is my code. The function lowerA() is almost every time faster than lower9(), despite the same code If i remove the unused functions lower1()-lower8() the difference goes away.

#include 
#include 
#include 
#include 
#include 
#include 

#include 


//#define DEBUG


#ifdef DEBUG
  #define N 10
#else
  #define N 1000UL*100
#endif

#define M 20

#define STR_(x) #x
#define STR(x) STR_(x)



void lower1(char *s)
{
  size_t i;

  for (i = 0; i < strlen(s); i++)
  {
    if (s[i] >= 'A' && s[i] <= 'Z')
    {
      s[i] -= ('A' - 'a');
    }
  }
}

void lower2(char *s)
{
  size_t i;
  size_t len = strlen(s);

  for (i = 0; i < len; i++)
  {
    if (s[i] >= 'A' && s[i] <= 'Z')
    {
      s[i] -= ('A' - 'a');
    }
  }
}

void lower3(char *s)
{
  size_t i;
  size_t len = strlen(s);
  int d='A'-'a';

  for (i = 0; i < len; i++)
  {
    if (s[i] >= 'A' && s[i] <= 'Z')
    {
      s[i] -= d;
    }
  }
}

void lower4(char *s)
{
  size_t i;
  size_t len = strlen(s);

  for (i = 0; i < len; i++)
  {
    if (s[i] >= 'A' && s[i] <= 'Z')
    {
      s[i] += 0x20;
    }
  }
}

void lower5(char *s)
{
  size_t i;

  for (i = 0; i < strlen(s); i++)
  {
    if (s[i] >= 'A' && s[i] <= 'Z')
    {
      s[i] += ('a' - 'A');
    }
  }
}

void lower6(char *s)
{
  size_t i;
  for (i = 0; i < strlen(s); i++)
  {
    if (s[i] >= 'A' && s[i] <= 'Z')
    {
      s[i] |= 0x20;
    }
  }
}

void lower7(char *s)
{
  size_t i;
  size_t len = strlen(s);

  for (i = 0; i < len; i++)
  {
    if (s[i] >= 'A' && s[i] <= 'Z')
    {
      s[i] |= 0x20;
    }
  }
}

void lower8(char *s)
{
  size_t len = strlen(s);
  while(len--)
    {
      if (*s >= 'A' && *s <= 'Z')
      {
        *s |= 0x20;
      }
      s++;
    }
}

void lower9(char *s)
{
  while(1)
  {
    if (!*s)
    {
      break;
    }
    if (*s >= 'A' && *s <= 'Z')
    {
      *s |= 0x20;
    }
    s++;
  }
}

void lowerA(char *s)
{
  while(*s)
  {
    if (*s >= 'A' && *s <= 'Z')
    {
      *s |= 0x20;
    }
    s++;
  }
}

uint64_t die(const char *msg)
  {
    fprintf(stderr,"die: %s : %s
",msg,strerror(errno));
    exit(1);
  }

uint64_t getTime(void)
  {
    uint64_t time;
    struct timespec  t_v;
    if(clock_gettime(CLOCK_BOOTTIME,&t_v)<0)
      {
        die("cant get time");
      }
    time=t_v.tv_sec*1000000000ULL;
    time+=t_v.tv_nsec;
    return time;
  }
  

void test(void (*fp)(char *),char (*s)[M],const char *name)
  {
    static char (*copy)[M];
    copy=malloc(N*M);
    if(!copy)
      {
        die("can't alloc memory");
      }
    memcpy(copy,s,N*M);
    uint64_t start=getTime();
    for(size_t u=0;u


This is the disassembly of the function lower9() and lowerA():
Dump of assembler code for function lower9:
   0x00000000000017b0 <+0>: movzbl (%rdi),%eax
   0x00000000000017b3 <+3>: test   %al,%al
   0x00000000000017b5 <+5>: je     0x17eb 
   0x00000000000017b7 <+7>: nopw   0x0(%rax,%rax,1)
   0x00000000000017c0 <+16>:    lea    -0x41(%rax),%edx
   0x00000000000017c3 <+19>:    cmp    $0x19,%dl
   0x00000000000017c6 <+22>:    ja     0x17e0 
   0x00000000000017c8 <+24>:    or     $0x20,%eax
   0x00000000000017cb <+27>:    add    $0x1,%rdi
   0x00000000000017cf <+31>:    mov    %al,-0x1(%rdi)
   0x00000000000017d2 <+34>:    movzbl (%rdi),%eax
   0x00000000000017d5 <+37>:    test   %al,%al
   0x00000000000017d7 <+39>:    jne    0x17c0 
   0x00000000000017d9 <+41>:    retq   
   0x00000000000017da <+42>:    nopw   0x0(%rax,%rax,1)
   0x00000000000017e0 <+48>:    add    $0x1,%rdi
   0x00000000000017e4 <+52>:    movzbl (%rdi),%eax
   0x00000000000017e7 <+55>:    test   %al,%al
   0x00000000000017e9 <+57>:    jne    0x17c0 
   0x00000000000017eb <+59>:    retq   
End of assembler dump.
Dump of assembler code for function lowerA:
   0x00000000000017f0 <+0>: movzbl (%rdi),%eax
   0x00000000000017f3 <+3>: test   %al,%al
   0x00000000000017f5 <+5>: je     0x182b 
   0x00000000000017f7 <+7>: nopw   0x0(%rax,%rax,1)
   0x0000000000001800 <+16>:    lea    -0x41(%rax),%edx
   0x0000000000001803 <+19>:    cmp    $0x19,%dl
   0x0000000000001806 <+22>:    ja     0x1820 
   0x0000000000001808 <+24>:    or     $0x20,%eax
   0x000000000000180b <+27>:    add    $0x1,%rdi
   0x000000000000180f <+31>:    mov    %al,-0x1(%rdi)
   0x0000000000001812 <+34>:    movzbl (%rdi),%eax
   0x0000000000001815 <+37>:    test   %al,%al
   0x0000000000001817 <+39>:    jne    0x1800 
   0x0000000000001819 <+41>:    retq   
   0x000000000000181a <+42>:    nopw   0x0(%rax,%rax,1)
   0x0000000000001820 <+48>:    add    $0x1,%rdi
   0x0000000000001824 <+52>:    movzbl (%rdi),%eax
   0x0000000000001827 <+55>:    test   %al,%al
   0x0000000000001829 <+57>:    jne    0x1800 
   0x000000000000182b <+59>:    retq   
End of assembler dump.

One example result, sorted:
time       4145299 lowerA
time       4153573 lowerA
time       4155081 lowerA
time       4158537 lowerA
time       4173954 lowerA
time       4190982 lowerA
time       4196201 lowerA
time       4202252 lower9
time       4209932 lower9
time       4214722 lowerA
time       4215861 lower9
time       4216471 lower9
time       4243532 lower9
time       4315601 lower9
time       4323003 lower9
time       4331462 lower9

Why is lowerA() a tiny bit faster?
Edit
with clang i do not get the same behavior.
Edit2
Swapping the calls for lower9() and lowerA() makes lower9() faster.

(Editor's note: as described in Idiomatic way of performance evaluation?, that can mean insufficient warm-up, or in this case more likely that it comes down to some fiddly difference in code alignment after inlining.  Especially given that clang showed no difference.)
Edit3
I use a Intel Xeon CPU W3570  @ 3.20GHz, Bloomfield microarchitecture, aka Nehalem server.
Edit4
gcc --version gives gcc (Debian 8.3.0-6) 8.3.0
Peter Cordes said the code could be inlined, i think he is correct, this is the dissasemply of runTest():
Dump of assembler code for function runTest:
   0x00000000000019d0 <+0>: push   %r15
   0x00000000000019d2 <+2>: mov    $0x7,%edi
   0x00000000000019d7 <+7>: push   %r14
   0x00000000000019d9 <+9>: push   %r13
   0x00000000000019db <+11>:    push   %r12
   0x00000000000019dd <+13>:    push   %rbp
   0x00000000000019de <+14>:    push   %rbx
   0x00000000000019df <+15>:    sub    $0x58,%rsp
   0x00000000000019e3 <+19>:    mov    %rsp,%rsi
   0x00000000000019e6 <+22>:    callq  0x1050 
   0x00000000000019eb <+27>:    test   %eax,%eax
   0x00000000000019ed <+29>:    js     0x1cd1 
   0x00000000000019f3 <+35>:    imul   $0x3b9aca00,(%rsp),%rdi
   0x00000000000019fb <+43>:    add    0x8(%rsp),%rdi
   0x0000000000001a00 <+48>:    xor    %r14d,%r14d
   0x0000000000001a03 <+51>:    xor    %r13d,%r13d
   0x0000000000001a06 <+54>:    callq  0x1080 
   0x0000000000001a0b <+59>:    nopl   0x0(%rax,%rax,1)
   0x0000000000001a10 <+64>:    callq  0x10e0 
   0x0000000000001a15 <+69>:    mov    %eax,%ebx
   0x0000000000001a17 <+71>:    mov    $0x66666667,%eax
   0x0000000000001a1c <+76>:    imul   %ebx
   0x0000000000001a1e <+78>:    mov    %ebx,%eax
   0x0000000000001a20 <+80>:    sar    $0x1f,%eax
   0x0000000000001a23 <+83>:    sar    $0x3,%edx
   0x0000000000001a26 <+86>:    sub    %eax,%edx
   0x0000000000001a28 <+88>:    lea    (%rdx,%rdx,4),%eax
   0x0000000000001a2b <+91>:    shl    $0x2,%eax
   0x0000000000001a2e <+94>:    sub    %eax,%ebx
   0x0000000000001a30 <+96>:    movslq %ebx,%rbx
   0x0000000000001a33 <+99>:    test   %rbx,%rbx
   0x0000000000001a36 <+102>:   je     0x1a78 
   0x0000000000001a38 <+104>:   lea    0x2681(%rip),%rax        # 0x40c0 
   0x0000000000001a3f <+111>:   mov    $0x1b4e81b5,%ebp
   0x0000000000001a44 <+116>:   lea    (%rax,%r14,1),%r15
   0x0000000000001a48 <+120>:   lea    (%rbx,%r15,1),%r12
   0x0000000000001a4c <+124>:   nopl   0x0(%rax)
   0x0000000000001a50 <+128>:   callq  0x10e0 
   0x0000000000001a55 <+133>:   add    $0x1,%r15
   0x0000000000001a59 <+137>:   mov    %eax,%ecx
   0x0000000000001a5b <+139>:   imul   %ebp
   0x0000000000001a5d <+141>:   mov    %ecx,%eax
   0x0000000000001a5f <+143>:   sar    $0x1f,%eax
   0x0000000000001a62 <+146>:   sar    $0x3,%edx
   0x0000000000001a65 <+149>:   sub    %eax,%edx
   0x0000000000001a67 <+151>:   imul   $0x4b,%edx,%edx
   0x0000000000001a6a <+154>:   sub    %edx,%ecx
   0x0000000000001a6c <+156>:   add    $0x2f,%ecx
   0x0000000000001a6f <+159>:   mov    %cl,-0x1(%r15)
   0x0000000000001a73 <+163>:   cmp    %r12,%r15
   0x0000000000001a76 <+166>:   jne    0x1a50 
   0x0000000000001a78 <+168>:   lea    0x0(%r13,%r13,4),%rax
   0x0000000000001a7d <+173>:   lea    0x263c(%rip),%rdi        # 0x40c0 
   0x0000000000001a84 <+180>:   add    $0x1,%r13
   0x0000000000001a88 <+184>:   add    $0x14,%r14
   0x0000000000001a8c <+188>:   lea    (%rdi,%rax,4),%rax
   0x0000000000001a90 <+192>:   movb   $0x0,(%rax,%rbx,1)
   0x0000000000001a94 <+196>:   cmp    $0x186a0,%r13
   0x0000000000001a9b <+203>:   jne    0x1a10 
   0x0000000000001aa1 <+209>:   mov    $0x1e8480,%edi
   0x0000000000001aa6 <+214>:   callq  0x10b0 
   0x0000000000001aab <+219>:   mov    %rax,%r12
   0x0000000000001aae <+222>:   mov    %rax,0x1eaa8b(%rip)        # 0x1ec540 
   0x0000000000001ab5 <+229>:   test   %rax,%rax
   0x0000000000001ab8 <+232>:   je     0x1cdd 
   0x0000000000001abe <+238>:   lea    0x25fb(%rip),%rsi        # 0x40c0 
   0x0000000000001ac5 <+245>:   mov    %rax,%rdi
   0x0000000000001ac8 <+248>:   mov    $0x1e8480,%edx
   0x0000000000001acd <+253>:   callq  0x10a0 
   0x0000000000001ad2 <+258>:   lea    0x10(%rsp),%rsi
   0x0000000000001ad7 <+263>:   mov    $0x7,%edi
   0x0000000000001adc <+268>:   callq  0x1050 
   0x0000000000001ae1 <+273>:   test   %eax,%eax
   0x0000000000001ae3 <+275>:   js     0x1cd1 
   0x0000000000001ae9 <+281>:   mov    0x18(%rsp),%rbp
   0x0000000000001aee <+286>:   mov    %r12,%rsi
   0x0000000000001af1 <+289>:   imul   $0x3b9aca00,0x10(%rsp),%rbx
   0x0000000000001afa <+298>:   lea    0x1e8480(%r12),%rdi
   0x0000000000001b02 <+306>:   nopw   0x0(%rax,%rax,1)
   0x0000000000001b08 <+312>:   movzbl (%rsi),%eax
   0x0000000000001b0b <+315>:   mov    %rsi,%rdx
   0x0000000000001b0e <+318>:   test   %al,%al
   0x0000000000001b10 <+320>:   je     0x1b35 
   0x0000000000001b12 <+322>:   nopw   0x0(%rax,%rax,1)
   0x0000000000001b18 <+328>:   lea    -0x41(%rax),%ecx
   0x0000000000001b1b <+331>:   cmp    $0x19,%cl
   0x0000000000001b1e <+334>:   ja     0x1c88 
   0x0000000000001b24 <+340>:   or     $0x20,%eax
   0x0000000000001b27 <+343>:   add    $0x1,%rdx
   0x0000000000001b2b <+347>:   mov    %al,-0x1(%rdx)
   0x0000000000001b2e <+350>:   movzbl (%rdx),%eax
   0x0000000000001b31 <+353>:   test   %al,%al
   0x0000000000001b33 <+355>:   jne    0x1b18 
   0x0000000000001b35 <+357>:   add    $0x14,%rsi
   0x0000000000001b39 <+361>:   cmp    %rdi,%rsi
   0x0000000000001b3c <+364>:   jne    0x1b08 
   0x0000000000001b3e <+366>:   lea    0x20(%rsp),%rsi
   0x0000000000001b43 <+371>:   mov    $0x7,%edi
   0x0000000000001b48 <+376>:   callq  0x1050 
   0x0000000000001b4d <+381>:   test   %eax,%eax
   0x0000000000001b4f <+383>:   js     0x1cd1 
   0x0000000000001b55 <+389>:   lea    0x4e6(%rip),%rdx        # 0x2042
   0x0000000000001b5c <+396>:   lea    0x4d0(%rip),%rdi        # 0x2033
   0x0000000000001b63 <+403>:   xor    %eax,%eax
   0x0000000000001b65 <+405>:   imul   $0x3b9aca00,0x20(%rsp),%rsi
   0x0000000000001b6e <+414>:   sub    %rbp,%rsi
   0x0000000000001b71 <+417>:   add    0x28(%rsp),%rsi
   0x0000000000001b76 <+422>:   sub    %rbx,%rsi
   0x0000000000001b79 <+425>:   callq  0x1070 
   0x0000000000001b7e <+430>:   mov    0x1ea9bb(%rip),%rdi        # 0x1ec540 
   0x0000000000001b85 <+437>:   callq  0x1030 
   0x0000000000001b8a <+442>:   mov    $0x1e8480,%edi
   0x0000000000001b8f <+447>:   callq  0x10b0 
   0x0000000000001b94 <+452>:   mov    %rax,%r12
   0x0000000000001b97 <+455>:   mov    %rax,0x1ea9a2(%rip)        # 0x1ec540 
   0x0000000000001b9e <+462>:   test   %rax,%rax
   0x0000000000001ba1 <+465>:   je     0x1cdd 
   0x0000000000001ba7 <+471>:   lea    0x2512(%rip),%rsi        # 0x40c0 
   0x0000000000001bae <+478>:   mov    %rax,%rdi
   0x0000000000001bb1 <+481>:   mov    $0x1e8480,%edx
   0x0000000000001bb6 <+486>:   callq  0x10a0 
   0x0000000000001bbb <+491>:   lea    0x30(%rsp),%rsi
   0x0000000000001bc0 <+496>:   mov    $0x7,%edi
   0x0000000000001bc5 <+501>:   callq  0x1050 
   0x0000000000001bca <+506>:   test   %eax,%eax
   0x0000000000001bcc <+508>:   js     0x1cd1 
   0x0000000000001bd2 <+514>:   mov    0x38(%rsp),%rbp
   0x0000000000001bd7 <+519>:   mov    %r12,%rsi
   0x0000000000001bda <+522>:   imul   $0x3b9aca00,0x30(%rsp),%rbx
   0x0000000000001be3 <+531>:   lea    0x1e8480(%r12),%rdi
   0x0000000000001beb <+539>:   nopl   0x0(%rax,%rax,1)
   0x0000000000001bf0 <+544>:   movzbl (%rsi),%eax
   0x0000000000001bf3 <+547>:   mov    %rsi,%rdx
   0x0000000000001bf6 <+550>:   test   %al,%al
   0x0000000000001bf8 <+552>:   je     0x1c1d 
   0x0000000000001bfa <+554>:   nopw   0x0(%rax,%rax,1)
   0x0000000000001c00 <+560>:   lea    -0x41(%rax),%ecx
   0x0000000000001c03 <+563>:   cmp    $0x19,%cl
   0x0000000000001c06 <+566>:   ja     0x1cb0 
   0x0000000000001c0c <+572>:   or     $0x20,%eax
   0x0000000000001c0f <+575>:   add    $0x1,%rdx
   0x0000000000001c13 <+579>:   mov    %al,-0x1(%rdx)
   0x0000000000001c16 <+582>:   movzbl (%rdx),%eax
   0x0000000000001c19 <+585>:   test   %al,%al
   0x0000000000001c1b <+587>:   jne    0x1c00 
   0x0000000000001c1d <+589>:   add    $0x14,%rsi
   0x0000000000001c21 <+593>:   cmp    %rsi,%rdi
   0x0000000000001c24 <+596>:   jne    0x1bf0 
   0x0000000000001c26 <+598>:   lea    0x40(%rsp),%rsi
   0x0000000000001c2b <+603>:   mov    $0x7,%edi
   0x0000000000001c30 <+608>:   callq  0x1050 
   0x0000000000001c35 <+613>:   test   %eax,%eax
   0x0000000000001c37 <+615>:   js     0x1cd1 
   0x0000000000001c3d <+621>:   lea    0x405(%rip),%rdx        # 0x2049
   0x0000000000001c44 <+628>:   lea    0x3e8(%rip),%rdi        # 0x2033
   0x0000000000001c4b <+635>:   xor    %eax,%eax
   0x0000000000001c4d <+637>:   imul   $0x3b9aca00,0x40(%rsp),%rsi
   0x0000000000001c56 <+646>:   sub    %rbp,%rsi
   0x0000000000001c59 <+649>:   add    0x48(%rsp),%rsi
   0x0000000000001c5e <+654>:   sub    %rbx,%rsi
   0x0000000000001c61 <+657>:   callq  0x1070 
   0x0000000000001c66 <+662>:   mov    0x1ea8d3(%rip),%rdi        # 0x1ec540 
   0x0000000000001c6d <+669>:   callq  0x1030 
   0x0000000000001c72 <+674>:   add    $0x58,%rsp
   0x0000000000001c76 <+678>:   pop    %rbx
   0x0000000000001c77 <+679>:   pop    %rbp
   0x0000000000001c78 <+680>:   pop    %r12
   0x0000000000001c7a <+682>:   pop    %r13
   0x0000000000001c7c <+684>:   pop    %r14
   0x0000000000001c7e <+686>:   pop    %r15
   0x0000000000001c80 <+688>:   retq   
   0x0000000000001c81 <+689>:   nopl   0x0(%rax)
   0x0000000000001c88 <+696>:   add    $0x1,%rdx
   0x0000000000001c8c <+700>:   movzbl (%rdx),%eax
   0x0000000000001c8f <+703>:   test   %al,%al
   0x0000000000001c91 <+705>:   jne    0x1b18 
   0x0000000000001c97 <+711>:   add    $0x14,%rsi
   0x0000000000001c9b <+715>:   cmp    %rdi,%rsi
   0x0000000000001c9e <+718>:   jne    0x1b08 
   0x0000000000001ca4 <+724>:   jmpq   0x1b3e 
   0x0000000000001ca9 <+729>:   nopl   0x0(%rax)
   0x0000000000001cb0 <+736>:   add    $0x1,%rdx
   0x0000000000001cb4 <+740>:   movzbl (%rdx),%eax
   0x0000000000001cb7 <+743>:   test   %al,%al
   0x0000000000001cb9 <+745>:   jne    0x1c00 
   0x0000000000001cbf <+751>:   add    $0x14,%rsi
   0x0000000000001cc3 <+755>:   cmp    %rsi,%rdi
   0x0000000000001cc6 <+758>:   jne    0x1bf0 
   0x0000000000001ccc <+764>:   jmpq   0x1c26 
   0x0000000000001cd1 <+769>:   lea    0x33a(%rip),%rdi        # 0x2012
   0x0000000000001cd8 <+776>:   callq  0x1850 
   0x0000000000001cdd <+781>:   lea    0x33c(%rip),%rdi        # 0x2020
   0x0000000000001ce4 <+788>:   callq  0x1850 
End of assembler dump.

Edit5 The inlined parts are also identical, as far i understand it.

Identical functions have different performance, why?

Answers (1)

Related Questions