Why is `mov %eax, %eax; nop` faster than `nop`?

Question

Apparently, modern processors can tell if you do something stupid like moving a register to itself (mov %eax, %eax) and optimize that out. Trying to verify that claim, I ran the following program:

#include 
#include 

static inline void f1() {
   for (int i = 0; i < 100000000; i++)
      __asm__(
            "mov %eax, %eax;"
            "nop;"
            );
}

static inline void f2() {
   for (int i = 0; i < 100000000; i++)
      __asm__(
            "nop;"
            );
}

static inline void f3() {
   for (int i = 0; i < 100000000; i++)
      __asm__(
            "mov %ebx, %eax;"
            "nop;"
            );
}

int main() {
   int NRUNS = 10;
   clock_t t, t1, t2, t3;

   t1 = t2 = t3 = 0;
   for (int run = 0; run < NRUNS; run++) {
      t = clock(); f1(); t1 += clock()-t;
      t = clock(); f2(); t2 += clock()-t;
      t = clock(); f3(); t3 += clock()-t;
   }

   printf("f1() took %f cycles on avg
", (float) t1/ (float) NRUNS);
   printf("f2() took %f cycles on avg
", (float) t2/ (float) NRUNS);
   printf("f3() took %f cycles on avg
", (float) t3/ (float) NRUNS);

   return 0;
}

This gives me:

f1() took 175587.093750 cycles on avg
f2() took 188313.906250 cycles on avg
f3() took 194654.296875 cycles on avg

As one expect, f3() comes out slowest. But surprisingly (to me at least), f1() is faster than f2(). Why is that?

Update: Compiling with -falign-loops gives qualitatively the same result:

f1() took 164271.000000 cycles on avg
f2() took 173783.296875 cycles on avg
f3() took 177765.203125 cycles on avg

Why is `mov %eax, %eax; nop` faster than `nop`?

Answers (1)

Related Questions