Reputation: 189

When moving a function and its implementation to a different file (.hpp and .cpp) from the main file, performance greatly suffers

In my main file (the one with the main function) I have another function:

// Returns the next pseudo-random word from the generator state in mt[].
// Once every word has been handed out (mti >= N) the whole state array is
// regenerated first; if the generator was never seeded (mti == N+1) it is
// seeded with the default value 5489UL before regenerating.
unsigned long generate_random_number()
{
    // Lookup table: mag01[b] == b * MATRIX_A for b in {0, 1}.
    static unsigned long mag01[2] = {0x0UL, MATRIX_A};

    if (mti >= N) // state exhausted: rebuild all N words in one pass
    {
        if (mti == N+1)           // init_genrand() has not been called yet
            init_genrand(5489UL); // fall back to the default seed

        int pos = 0;

        // First stretch: the partner word lives M slots ahead.
        while (pos < N-M) {
            const unsigned long mixed = (mt[pos]&UPPER_MASK)|(mt[pos+1]&LOWER_MASK);
            mt[pos] = mt[pos+M] ^ (mixed >> 1) ^ mag01[mixed & 0x1UL];
            ++pos;
        }

        // Second stretch: the partner index is offset by (M-N) instead.
        while (pos < N-1) {
            const unsigned long mixed = (mt[pos]&UPPER_MASK)|(mt[pos+1]&LOWER_MASK);
            mt[pos] = mt[pos+(M-N)] ^ (mixed >> 1) ^ mag01[mixed & 0x1UL];
            ++pos;
        }

        // Last word pairs with the first word of the array.
        const unsigned long tail = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
        mt[N-1] = mt[M-1] ^ (tail >> 1) ^ mag01[tail & 0x1UL];

        mti = 0;
    }

    unsigned long word = mt[mti++];

    // Tempering
    word ^= (word >> 11);
    word ^= (word << 7) & 0x9d2c5680UL;
    word ^= (word << 15) & 0xefc60000UL;
    word ^= (word >> 18);
    return word;
}

Essentially it is from this website: http://www.the-control-freak.com/Random/Random.htm

If I move the function and the definitions to a header file and the implementation to a cpp file (still not in a class) and link the .o object file to my main program, the performance greatly suffers. The function goes from being ~11% of the overhead to ~15% (from google perf). Any ideas as to why that is?

In general, if you link an object file, does calling a function from that object file add a lot of overhead?

makefile:

# Compiler and flags shared by both rules below.
CXX = clang++
CFLAGS = -std=c++17 -O3 -Wall -Iinclude/ 
# Source and header directories.
SRC = src/
INC = include/

# Compile the generator on its own into random.o, so myprog.cpp only sees
# the declaration from random.hpp and calls it across an object-file boundary.
random.o: $(SRC)random.cpp $(INC)random.hpp
    $(CXX) $(CFLAGS) -c $(SRC)random.cpp

# Compile myprog.cpp and link it with random.o and libprofiler
# (presumably the gperftools CPU profiler - confirm).
# NOTE(review): the target is named `myprog` but the command writes `refactor`,
# so no file called `myprog` is ever produced by this rule.
myprog: myprog.cpp random.o 
    $(CXX) $(CFLAGS) -o refactor myprog.cpp random.o -lprofiler

Below is a boiled-down version of myprog.cpp. In the real program the switch statement is not inside main but in another function. That function is called N times, and the average and standard deviation of the results are sent via a socket.

myprog.cpp

// Boiled-down driver: draws one random number and reduces it modulo a
// divisor selected by hurry_ind (declared elsewhere). Values of hurry_ind
// outside 0..6 yield 0.
int main()
{
    // Each case returns directly, so no `break` is needed (the ones that
    // used to follow every `return` were unreachable).
    switch(hurry_ind)
    {
        case 0: return generate_random_number() % 19;
        case 1: return generate_random_number() % 100;
        case 2: return generate_random_number() % 9;
        case 3: return generate_random_number() % 914;
        case 4: return generate_random_number() % 355;
        case 5: return generate_random_number() % 348;
        case 6: return generate_random_number() % 65;
    }
    // Explicit fallback: same value main() would return implicitly, and it
    // keeps the body well-defined if it is moved into an ordinary function.
    return 0;
}

Upvotes: 2

Answers (2)

aparpara

Reputation: 2201

A compiler can inline a frequently-called function and make some aggressive optimizations if the function is defined in the same compilation unit. Most probably this is the cause of the reported slowdown.

I tested the code using clang and gcc. Clang always gave the same performance (12.5275 s per 2000000000 cycles), so I couldn't reproduce the described behavior, but gcc gave a significant performance boost (8.31 vs. 10.42 s per 2000000000 cycles) when I marked the function as inline. So try adding __attribute__((noinline)) to the function in the initial version (same compilation unit). If that decreases performance, then the root cause is inlining.

The test program I used:

random.hpp

#pragma once
// Returns the next tempered output word of the generator defined in
// random.cpp. The generator seeds itself with a default seed on first use
// if it has not been seeded.
unsigned long generate_random_number();

random.cpp

// N is the number of words in the state array mt[]; M is the offset of the
// partner word used when the state is regenerated.
// NOTE: both are unsigned, so expressions such as (M-N) wrap around instead
// of going negative, and comparisons against int need an explicit int(...).
#define N            17U
#define M            13U

// MATRIX_A is XORed into a regenerated word when the low bit of the combined
// word is set (see mag01[] in generate_random_number()).
// UPPER_MASK selects bit 31; LOWER_MASK selects bits 0..30.
#define MATRIX_A     0x9908B0DFUL
#define UPPER_MASK   0x80000000UL
#define LOWER_MASK   0x7FFFFFFFUL

// Generator state, private to this file.
static unsigned long mt [ N ];
// Index of the next word to hand out from mt[]. The initial value N + 1
// marks "init_genrand() has not been called yet".
static int           mti = N + 1;

// Fills the state array mt[] from ulSeed (only its low 32 bits are used) and
// leaves mti equal to N, so the next call to generate_random_number()
// regenerates the state before handing out a word.
void init_genrand ( unsigned long ulSeed )
{
  mt [ 0 ] = ulSeed & 0xFFFFFFFFUL;

  int filled = 1;
  while ( filled < int(N) )
  {
    /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier.   */
    /* In the previous versions, MSBs of the seed affect     */
    /* only MSBs of the array mt[].                          */
    /* 2002/01/09 modified by Makoto Matsumoto               */
    const unsigned long prev = mt [ filled - 1 ];

    /* The final mask keeps each word to 32 bits on machines */
    /* where unsigned long is wider than that.               */
    mt [ filled ] = ( 1812433253UL * ( prev ^ ( prev >> 30 ) ) + filled ) & 0xFFFFFFFFUL;

    ++filled;
  }

  /* Publish the index once the whole array has been written. */
  mti = filled;
}

// Returns the next pseudo-random word from the state in mt[]. When all N
// words have been consumed the state is regenerated in one pass; a generator
// that was never seeded is first seeded with 5489UL.
// Define INLINE_THE_FUNCTION to mark the definition `inline`.
#ifdef INLINE_THE_FUNCTION
inline
#endif
unsigned long generate_random_number()
{
  // mag01[b] == b * MATRIX_A for b in {0, 1}
  static unsigned long mag01[2] = {0x0UL, MATRIX_A};

  // Rebuilds mt[dst] from the top bit of mt[dst], the low 31 bits of mt[nxt]
  // and the partner word mt[src]. Index types are passed through unchanged.
  const auto twist = [](auto dst, auto nxt, auto src) {
    const unsigned long mixed = (mt[dst]&UPPER_MASK)|(mt[nxt]&LOWER_MASK);
    mt[dst] = mt[src] ^ (mixed >> 1) ^ mag01[mixed & 0x1UL];
  };

  if (mti >= int(N)) // state exhausted: regenerate all N words
  {
    if (mti == N+1)         // init_genrand() has not been called
      init_genrand(5489UL); // so use the default initial seed

    int pos = 0;
    for (; pos < int(N-M); ++pos)
      twist(pos, pos+1, pos+M);
    for (; pos < int(N-1); ++pos)
      twist(pos, pos+1, pos+(M-N));
    twist(N-1, 0, M-1);     // last word pairs with the first one

    mti = 0;
  }

  unsigned long word = mt[mti++];

  // Tempering
  word ^= (word >> 11);
  word ^= (word << 7) & 0x9d2c5680UL;
  word ^= (word << 15) & 0xefc60000UL;
  word ^= (word >> 18);
  return word;
}

myprog.cpp

#ifdef SAME_COMPILATION_UNIT
#include "random.cpp"
#else
#include "random.hpp"
#endif

#include <chrono>
#include <cstdlib>
#include <iostream>

// Draws one random number and reduces it modulo a divisor chosen by
// hurry_ind (0..6). Any other index returns 0 without calling the generator.
unsigned long calc(int hurry_ind)
{
  if (hurry_ind < 0 || hurry_ind > 6)
    return 0;

  const unsigned long draw = generate_random_number();
  switch (hurry_ind)
  {
    case 0:  return draw % 19;
    case 1:  return draw % 100;
    case 2:  return draw % 9;
    case 3:  return draw % 914;
    case 4:  return draw % 355;
    case 5:  return draw % 348;
    default: return draw % 65;  // hurry_ind == 6, the only value left
  }
}

// Benchmark driver. Calls calc() n times (n taken from argv[1], 0 when no
// argument is given), cycling the selector through 0..6, then prints the
// accumulated result followed by the elapsed wall time in seconds.
int main(int argc, char** argv)
{
  int n = argc > 1 ? std::atol(argv[1]) : 0;

  // Accumulate in calc()'s own return type. An int accumulator is narrowed
  // on every iteration and goes out of range for large iteration counts.
  // The sum is printed so the optimizer cannot discard the loop.
  unsigned long res = 0;

  // steady_clock is monotonic, which is what interval timing needs;
  // high_resolution_clock is not guaranteed to be.
  const auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < n; ++i)
    res += calc(i % 7);
  const auto end = std::chrono::steady_clock::now();

  const std::chrono::duration<double> diff = end - start;
  std::cout << res << "(" << diff.count() << " s)\n";
}

The compilers I used:

clang++ -v
clang version 8.0.0 (tags/RELEASE_800/final)
Target: x86_64-w64-windows-gnu
Thread model: posix

gcc -v
Using built-in specs.
COLLECT_GCC=C:\GNU\msys64\mingw64\bin\gcc.exe
COLLECT_LTO_WRAPPER=C:/GNU/msys64/mingw64/bin/../lib/gcc/x86_64-w64-mingw32/8.2.1/lto-wrapper.exe
Target: x86_64-w64-mingw32
Configured with: ../gcc-8-20181214/configure --prefix=/mingw64 --with-local-prefix=/mingw64/local --build=x86_64-w64-mingw32 --host=x86_64-w64-mingw32 --target=x86_64-w64-mingw32 --with-native-system-header-dir=/mingw64/x86_64-w64-mingw32/include --libexecdir=/mingw64/lib --enable-bootstrap --with-arch=x86-64 --with-tune=generic --enable-languages=ada,c,lto,c++,objc,obj-c++,fortran --enable-shared --enable-static --enable-libatomic --enable-threads=posix --enable-graphite --enable-fully-dynamic-string --enable-libstdcxx-filesystem-ts=yes --enable-libstdcxx-time=yes --disable-libstdcxx-pch --disable-libstdcxx-debug --disable-isl-version-check --enable-lto --enable-libgomp --disable-multilib --enable-checking=release --disable-rpath --disable-win32-registry --disable-nls --disable-werror --disable-symvers --with-libiconv --with-system-zlib --with-gmp=/mingw64 --with-mpfr=/mingw64 --with-mpc=/mingw64 --with-isl=/mingw64 --with-pkgversion='Rev1, Built by MSYS2 project' --with-bugurl=https://sourceforge.net/projects/msys2 --with-gnu-as --with-gnu-ld
Thread model: posix
gcc version 8.2.1 20181214 (Rev1, Built by MSYS2 project)

Upvotes: 1

mksteve

Reputation: 13073

When the compiler sees 2 functions within the same source file (translation unit) it can create an implementation which allows for the registers to be optimized.

This knowledge of both the called, and the calling functions is essential for this form of optimization.

Upvotes: 0

When moving a function and its implementation to a different file (.hpp and .cpp) from the main file, performance greatly suffers

Answers (2)

Related Questions

When moving a function and its implementation to a different file (.hpp and .cpp) from the main file, performance greatly suffers