Vaibhav
Vaibhav

Reputation: 37

completely remove function call at runtime in C

Is it possible to completely remove function call from C code at runtime and insert it back when needed.

I'm not sure if ELF can be modified at run time, so that no cpu cycle is wasted incase of no use of function.

I don't want to place a 'if' check before the function call to avoid calling a function.

For example if global flag g_flg=1 then func1 should look like below

void func1(int x)
{
 /* some processing */

 func2(y);

 /* some processing */

}

if global g_flag=0 then func1 should look like below

void func1(int x)
{
 /* some processing */

  /* some processing */

}

Upvotes: 1

Views: 917

Answers (3)

Nominal Animal
Nominal Animal

Reputation: 39298

On most desktop and server architectures branching is faster than indirect calls, since they do branch prediction and/or speculative execution. I have never heard of an architecture where indirect call is faster than a single branch. (Jump tables, for switch() statements, have more than one branch, and are therefore a different thing altogether.)

Consider the following microbenchmark I threw together. test.c:

/* test.c */

volatile long test_calls = 0L;
volatile long test_sum = 0L;

void test(long counter)
{
    test_calls++;
    test_sum += counter;
}

work.c:

/* work.c */

void test(long counter);

/* Work function, to be measured */
void test_work(long counter, int flag)
{
    if (flag)
        test(counter);
}

/* Dummy function, to measure call overhead */
void test_none(long counter __attribute__((unused)), int flag __attribute__((unused)) )
{
    return;
}

and harness.c:

#define  _POSIX_C_SOURCE 200809L
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* From test.c */
extern volatile long test_calls;
extern volatile long test_sum;

/* Dummy function, to measure call overhead */
void test_none(long counter, int flag);

/* Work function, to be measured */
void test_work(long counter, int flag);

/* Timing harness -- GCC x86; modify for other architectures */
struct timing {
    struct timespec  wall_start;
    struct timespec  wall_stop;
    uint64_t         cpu_start;
    uint64_t         cpu_stop;
};

static inline void start_timing(struct timing *const mark)
{
    clock_gettime(CLOCK_REALTIME, &(mark->wall_start));
    mark->cpu_start = __builtin_ia32_rdtsc();
}

static inline void stop_timing(struct timing *const mark)
{
    mark->cpu_stop = __builtin_ia32_rdtsc();
    clock_gettime(CLOCK_REALTIME, &(mark->wall_stop));
}

static inline double cpu_timing(const struct timing *const mark)
{
    return (double)(mark->cpu_stop - mark->cpu_start); /* Cycles */
}

static inline double wall_timing(const struct timing *const mark)
{
    return (double)(mark->wall_stop.tv_sec - mark->wall_start.tv_sec)
         + (double)(mark->wall_stop.tv_nsec - mark->wall_start.tv_nsec) / 1000000000.0;
}

static int cmpdouble(const void *aptr, const void *bptr)
{
    const double a = *(const double *)aptr;
    const double b = *(const double *)bptr;

    if (a < b)
        return -1;
    else
    if (a > b)
        return +1;
    else
        return  0;
}

void report(double *const wall, double *const cpu, const size_t count)
{
    printf("\tInitial call: %.0f cpu cycles, %.9f seconds real time\n", cpu[0], wall[0]);

    qsort(wall, count, sizeof (double), cmpdouble);
    qsort(cpu, count, sizeof (double), cmpdouble);

    printf("\tMinimum:      %.0f cpu cycles, %.9f seconds real time\n", cpu[0], wall[0]);
    printf("\t5%% less than  %.0f cpu cycles, %.9f seconds real time\n", cpu[count/20], wall[count/20]);
    printf("\t25%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count/4], wall[count/4]);
    printf("\tMedian:       %.0f cpu cycles, %.9f seconds real time\n", cpu[count/2], wall[count/2]);
    printf("\t75%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count-count/4-1], wall[count-count/4-1]);
    printf("\t95%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count-count/20-1], wall[count-count/20-1]);
    printf("\tMaximum:      %.0f cpu cycles, %.9f seconds real time\n", cpu[count-1], wall[count-1]);
}

int main(int argc, char *argv[])
{
    struct timing    measurement;
    double      *wall_seconds = NULL;
    double      *cpu_cycles = NULL;
    unsigned long    count = 0UL;
    unsigned long    i;
    int      flag;
    char         dummy;

    if (argc != 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s COUNT FLAG\n", argv[0]);
        fprintf(stderr, "\n");
        return 1;
    }

    if (sscanf(argv[1], " %lu %c", &count, &dummy) != 1) {
        fprintf(stderr, "%s: Invalid COUNT.\n", argv[1]);
        return 1;
    }
    if (count < 1UL) {
        fprintf(stderr, "%s: COUNT is too small.\n", argv[1]);
        return 1;
    }
    if (!(unsigned long)(count + 1UL)) {
        fprintf(stderr, "%s: COUNT is too large.\n", argv[1]);
        return 1;
    }

    if (sscanf(argv[2], " %d %c", &flag, &dummy) != 1) {
        fprintf(stderr, "%s: Invalid FLAG.\n", argv[2]);
        return 1;
    }

    wall_seconds = malloc(sizeof (double) * (size_t)count);
    cpu_cycles = malloc(sizeof (double) * (size_t)count);
    if (!wall_seconds || !cpu_cycles) {
        free(cpu_cycles);
        free(wall_seconds);
        fprintf(stderr, "Cannot allocate enough memory. Try smaller COUNT.\n");
        return 1;
    }

    printf("Call and measurement overhead:\n");
    fflush(stdout);
    for (i = 0UL; i < count; i++) {

        start_timing(&measurement);
        test_none(i, flag);
        stop_timing(&measurement);

        wall_seconds[i] = wall_timing(&measurement);
        cpu_cycles[i] = cpu_timing(&measurement);
    }
    report(wall_seconds, cpu_cycles, (size_t)count);

    printf("\n");
    printf("Measuring FLAG==0 calls: ");
    fflush(stdout);
    test_calls = 0L;
    test_sum = 0L;
    for (i = 0UL; i < count; i++) {

        start_timing(&measurement);
        test_work(i, 0);
        stop_timing(&measurement);

        wall_seconds[i] = wall_timing(&measurement);
        cpu_cycles[i] = cpu_timing(&measurement);
    }
    printf("%ld calls, sum %ld.\n", test_calls, test_sum);
    report(wall_seconds, cpu_cycles, (size_t)count);

    printf("\n");
    printf("Measuring FLAG==%d calls:", flag);
    fflush(stdout);
    test_calls = 0L;
    test_sum = 0L;
    for (i = 0UL; i < count; i++) {

        start_timing(&measurement);
        test_work(i, flag);
        stop_timing(&measurement);

        wall_seconds[i] = wall_timing(&measurement);
        cpu_cycles[i] = cpu_timing(&measurement);
    }
    printf("%ld calls, sum %ld.\n", test_calls, test_sum);
    report(wall_seconds, cpu_cycles, (size_t)count);


    printf("\n");
    printf("Measuring alternating FLAG calls: ");
    fflush(stdout);
    test_calls = 0L;
    test_sum = 0L;
    for (i = 0UL; i < count; i++) {

        start_timing(&measurement);
        test_work(i, i & 1);
        stop_timing(&measurement);

        wall_seconds[i] = wall_timing(&measurement);
        cpu_cycles[i] = cpu_timing(&measurement);
    }
    printf("%ld calls, sum %ld.\n", test_calls, test_sum);
    report(wall_seconds, cpu_cycles, (size_t)count);

    printf("\n");
    free(cpu_cycles);
    free(wall_seconds);
    return 0;
}

Put the three files in an empty directory, then compile and build ./call-test:

rm -f *.o
gcc -W -Wall -O3 -fomit-frame-pointer -c harness.c
gcc -W -Wall -O3 -fomit-frame-pointer -c work.c
gcc -W -Wall -O3 -fomit-frame-pointer -c test.c
gcc harness.o work.o test.o -lrt -o call-test

On AMD Athlon II X4 640, using gcc-4.6.3 (Xubuntu 10.04), running

./call-test 1000000 1

tells me that the overhead is just 2 clock cycles (< 1ns) for the test alone (branch not taken), and just 4 clock cycles (just over a nanosecond) when calling the second function which increases test_calls and adds the counter to test_sum.

When omitting all optimizations (use -O0 and leave out -fomit-frame-pointer when compiling), the test alone costs about 3 clock cycles (3 cycles if branch not taken), and about 9 cycles if the branch is taken and the work is done to update the two extra variables.

(The two extra variables let you easily see that the harness does actually do all it should do; they're just an extra check. And I wanted to have some work in the second function, so the timing differences would be easier to spot.)

The above interpretation is only valid for the case when the code is already cached; i.e. run recently. If the code is run only rarely, it won't be in cache. However, then the test overhead matters even less. Caching effects -- for example, if "nearby" code has been run (you can see this for the call overhead measurement, the other test functions code' tends to get cached too!) -- are much larger anyway. (While the test harness does produce the initial call results separately, don't put too much faith in it, since it does not try to clear any caches in any way.)

My conclusion is that adding

if (flag)
    debug_function_call();

to any normal code is perfectly fine: the overhead is literally neglible; practically irrelevant. As always, consider the overall algorithm instead. Any enhancements in the algorithm yield much bigger rewards than worrying about the code the compiler generates.

(Since I wrote the test code above at one sitting, there are likely some bugs and/or brainfarts in them. Check, and if you find any, let me know below so I can fix the code.)

Upvotes: 0

Evren Kuzucuoglu
Evren Kuzucuoglu

Reputation: 3886

Don't optimize something that doesn't need it. Have you tried assessing the potential improvement on your performance?

Try setting g_flg to 1 and execute this:

if (g_flg == 1) {func2(y);}

Then try executing this:

func2(y);

Both 1 million times (or whatever number of times you can run it in a reasonable time). I'm quite sure you'll notice there is virtually no difference between both things.

Plus, apart from that, I think what you want to do is impossible, because ELF is a binary (compiled) format.

Upvotes: 2

Wug
Wug

Reputation: 13196

What you could probably get away with doing instead would be something like this:

struct Something;
typedef struct Something Something;

int myFunction(Something * me, int i)
{
    // do a bunch of stuff
    return 42; // obviously the answer
}

int myFunctionDoNothing(Something * dummy1, int dummy2)
{
    return 0;
}

int (*function)(Something *, int) = myFunctionDoNothing;

// snip to actual use of function

int i;

function = myFunctionDoNothing;
for (i = 0; i < 100000; ++i) function(NULL, 5 * i); // does nothing

function = myFunction;
for (i = 0; i < 100000; ++i) function(NULL, 5 * i); // does something

WARNING

This might be a premature optimization. Depending on how your compiler treats this and how your cpu handles branching, you might actually lose performance this way as opposed to the naive way (stopping it in the function with a flag)

Upvotes: 1

Related Questions