Reputation: 469

Optimization of loops and if

I have a procedure looks like this:

void Process1(unsigned char* data)
{

}
void Process2(unsigned char* data)
{

}
void Process3(unsigned char* data)
{

}

#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = !!(flags & FLAG1);
    bool b2 = !!(flags & FLAG2);
    bool b3 = !!(flags & FLAG3);
    for (unsigned int i = 0; i < bytes; i ++)
    {
        if (b1) Process1(data + i);
        if (b2) Process2(data + i);
        if (b3) Process3(data + i);
    }
}

As it looks, flags & FLAG1 A.K.A b1 will not change in all the loops. But we still have to do branch in every loop. I just wondering if there's a way to avoid this unnecessary branch dynamically.

here is a demo of Lundin's solution.

#include <windows.h>
#include <stdio.h>
#include <time.h>
LARGE_INTEGER ls, le, ll;
#define START_CLOCK() QueryPerformanceCounter(&ls)
#define END_CLOCK() printf ("%.0lf ns\n", (QueryPerformanceCounter(&le), ((double)le.QuadPart - ls.QuadPart) / ll.QuadPart * 1000000));


void Process1(unsigned char* data)
{
    (*data)++;
}
void Process2(unsigned char* data)
{
    (*data)--;
}
void Process3(unsigned char* data)
{
    (*data) *= (*data);
}

#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = !!(flags & FLAG1);
    bool b2 = !!(flags & FLAG2);
    bool b3 = !!(flags & FLAG3);
    for (unsigned int i = 0; i < bytes; i ++)
    {
        if (b1) Process1(data + i);
        if (b2) Process2(data + i);
        if (b3) Process3(data + i);
    }
}


typedef void (*proc_t)(unsigned char*);

inline static void do_nothing (unsigned char* ptr)
{
    (void)ptr;
}

void ProcessData_x(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = (flags & FLAG1) != 0;  // de-obfuscate the boolean logic
    bool b2 = (flags & FLAG2) != 0;
    bool b3 = (flags & FLAG3) != 0;

    proc_t p1 = b1 ? Process1 : do_nothing;
    proc_t p2 = b2 ? Process2 : do_nothing;
    proc_t p3 = b3 ? Process3 : do_nothing;

    for (unsigned int i = 0; i<bytes; i++)
    {
        p1(data + i);
        p2(data + i);
        p3(data + i);
    }
}

int main()
{
    if (!QueryPerformanceFrequency(&ll)) return 1;

    const unsigned int bytes = 0xffff;
    srand((unsigned int)time(NULL));
    unsigned int flags = rand() & 0x7;
    unsigned char* data = new unsigned char[bytes];
    for (unsigned int i = 0; i < bytes; i++)
    {
        data[i] = (unsigned char)(rand() & 0xff);
    }

    START_CLOCK();

    ProcessData(data, bytes, flags);

    END_CLOCK();

    START_CLOCK();

    ProcessData_x(data, bytes, flags);

    END_CLOCK();
}

here is the output:

134 ns
272 ns

I've run it several times but, unexpectedly, it costs even more time:(.. it is also compiled 'vs2010 Release x86'

Upvotes: 2

Answers (4)

mattnewport

Reputation: 14057

Here's another solution using templates - this way you'll get an optimized version of the inner loop for each variant. If the ProcessN functions are short / simple enough to be worth inlining then this could be a worthwhile optimization.

#include <tuple>
#include <map>
#include <utility>

using namespace std;

inline void Process1(unsigned char* data) {}
inline void Process2(unsigned char* data) {}
inline void Process3(unsigned char* data) {}

#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)

template <bool b1, bool b2, bool b3>
void ProcessData(unsigned char* data, unsigned int bytes) {
    for (unsigned int i = 0; i < bytes; i++) {
        if (b1) Process1(data + i);
        if (b2) Process2(data + i);
        if (b3) Process3(data + i);
    }
}

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags) {
    typedef void (*ProcessFunc)(unsigned char*, unsigned int bytes);
    static map<tuple<bool, bool, bool>, ProcessFunc> funcs{
        {make_tuple(false, false, false), ProcessData<false, false, false>},
        {make_tuple(false, false, true), ProcessData<false, false, true>},
        {make_tuple(false, true, false), ProcessData<false, true, false>},
        {make_tuple(false, true, true), ProcessData<false, true, true>},
        {make_tuple(true, false, false), ProcessData<true, false, false>},
        {make_tuple(true, false, true), ProcessData<true, false, true>},
        {make_tuple(true, true, false), ProcessData<true, true, false>},
        {make_tuple(true, true, true), ProcessData<true, true, true>}};

    bool b1 = !!(flags & FLAG1);
    bool b2 = !!(flags & FLAG2);
    bool b3 = !!(flags & FLAG3);
    funcs[make_tuple(b1, b2, b3)](data, bytes);
}

Upvotes: 0

eerorika

Reputation: 238351

A c++ solution. Similar to Lundin's answer but without calls to empty function. I'm not sure if that makes any difference in performance, the main advantage is that you don't need to manually list all the process calls in the loop. If you want to micro optimize or want c, you could use an array on stack, but you'll have to manage some counters yourself.

typedef void (*proc_t)(unsigned char*);
std::vector<proc_t> processes;
if (b1) processes.push_back(Process1);
if (b2) processes.push_back(Process2);
if (b3) processes.push_back(Process3);

for(auto p : processes)
    for (unsigned int i = 0; i<bytes; i++)
        p(data + i);

Upvotes: 3

huseyin tugrul buyukisik

Reputation: 11920

    bool b1 = !!(flags & FLAG1);
    bool b2 = !!(flags & FLAG2);
    bool b3 = !!(flags & FLAG3);


    int caseNow=SelectCaseAtOnce(b1,b2,b3);

    if(caseNow==0)
        for (unsigned int i = 0; i < bytes; i ++)
        {
            Process1(data + i);

        }
     else if(caseNow==1)
        for (unsigned int i = 0; i < bytes; i ++)
        {

           Process2(data + i);

        }
     else if(caseNow==2)
        for (unsigned int i = 0; i < bytes; i ++)
        {

             Process3(data + i);
        }
    else if(caseNow==3)
        for (unsigned int i = 0; i < bytes; i ++)
        {
            Process1(data + i);
            Process2(data + i);

        }
    if(caseNow==4)
        for (unsigned int i = 0; i < bytes; i ++)
        {
             Process1(data + i);

             Process3(data + i);
        }
    else if(caseNow==5)
        for (unsigned int i = 0; i < bytes; i ++)
        {

            Process2(data + i);
            Process3(data + i);
        }
    else if(caseNow==6)
        for (unsigned int i = 0; i < bytes; i ++)
        {
            Process1(data + i);
            Process2(data + i);
            Process3(data + i);
        }
    else {}

Upvotes: 0

Lundin

Reputation: 213842

First of all, it doesn't any sense to speak about optimizations without a particular system in mind...

That being said, I'd optimize away the branches in the following way:

typedef void (*proc_t)(unsigned char*);

inline static void do_nothing (unsigned char* ptr)
{
    (void)ptr;
}

...

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = (flags & FLAG1) != 0;  // de-obfuscate the boolean logic
    bool b2 = (flags & FLAG2) != 0;
    bool b3 = (flags & FLAG3) != 0;

    proc_t p1 = b1 ? Process1 : do_nothing;
    proc_t p2 = b2 ? Process2 : do_nothing;
    proc_t p3 = b3 ? Process3 : do_nothing;

    for (unsigned int i = 0; i<bytes; i++)
    {
        p1(data + i);
        p2(data + i);
        p3(data + i);
    }
}

Upvotes: 4

Optimization of loops and if

Answers (4)

Related Questions