Reputation: 4537
When sending a patch to a widely known open source project (known for its performance and simplicity), I received a review that was a bit surprising to me: 'using "bool" type from C99 is a bad idea'. They reasoned it very well, and I was shown a simple example program that showed that (unoptimized code) clearly had more instructions when using bool than when using an integer type.
So they basically use something like typedef unsigned int bool_t;
, and make sure they only assign 1 to that type.
I wanted to get a convincing and definitive answer to this, and also know what kind of performance difference are we talking about (i.e., is it worth it?), and see if compiler could do better with optimizations enabled.
There's a C++ question that is very related to this one, but (apart from being C++) that one restricts itself to the selection statement, while in this one I'm concerned about both aspects of bool
: assignment and selection. That related question is Which is faster : if (bool) or if(int)?
So, what is faster, bool
or an integer type? And how important is the performance difference?
Upvotes: 1
Views: 2020
Reputation: 4537
EDITED 2021-12-16 19:07: Show comparison against both uint
and uchar
, and also show both GCC and Clang. Add -march=native
to compiler flags. Now the results seem to show that bool
is as good as other integer types, but some compilers produce sub-optimal code.
EDITED 2022-01-11 18:56: After some tests, slightly changing the code can show important performance problems, more likely to be present with _Bool
than uint
.
For my tests, I chose unsigned types, since that's what the project was using instead of bool
, but I expect signed types to behave similarly.
I'll show here the tests with unsigned char
, since bool
is 1 byte in my system and that reduces the difference in assembly output, and also unsigned int
to compare different widths.
I tested storing an integer into one of these types (bool
, unsigned char
, and unsigned int
), using one of these types to control a selection statement, and using one of this types as a parameter of a function.
Source code:
// repeat.h:
#pragma once
#define repeat2(e) (e);(e)
#define repeat4(e) repeat2(e);repeat2(e)
#define repeat8(e) repeat4(e);repeat4(e)
#define repeat16(e) repeat8(e);repeat8(e)
#define repeat32(e) repeat16(e);repeat16(e)
#define repeat64(e) repeat32(e);repeat32(e)
#define repeat128(e) repeat64(e);repeat64(e)
#define repeat256(e) repeat128(e);repeat128(e)
#define repeat512(e) repeat256(e);repeat256(e)
#define repeat1024(e) repeat512(e);repeat512(e)
#define repeat(e) do \
{ \
repeat16(e); \
} while (0)
// store_bool.h:
#pragma once
_Bool store_bool(long n, int x);
// store_bool.c:
#include "store_bool.h"
#include "repeat.h"
_Bool store_bool(long n, volatile int x)
{
volatile _Bool b;
for (long i = 0; i < n; i++)
repeat(b = x);
return b;
}
// store_uchar.h:
#pragma once
unsigned char store_uchar(long n, int x);
// store_uchar.c:
#include "store_uchar.h"
#include "repeat.h"
unsigned char store_uchar(long n, volatile int x)
{
volatile unsigned char c;
for (long i = 0; i < n; i++)
repeat(c = x);
return c;
}
// store_uint.h:
#pragma once
unsigned int store_uint(long n, int x);
// store_uint.c:
#include "store_uint.h"
#include "repeat.h"
unsigned int store_uint(long n, volatile int x)
{
volatile unsigned int u;
for (long i = 0; i < n; i++)
repeat(u = x);
return u;
}
// consume_bool.h:
#pragma once
int consume_bool(long n, _Bool b);
// consume_bool.c:
#include "consume_bool.h"
#include "repeat.h"
int consume_bool(long n, volatile _Bool b)
{
volatile int x = 5;
for (long i = 0; i < n; i++)
repeat({if (b) x = 3;});
return x;
}
// consume_uchar.h:
#pragma once
int consume_uchar(long n, unsigned char u);
// consume_uchar.c:
#include "consume_uchar.h"
#include "repeat.h"
int consume_uchar(long n, volatile unsigned char c)
{
volatile int x = 5;
for (long i = 0; i < n; i++)
repeat({if (c) x = 3;});
return x;
}
// consume_uint.h:
#pragma once
int consume_uint(long n, unsigned int u);
// consume_uint.c:
#include "consume_uint.h"
#include "repeat.h"
int consume_uint(long n, volatile unsigned int u)
{
volatile int x = 5;
for (long i = 0; i < n; i++)
repeat({if (u) x = 3;});
return x;
}
// param_bool_.h:
#pragma once
int param_bool_(_Bool x);
// param_bool_.c:
#include "param_bool_.h"
int param_bool_(_Bool b)
{
return b ? 3 : 5;
}
// param_bool.h:
#pragma once
void param_bool(long n, _Bool b);
// param_bool.c:
#include "param_bool.h"
#include "param_bool_.h"
#include "repeat.h"
void param_bool(long n, volatile _Bool b)
{
for (long i = 0; i < n; i++)
repeat(param_bool_(b));
}
// param_uchar_.h:
#pragma once
int param_uchar_(unsigned char c);
// param_uchar_.c:
#include "param_uchar_.h"
int param_uchar_(unsigned char c)
{
return c ? 3 : 5;
}
// param_uchar.h:
#pragma once
void param_uchar(long n, unsigned char c);
// param_uchar.c:
#include "param_uchar.h"
#include "param_uchar_.h"
#include "repeat.h"
void param_uchar(long n, volatile unsigned char c)
{
for (long i = 0; i < n; i++)
repeat(param_bool_(c));
}
// param_uint_.h:
#pragma once
int param_uint_(unsigned int u);
// param_uint_.c:
#include "param_uint_.h"
int param_uint_(unsigned int u)
{
return u ? 3 : 5;
}
// param_uint.h:
#pragma once
void param_uint(long n, unsigned int u);
// param_uint.c:
#include "param_uint.h"
#include "param_uint_.h"
#include "repeat.h"
void param_uint(long n, volatile unsigned int u)
{
for (long i = 0; i < n; i++)
repeat(param_bool_(u));
}
// main.c:
#include <stdio.h>
#include <time.h>
#include "store_bool.h"
#include "store_uchar.h"
#include "store_uint.h"
#include "consume_bool.h"
#include "consume_uchar.h"
#include "consume_uint.h"
#include "param_bool.h"
#include "param_uchar.h"
#include "param_uint.h"
#define measure(e) \
({ \
clock_t t0, t1; \
double t; \
\
t0 = clock(); \
e; \
t1 = clock(); \
\
t = (double) (t1 - t0) / CLOCKS_PER_SEC; \
t; \
})
int main(int argc, char *argv[])
{
double sb, sc, su;
double cb, cc, cu;
double pb, pc, pu;
long n;
if (argc != 2)
exit(2);
n = atol(argv[1]);
sb = measure(store_bool(n, 1));
sc = measure(store_uchar(n, 1));
su = measure(store_uint(n, 1));
cb = measure(consume_bool(n, 1));
cc = measure(consume_uchar(n, 1));
cu = measure(consume_uint(n, 1));
pb = measure(param_bool(n, 1));
pc = measure(param_uchar(n, 1));
pu = measure(param_uint(n, 1));
printf("n: %li\n", n);
putchar('\n');
printf("store bool: %lf\n", sb);
printf("store uchar: %lf\n", sc);
printf("store uint: %lf\n", su);
putchar('\n');
printf("consume bool: %lf\n", cb);
printf("consume uchar: %lf\n", cc);
printf("consume uint: %lf\n", cu);
putchar('\n');
printf("param bool: %lf\n", pb);
printf("param uchar: %lf\n", pc);
printf("param uint: %lf\n", pu);
}
I used volatile
for some variables, to avoid the compiler optimizing out the multiple assignments and tests.
Since the compiler will not unroll the loops, as they are huge, I used many (16) repeated expressions in each loop (see the repeat() macro), to reduce the impact of the loop overhead (jump instructions) in the total benchmark time.
Compiling:
$ cc -Wall -Wextra -O3 -march=native -S *.c
$ cc -O3 -march=native *.s
$
Assembly:
I'll cherry-pick a single one of the 16 repetitions, to simplify. If you want to see full assembly files, you can compile them yourself (I gave here enough instructions).
// store_bool.s (GCC):
movl -20(%rsp), %edx
testl %edx, %edx
setne %dl
movb %dl, -1(%rsp)
// store_bool.s (Clang):
cmpl $0, -4(%rsp)
setne -5(%rsp)
// sotre_uchar.s (GCC):
movl -20(%rsp), %edx
movb %dl, -1(%rsp)
// store_uchar.s (Clang):
movl -4(%rsp), %ecx
movb %cl, -5(%rsp)
// store_uint.s (GCC):
movl -20(%rsp), %edx
movl %edx, -4(%rsp)
// store_uint.s (Clang):
movl -4(%rsp), %ecx
movl %ecx, -8(%rsp)
From the above, uchar
and uint
are likely to be the same. bool
has two instructions too on Clang, but they are different; that may or may not make a difference. On GCC, it clearly has 2 extra instructions compared to uchar
which makes it slower.
// consume_bool.s (GCC):
movzbl -20(%rsp), %edx
testb %dl, %dl
je .L2
movl $3, -4(%rsp)
.L2:
// consume_bool.s (Clang):
.LBB0_5: # in Loop: Header=BB0_1 Depth=1
testb $1, -5(%rsp)
jne .LBB0_6
[...]
.LBB0_6: # in Loop: Header=BB0_1 Depth=1
movl $3, -4(%rsp)
testb $1, -5(%rsp)
je .LBB0_9
(LBB0_9 is similar to LBB0_5)
// consume_uchar.s (GCC):
movzbl -20(%rsp), %edx
testb %dl, %dl
je .L2
movl $3, -4(%rsp)
.L2:
// consume_uchar.s (Clang):
cmpb $0, -5(%rsp)
je .LBB0_3
# %bb.2: # in Loop: Header=BB0_1 Depth=1
movl $3, -4(%rsp)
.LBB0_3: # in Loop: Header=BB0_1 Depth=1
// consume_uint.s (GCC):
movl -20(%rsp), %edx
testl %edx, %edx
je .L2
movl $3, -4(%rsp)
.L2:
// consume_uint.s (Clang):
cmpl $0, -4(%rsp)
je .LBB0_3
# %bb.2: # in Loop: Header=BB0_1 Depth=1
movl $3, -8(%rsp)
.LBB0_3: # in Loop: Header=BB0_1 Depth=1
In these cases, the assembly produced by GCC is almost identical for the 3 types, so I don't expect any difference. In Clang, bool
has different code, but since it's very different, it's hard to predict if it will be faster or slower than the integers.
// param_bool_.s (GCC):
param_bool_:
.LFB0:
.cfi_startproc
cmpb $1, %dil
sbbl %eax, %eax
andl $2, %eax
addl $3, %eax
ret
.cfi_endproc
.LFE0:
// param_bool_.s (Clang):
param_bool_: # @param_bool_
.cfi_startproc
# %bb.0:
xorb $1, %dil
movzbl %dil, %eax
addl %eax, %eax
addl $3, %eax
retq
.Lfunc_end0:
// param_bool.s (GCC):
movzbl 12(%rsp), %edi
call param_bool_@PLT
// param_bool.s (Clang):
movzbl 15(%rsp), %edi
andl $1, %edi
callq param_bool_
// param_uchar_.s (GCC):
param_uchar_:
.LFB0:
.cfi_startproc
cmpb $1, %dil
sbbl %eax, %eax
andl $2, %eax
addl $3, %eax
ret
.cfi_endproc
.LFE0:
// param_uchar_.s (Clang):
param_uchar_: # @param_uchar_
.cfi_startproc
# %bb.0:
xorl %eax, %eax
testl %edi, %edi
sete %al
addl %eax, %eax
addl $3, %eax
retq
.Lfunc_end0:
// param_uchar.s (GCC):
movzbl 12(%rsp), %edi
call param_uchar_@PLT
// param_uchar.s (Clang):
movzbl 15(%rsp), %edi
callq param_uchar_
// param_uint_.s (GCC):
param_uint_:
.LFB0:
.cfi_startproc
cmpl $1, %edi
sbbl %eax, %eax
andl $2, %eax
addl $3, %eax
ret
.cfi_endproc
.LFE0:
// param_uint_.s (Clang):
param_uint_: # @param_uint_
.cfi_startproc
# %bb.0:
xorl %eax, %eax
testl %edi, %edi
sete %al
addl %eax, %eax
addl $3, %eax
retq
.Lfunc_end0:
// param_uint.s (GCC):
movl 12(%rsp), %edi
call param_uint_@PLT
// param_uint.s (Clang):
movl 12(%rsp), %edi
callq param_uint_
In this case, bool
should be the same as uchar
since the only important thing should be the width, and we might see (or not) a difference with uint
. A part from zero extending, there's not much difference. There are slight differences between GCC and Clang, however, Clang producing larger code, so I expect Clang to run slightly slower than GCC.
Timing:
// amd64, gcc-11, i5-5675C:
$ ./a.out 1073741824
store bool: 4.928789
store uchar: 4.795028
store uint: 4.803893
consume bool: 4.795776
consume uchar: 4.794873
consume uint: 4.794079
param bool: 17.713958
param uchar: 17.611229
param uint: 17.688909
// amd64, clang-13, i5-5675C:
$ ./a.out 1073741824
store bool: 4.806418
store uchar: 4.802943
store uint: 4.800172
consume bool: 4.805537
consume uchar: 4.799858
consume uint: 4.799462
param bool: 19.095543
param uchar: 17.708014
param uint: 17.782490
In 'store', as we expected, bool
is slower than the other types with GCC (around 1~10%). With Clang, there's no significant difference (I've seen bool
consistently be a bit slower than the others, but less than 0.5%).
In 'consume', we see no difference between types or compilers.
In 'param', times vary a lot between runs, and there's no consistency: sometimes bool
is slower, and sometimes it's faster. However, GCC is consistently faster than Clang.
Slight changes in the code can lead to compilers missing important optimizations. Using the following code in consume_<type>.c
, leads to some important performance loss:
repeat(x = b ? 3 : x);
Note that just by changing an if to a ternary operator, makes the compiler slow down to the following times:
GCC:
$ ./a.out 1073741824
n: 1073741824
...
consume bool: 8.684662
consume uchar: 8.683915
consume uint: 8.086806
...
Clang:
$ ./a.out 1073741824
n: 1073741824
...
consume bool: 8.161896
consume uchar: 5.422896
consume uint: 5.127165
...
Clang slows down considerably for _Bool
, while maintaining a reasonable speed for other types. GCC seems to generate quite bad code for all the types.
Conclusion:
Programmers should consider a few things:
Performance: Even though _Bool
may be theoretically as fast as unsigned int
, compilers are far from being ideal, and it is likely that your compiler will miss some optimizations, which in some cases may be quite important.
Maintainability/readability/correctness: Some may argue that _Bool
is safer due to autonormalization; others may argue that it is less safe due to autonormalization; just know what you're using, and form your own opinion.
Supporting pre-C99 code: If that's the case, you have no choice but using unsigned int
.
Upvotes: 4