Reputation: 11
It is well known that fixed point arithmetics is equavalent to integer arithmetics with some constraints. In this way I caught the issue. Let's see:
#include <stdio.h>
#include <stdint.h>
#define USE_C_FUNC 1
#if USE_C_FUNC != 0
uint64_t fixmd(uint64_t a, uint64_t b)
{
uint64_t c = a * b / 10000000000LL;
return c;
}
#else
uint64_t fixmd(uint64_t a, uint64_t b)
{
uint64_t c;
asm (
"mov %1, %%rax\n"
"mul %2\n"
"mov $10000000000, %%r8\n"
"div %%r8\n"
"mov %%rax, %0\n"
: "=r" (c)
: "r" (a), "r" (b)
: "rax", "rdx", "r8"
);
return c;
}
#endif
void main(void)
{
uint64_t x = 12589254118LL;
uint64_t y = fixmd(x, x);
printf("x=0x%llX=%llu, y=0x%llX=%llu\n", x, x, y, y);
}
In case #define USE_C_FUNC 1
program prints WRONG result:
x=0x2EE60C5E6=12589254118, y=0x410F8719=1091536665
In case #define USE_C_FUNC 0
program (with inline assembly) prints RIGHT result:
x=0x2EE60C5E6=12589254118, y=0x3B0AB8254=15848931924
I had compiling this example without any optimization (with gcc option -O0).
$ gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/10/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa:hsa
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 10.2.1-6' --with-bugurl=file:///usr/share/doc/gcc-10/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-10 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-10-Km9U7s/gcc-10-10.2.1/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-10-Km9U7s/gcc-10-10.2.1/debian/tmp-gcn/usr,hsa --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-mutex
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 10.2.1 20210110 (Debian 10.2.1-6)
objdump -D uint64
for WRONG result for function fixmd() is:
0000000000001135 <fixmd>:
1135: 55 push %rbp
1136: 48 89 e5 mov %rsp,%rbp
1139: 48 89 7d e8 mov %rdi,-0x18(%rbp)
113d: 48 89 75 e0 mov %rsi,-0x20(%rbp)
1141: 48 8b 45 e8 mov -0x18(%rbp),%rax
1145: 48 0f af 45 e0 imul -0x20(%rbp),%rax
114a: 48 ba bf d5 ed bd ce movabs $0xdbe6fecebdedd5bf,%rdx
1151: fe e6 db
1154: 48 f7 e2 mul %rdx
1157: 48 89 d0 mov %rdx,%rax
115a: 48 c1 e8 21 shr $0x21,%rax
115e: 48 89 45 f8 mov %rax,-0x8(%rbp)
1162: 48 8b 45 f8 mov -0x8(%rbp),%rax
1166: 5d pop %rbp
1167: c3 retq
objdump -D uint64
for GOOD result for function fixmd() is:
0000000000001135 <fixmd>:
1135: 55 push %rbp
1136: 48 89 e5 mov %rsp,%rbp
1139: 48 89 7d e8 mov %rdi,-0x18(%rbp)
113d: 48 89 75 e0 mov %rsi,-0x20(%rbp)
1141: 48 8b 4d e8 mov -0x18(%rbp),%rcx
1145: 48 8b 75 e0 mov -0x20(%rbp),%rsi
1149: 48 89 c8 mov %rcx,%rax
114c: 48 f7 e6 mul %rsi
114f: 49 b8 00 e4 0b 54 02 movabs $0x2540be400,%r8
1156: 00 00 00
1159: 49 f7 f0 div %r8
115c: 48 89 c1 mov %rax,%rcx
115f: 48 89 4d f8 mov %rcx,-0x8(%rbp)
1163: 48 8b 45 f8 mov -0x8(%rbp),%rax
1167: 5d pop %rbp
1168: c3 retq
In the code for fixmd() in C the problem is quite clean: gcc uses signed multiplication (imul) for unsigned values and substitute division with multiplication on the coefficient (with overflow) and binary shifting. Seems, it is the bug, is not it?
Upvotes: 1
Views: 698
Reputation: 225007
This is not a bug. Your expectations are incorrect.
Unsigned integer arithmetic is performed modulo the largest value the type can hold +1. Loosely speaking, this means that for a 64 bit type that anything that overflows 64 bits is truncated.
In your particular case, you're multiplying 12589254118 * 12589254118. The arithmetic result of this is 158,489,319,247,579,957,924. This is larger than what will fit in a 64 bit type, so the result is modulo 264 which gives us 10,915,366,657,903,544,996, and dividing by 10000000000 gives us 1,091,536,665 which matches what the C code generates.
gcc supports 128 bit types, so you can fix this by casting one of the operands to __int128
to perform the math with 128 bits.
uint64_t c = (unsigned __int128)a * b / 10000000000LL;
Upvotes: 5