Reputation: 33679
GCC has 128-bit integers. Using these I can get the compiler to use the mul (or imul with only one operand) instructions. For example

uint64_t x, y;
unsigned __int128 z = (unsigned __int128)x*y;

produces mul. I have used this to create a 128x128 to 256 function (see the end of this question, before the update, for code for that if you're interested).
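As a minimal self-contained illustration (the wrapper name is mine), here is that idiom wrapped in a function; GCC compiles the __int128 product to a single mul:

#include <stdint.h>

/* returns the low 64 bits of x*y and stores the high 64 bits in *hi;
   GCC turns the __int128 product into one mul (RDX:RAX = hi:lo) */
uint64_t mul64x64_128(uint64_t x, uint64_t y, uint64_t *hi) {
    unsigned __int128 z = (unsigned __int128)x * y;
    *hi = (uint64_t)(z >> 64);
    return (uint64_t)z;
}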
Now I want to do 256-bit addition and I have not found a way to get the compiler to use ADC except by using assembly. I could use an assembler but I want inline functions for efficiency. The compiler already produces an efficient 128x128 to 256 function (for the reason I explained at the start of this question), so I don't see why I should rewrite this in assembly as well (or any other functions which the compiler already implements efficiently).
Here is the inline assembly function I have come up with:
#define ADD256(X1, X2, X3, X4, Y1, Y2, Y3, Y4) \
    __asm__ __volatile__ ( \
        "addq %[v1], %[u1] \n" \
        "adcq %[v2], %[u2] \n" \
        "adcq %[v3], %[u3] \n" \
        "adcq %[v4], %[u4] \n" \
        : [u1] "+&r" (X1), [u2] "+&r" (X2), [u3] "+&r" (X3), [u4] "+&r" (X4) \
        : [v1] "r" (Y1), [v2] "r" (Y2), [v3] "r" (Y3), [v4] "r" (Y4))
(probably not every output needs an early-clobber modifier, but I get the wrong result without at least the last two). (Editor's note: the last output isn't written until all inputs have been read, so it would be safe not to declare it early-clobber.)
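A hypothetical call site (the initializer values are mine), using the int256 union defined later in this question; the macro adds the Y limbs into the X limbs in place:

int256 x = { .x1 = ~0ULL, .x2 = ~0ULL, .x3 = 0, .x4 = 0 };  /* x = 2^128 - 1 */
int256 y = { .x1 = 1 };
ADD256(x.x1, x.x2, x.x3, x.x4, y.x1, y.x2, y.x3, y.x4);
/* x is now 2^128: x.x1 == 0, x.x2 == 0, x.x3 == 1, x.x4 == 0 */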
And here is a function which does the same thing in C
void add256(int256 *x, int256 *y) {
    uint64_t t1, t2;
    t1 = x->x1; x->x1 += y->x1;                   /* carry-out: x->x1 < t1 */
    t2 = x->x2; x->x2 += y->x2 + ((x->x1) < t1);
    t1 = x->x3; x->x3 += y->x3 + ((x->x2) < t2);
    x->x4 += y->x4 + ((x->x3) < t1);
    /* caveat: if a y-limb is all-ones and the incoming carry is 1,
       y->xN + carry wraps to 0 and that carry is silently lost */
}
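For comparison, a sketch (the function name is mine) that does the same addition through the union's 128-bit halves, defined below. GCC compiles each __int128 addition to an add/adc pair, but, at least historically, it still propagates the cross-half carry through a comparison and setc rather than chaining a third adc:

void add256_halves(int256 *x, const int256 *y) {
    uint128 lo = x->lo + y->lo;
    x->hi += (uint128)y->hi + (lo < x->lo);  /* carry out of the low half */
    x->lo = lo;
}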
Why is assembly necessary for this? Why can't the compiler compile the add256 function to use the carry flag? Is there a way to coerce the compiler to do this (e.g. can I change add256 so that it does this)? What is someone supposed to do with a compiler which does not support inline assembly (write all the functions in assembly)? Why is there no intrinsic for this?
Here is the 128x128 to 256 function
void muldwu128(int256 *w, uint128 u, uint128 v) {
    uint128 t;
    uint64_t u0, u1, v0, v1, k, w1, w2, w3;
    u0 = u >> 64L;                    /* high 64 bits of u */
    u1 = u;                           /* low 64 bits (truncated) */
    v0 = v >> 64L;                    /* high 64 bits of v */
    v1 = v;                           /* low 64 bits (truncated) */
    t = (uint128)u1*v1;               /* low(u) * low(v) */
    w3 = t;
    k = t >> 64L;                     /* carry into the next 64-bit column */
    t = (uint128)u0*v1 + k;           /* high(u) * low(v) + carry */
    w2 = t;
    w1 = t >> 64L;
    t = (uint128)u1*v0 + w2;          /* low(u) * high(v) + middle word */
    k = t >> 64L;
    w->hi = (uint128)u0*v0 + w1 + k;  /* high(u) * high(v) + accumulated carries */
    w->lo = (t << 64L) + w3;
}
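A hypothetical call site as a quick sanity check (the values are mine; int256 is defined just below):

int256 w;
muldwu128(&w, (uint128)1 << 64, (uint128)1 << 64);
/* (2^64)*(2^64) = 2^128, so w.lo == 0 and w.hi == 1 */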
Some type definitions:
typedef __int128 int128;
typedef unsigned __int128 uint128;

typedef union {
    struct {
        uint64_t x1;
        uint64_t x2;
        int64_t x3;
        int64_t x4;
    };
    struct {
        uint128 lo;
        int128 hi;
    };
} int256;
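Note that overlaying the four limbs with the two 128-bit halves relies on x86-64's little-endian layout: x1 aliases the low 64 bits of lo (type-punning through a union is defined behavior in GCC). A quick sketch of the correspondence (the values are mine):

int256 a = { .lo = ((uint128)2 << 64) | 1 };
/* on little-endian x86-64: a.x1 == 1 and a.x2 == 2 */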
Update:
My question is largely a duplicate of these questions:
Intel has a good article (New Instructions Support Large Integer Arithmetic) which discusses large integer arithmetic and the three new instructions MULX, ADCX, ADOX. They write:
intrinsic definitions of mulx, adcx and adox will also be integrated into compilers. This is the first example of an “add with carry” type instruction being implemented with intrinsics. The intrinsic support will enable users to implement large integer arithmetic using higher level programming languages such as C/C++.
The intrinsics are
unsigned __int64 umul128(unsigned __int64 a, unsigned __int64 b, unsigned __int64 * hi);
unsigned char _addcarry_u64(unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 *out);
unsigned char _addcarryx_u64(unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 *out);
Incidentally, MSVC already has a _umul128 intrinsic. So even though MSVC does not have __int128, the _umul128 intrinsic can be used to generate mul and therefore 128-bit multiplication.
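A sketch of that (the wrapper name is mine; _umul128 is declared in <intrin.h>):

#include <stdint.h>
#include <intrin.h>

/* MSVC: full 64x64 -> 128 product without __int128 */
void mul64x64_msvc(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi) {
    *lo = _umul128(a, b, hi);  /* returns the low half, writes the high half */
}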
MULX is in BMI2 (Haswell). The ADCX and ADOX instructions are available since Broadwell, as the ADX extension. It's too bad there is no intrinsic for ADC, which has been available since the 8086 in 1979. That would solve the inline assembly problem.
(Editor's note: Intel's intrinsics guide does define _addcarry_u64 for baseline x86-64, but perhaps not all compilers implemented it. However, gcc typically compiles it and/or _addcarryx inefficiently, often spilling CF to an integer with setc instead of ordering instructions better.)
GCC's __int128 codegen will use mulx if BMI2 is enabled (e.g. using -mbmi2 or -march=haswell).
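For example (a sketch, the function name is mine):

#include <stdint.h>

/* built with gcc -O2 -mbmi2 (or -march=haswell) this compiles to mulx;
   unlike mul, mulx leaves the flags untouched */
unsigned __int128 mul128(uint64_t x, uint64_t y) {
    return (unsigned __int128)x * y;
}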
Edit:
I tried Clang's add-with-carry builtins, as suggested by Lưu Vĩnh Phúc:
void add256(int256 *x, int256 *y) {
    unsigned long long carryin = 0, carryout;
    x->x1 = __builtin_addcll(x->x1, y->x1, carryin, &carryout); carryin = carryout;
    x->x2 = __builtin_addcll(x->x2, y->x2, carryin, &carryout); carryin = carryout;
    x->x3 = __builtin_addcll(x->x3, y->x3, carryin, &carryout); carryin = carryout;
    x->x4 = __builtin_addcll(x->x4, y->x4, carryin, &carryout);
}
but this does not generate ADC and the result is more complicated than I expected.
Upvotes: 17
Views: 4564
Reputation: 570
Additions and carries: _addcarry_u64, _subborrow_u64, and _addcarryx_u64 generate sub-optimal code. This thread is 8 years old and the topic is still relevant.
Curiously, clang from 10.0.0 to 16.0.0 with -O3 generates the optimal code from the following code (but the latest clang, 17.0.0, misses it):
#include <stdint.h>
#include <stdbool.h>
#include <x86intrin.h>

typedef unsigned __int128 uint128_t;

typedef union {
    struct {
        uint64_t x1;
        uint64_t x2;
        uint64_t x3;
        uint64_t x4;
    };
    struct {
        uint128_t lo;
        uint128_t hi;
    };
} uint256;
/* full adder on one 64-bit limb: *r = x + y + b, returns the carry-out */
bool addc(uint64_t *r, uint64_t x, uint64_t y, bool b)
{
    uint64_t t1 = x + y;
    bool c = (t1 < x);       /* carry from x + y */
    uint64_t t2 = t1 + b;
    c |= (t2 < t1);          /* carry from adding the carry-in */
    *r = t2;
    return c;
}

bool add128(uint64_t *r_lo, uint64_t *r_hi, uint64_t x_lo, uint64_t x_hi, uint64_t y_lo, uint64_t y_hi)
{
    bool c = addc(r_lo, x_lo, y_lo, 0);
    return addc(r_hi, x_hi, y_hi, c);
}

bool add256(uint256 & r, const uint256 & x, const uint256 & y)
{
    bool c = addc(&r.x1, x.x1, y.x1, 0);
    c = addc(&r.x2, x.x2, y.x2, c);
    c = addc(&r.x3, x.x3, y.x3, c);
    return addc(&r.x4, x.x4, y.x4, c);
}
is compiled as follows, with the expected carry propagation:
addc(unsigned long*, unsigned long, unsigned long, bool):
        add     cl, -1
        adc     rsi, rdx
        setb    al
        mov     qword ptr [rdi], rsi
        ret

add128(unsigned long*, unsigned long*, unsigned long, unsigned long, unsigned long, unsigned long):
        add     rdx, r8
        adc     rcx, r9
        mov     qword ptr [rdi], rdx
        setb    al
        mov     qword ptr [rsi], rcx
        ret

add256(uint256&, uint256 const&, uint256 const&):
        mov     rax, qword ptr [rdx]
        add     rax, qword ptr [rsi]
        mov     qword ptr [rdi], rax
        mov     rax, qword ptr [rdx + 8]
        adc     rax, qword ptr [rsi + 8]
        mov     qword ptr [rdi + 8], rax
        mov     rax, qword ptr [rdx + 16]
        adc     rax, qword ptr [rsi + 16]
        mov     qword ptr [rdi + 16], rax
        mov     rcx, qword ptr [rdx + 24]
        adc     rcx, qword ptr [rsi + 24]
        setb    al
        mov     qword ptr [rdi + 24], rcx
        ret
Upvotes: 2
Reputation: 33679
I found a solution with ICC 13.0.01 using the _addcarry_u64 intrinsic:
void add256(uint256 *x, uint256 *y) {
    unsigned char c = 0;
    c = _addcarry_u64(c, x->x1, y->x1, &x->x1);
    c = _addcarry_u64(c, x->x2, y->x2, &x->x2);
    c = _addcarry_u64(c, x->x3, y->x3, &x->x3);
    _addcarry_u64(c, x->x4, y->x4, &x->x4);
}
produces
L__routine_start_add256_0:
add256:
        xorl      %r9d, %r9d        #25.9
        movq      (%rsi), %rax      #22.9
        addq      %rax, (%rdi)      #22.9
        movq      8(%rsi), %rdx     #23.9
        adcq      %rdx, 8(%rdi)     #23.9
        movq      16(%rsi), %rcx    #24.9
        adcq      %rcx, 16(%rdi)    #24.9
        movq      24(%rsi), %r8     #25.9
        adcq      %r8, 24(%rdi)     #25.9
        setb      %r9b              #25.9
        ret                         #26.1
I compiled with -O3. I don't know how to enable adx with ICC. Maybe I need ICC 14?
That's exactly one addq and three adcq, as I expected.
With Clang the result using -O3 -madx is a mess:
add256(uint256*, uint256*):        # @add256(uint256*, uint256*)
        movq    (%rsi), %rax
        xorl    %ecx, %ecx
        xorl    %edx, %edx
        addb    $-1, %dl
        adcq    %rax, (%rdi)
        addb    $-1, %cl
        movq    (%rdi), %rcx
        adcxq   %rax, %rcx
        setb    %al
        movq    8(%rsi), %rcx
        movb    %al, %dl
        addb    $-1, %dl
        adcq    %rcx, 8(%rdi)
        addb    $-1, %al
        movq    8(%rdi), %rax
        adcxq   %rcx, %rax
        setb    %al
        movq    16(%rsi), %rcx
        movb    %al, %dl
        addb    $-1, %dl
        adcq    %rcx, 16(%rdi)
        addb    $-1, %al
        movq    16(%rdi), %rax
        adcxq   %rcx, %rax
        setb    %al
        movq    24(%rsi), %rcx
        addb    $-1, %al
        adcq    %rcx, 24(%rdi)
        retq
Without enabling -madx in Clang the result is not much better.
Edit:
Apparently MSVC already has _addcarry_u64. I tried it and it's as good as ICC (one add and three adc).
Upvotes: 4