Reputation: 534
How would you efficiently implement 64 bit binary addition carrying from left to right? Like flipping the bits then adding:
0101 + 1110 = BitReverse(1010 + 0111)
= BitReverse(10001)
= 10001
My current solution is to reverse the bit order of the inputs using delta swaps (and possibly a byteswap intrinsic), use normal addition, then reverse again, which is not particularly fast, but probably better than looping for 64-bit integers. (I haven't tested it but it would still be too slow.)
uint64_t addreverse(uint64_t a, uint64_t b) {
    // Reverse both operands, add them normally, then reverse the sum back.
    uint64_t ra = BitReverse(a);
    uint64_t rb = BitReverse(b);
    return BitReverse(ra + rb);
}
This is quite slow, as the bits need to be reversed three times, requiring over 40 operations when using byteswap.
EDIT: I cannot store them reversed as I require regular addition as well.
Upvotes: 3
Views: 946
Reputation: 51835
I would use LUTs (lookup tables). Here is a small C++ 32-bit (DWORD = unsigned 32-bit integer
) example:
//---------------------------------------------------------------------------
// select fastest operation
#define BitReverse BitReverse1
#define AddReverse AddReverseKS
//---------------------------------------------------------------------------
DWORD BitReverseSlow(DWORD x) // Slow 1bit reversing
{
    // Shift bits out of x from the LSB end and shift them into the
    // result from the LSB end, which reverses their order.
    DWORD result = 0;
    for (int bit = 0; bit < 32; bit++)
    {
        result <<= 1;
        result |= x & 1;
        x >>= 1;
    }
    return result;
}
//---------------------------------------------------------------------------
DWORD BitReverse1(DWORD x) // parallel bit-swap reversing, no LUT
{
    // Reverse by swapping progressively smaller groups:
    // halves, bytes, nibbles, bit pairs, then single bits.
    x = (x >> 16) | (x << 16);                             // swap 16-bit halves
    x = ((x & 0xFF00FF00) >> 8) | ((x & 0x00FF00FF) << 8); // swap bytes within halves
    x = ((x & 0xF0F0F0F0) >> 4) | ((x & 0x0F0F0F0F) << 4); // swap nibbles within bytes
    x = ((x & 0xCCCCCCCC) >> 2) | ((x & 0x33333333) << 2); // swap bit pairs
    x = ((x & 0xAAAAAAAA) >> 1) | ((x & 0x55555555) << 1); // swap adjacent bits
    return x;
}
//---------------------------------------------------------------------------
DWORD BitReverse8(DWORD x) // fast 8bit LUT reversing
{
    static const BYTE BitReverse8_LUT[256]= // BitReverse8_LUT[x]=BitReverse(x) in 8bits
    {
    0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
    0x08,0x88,0x48,0xC8,0x28,0xA8,0x68,0xE8,0x18,0x98,0x58,0xD8,0x38,0xB8,0x78,0xF8,
    0x04,0x84,0x44,0xC4,0x24,0xA4,0x64,0xE4,0x14,0x94,0x54,0xD4,0x34,0xB4,0x74,0xF4,
    0x0C,0x8C,0x4C,0xCC,0x2C,0xAC,0x6C,0xEC,0x1C,0x9C,0x5C,0xDC,0x3C,0xBC,0x7C,0xFC,
    0x02,0x82,0x42,0xC2,0x22,0xA2,0x62,0xE2,0x12,0x92,0x52,0xD2,0x32,0xB2,0x72,0xF2,
    0x0A,0x8A,0x4A,0xCA,0x2A,0xAA,0x6A,0xEA,0x1A,0x9A,0x5A,0xDA,0x3A,0xBA,0x7A,0xFA,
    0x06,0x86,0x46,0xC6,0x26,0xA6,0x66,0xE6,0x16,0x96,0x56,0xD6,0x36,0xB6,0x76,0xF6,
    0x0E,0x8E,0x4E,0xCE,0x2E,0xAE,0x6E,0xEE,0x1E,0x9E,0x5E,0xDE,0x3E,0xBE,0x7E,0xFE,
    0x01,0x81,0x41,0xC1,0x21,0xA1,0x61,0xE1,0x11,0x91,0x51,0xD1,0x31,0xB1,0x71,0xF1,
    0x09,0x89,0x49,0xC9,0x29,0xA9,0x69,0xE9,0x19,0x99,0x59,0xD9,0x39,0xB9,0x79,0xF9,
    0x05,0x85,0x45,0xC5,0x25,0xA5,0x65,0xE5,0x15,0x95,0x55,0xD5,0x35,0xB5,0x75,0xF5,
    0x0D,0x8D,0x4D,0xCD,0x2D,0xAD,0x6D,0xED,0x1D,0x9D,0x5D,0xDD,0x3D,0xBD,0x7D,0xFD,
    0x03,0x83,0x43,0xC3,0x23,0xA3,0x63,0xE3,0x13,0x93,0x53,0xD3,0x33,0xB3,0x73,0xF3,
    0x0B,0x8B,0x4B,0xCB,0x2B,0xAB,0x6B,0xEB,0x1B,0x9B,0x5B,0xDB,0x3B,0xBB,0x7B,0xFB,
    0x07,0x87,0x47,0xC7,0x27,0xA7,0x67,0xE7,0x17,0x97,0x57,0xD7,0x37,0xB7,0x77,0xF7,
    0x0F,0x8F,0x4F,0xCF,0x2F,0xAF,0x6F,0xEF,0x1F,0x9F,0x5F,0xDF,0x3F,0xBF,0x7F,0xFF
    };
    // Reverse the bits of each byte via the LUT and simultaneously swap
    // the byte order: the low byte of x becomes the high byte of the
    // result and vice versa. Pure shift/mask form, no pointer punning.
    return (DWORD(BitReverse8_LUT[ x      &0xFF])<<24)
          |(DWORD(BitReverse8_LUT[(x>> 8) &0xFF])<<16)
          |(DWORD(BitReverse8_LUT[(x>>16) &0xFF])<< 8)
          |(DWORD(BitReverse8_LUT[(x>>24) &0xFF])    );
}
//---------------------------------------------------------------------------
DWORD AddReverseSlow(DWORD x,DWORD y) // reference: reverse, add, reverse back
{
    DWORD rx = BitReverse(x);
    DWORD ry = BitReverse(y);
    return BitReverse(rx + ry);
}
//---------------------------------------------------------------------------
DWORD AddReverse4(DWORD x,DWORD y) // 4bit-LUT reversed addition (carries run MSB -> LSB)
{
int i;
DWORD z,cy0,cy,a;
// Each entry is the reversed 4-bit sum shifted left by one bit;
// bit 0 of the entry is the carry flag into the NEXT (lower) nibble.
static const BYTE AddReverse4_LUT[16][16]= // AddReverse4_LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1)) in 4bits
{
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 },
{ 2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29 },
{ 4, 6, 2, 1, 12, 14, 10, 9, 20, 22, 18, 17, 28, 30, 26, 25 },
{ 6, 5, 1, 3, 14, 13, 9, 11, 22, 21, 17, 19, 30, 29, 25, 27 },
{ 8, 10, 12, 14, 4, 6, 2, 1, 24, 26, 28, 30, 20, 22, 18, 17 },
{ 10, 9, 14, 13, 6, 5, 1, 3, 26, 25, 30, 29, 22, 21, 17, 19 },
{ 12, 14, 10, 9, 2, 1, 6, 5, 28, 30, 26, 25, 18, 17, 22, 21 },
{ 14, 13, 9, 11, 1, 3, 5, 7, 30, 29, 25, 27, 17, 19, 21, 23 },
{ 16, 18, 20, 22, 24, 26, 28, 30, 8, 10, 12, 14, 4, 6, 2, 1 },
{ 18, 17, 22, 21, 26, 25, 30, 29, 10, 9, 14, 13, 6, 5, 1, 3 },
{ 20, 22, 18, 17, 28, 30, 26, 25, 12, 14, 10, 9, 2, 1, 6, 5 },
{ 22, 21, 17, 19, 30, 29, 25, 27, 14, 13, 9, 11, 1, 3, 5, 7 },
{ 24, 26, 28, 30, 20, 22, 18, 17, 4, 6, 2, 1, 12, 14, 10, 9 },
{ 26, 25, 30, 29, 22, 21, 17, 19, 6, 5, 1, 3, 14, 13, 9, 11 },
{ 28, 30, 26, 25, 18, 17, 22, 21, 2, 1, 6, 5, 10, 9, 14, 13 },
{ 30, 29, 25, 27, 17, 19, 21, 23, 1, 3, 5, 7, 9, 11, 13, 15 }
};
// Walk the nibbles from the most significant (i=28) down to the least,
// because in reversed addition the carry propagates left to right.
// cy0 is the carry coming in from the previous (higher) nibble.
for (cy0=0,z=0,i=28;i>=0;i-=4,cy0=cy)
{
a=AddReverse4_LUT[(x>>i)&15][(y>>i)&15]; cy=a&1; a>>=1; // add: look up reversed sum of the two nibbles, split off carry
if (cy0){ a=AddReverse4_LUT[8][a]; cy|=a&1; a>>=1; } // adc: 8 = the value 1 bit-reversed in 4 bits (1000b), i.e. add the pending carry
z|=a<<i;
}
return z;
}
//---------------------------------------------------------------------------
// Name fixed: was misspelled "AddReverese8_LUT", which did not match the
// initialization code (AddReverse8_LUT) and so would not compile/link.
WORD AddReverse8_LUT[256][256]; // LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1)) in 8bits
DWORD AddReverse8(DWORD x,DWORD y) // 8bit-LUT reversed addition (carries run MSB -> LSB)
{
int i;
DWORD z,cy0,cy,a;
BYTE *px=(BYTE*)&x;
BYTE *py=(BYTE*)&y;
BYTE *pz=(BYTE*)&z;
// Walk the bytes from index 3 down to 0; on little-endian targets byte 3
// is the most significant, so the carry propagates left to right as the
// reversed addition requires. cy0 is the carry from the previous byte.
for (cy0=0,z=0,i=3;i>=0;i--,cy0=cy)
{
a=AddReverse8_LUT[px[i]][py[i]]; cy=a&1; a>>=1; // add: reversed byte sum, bit 0 is the carry out
if (cy0){ a=AddReverse8_LUT[128][a]; cy|=a&1; a>>=1; } // adc: 128 = the value 1 bit-reversed in 8 bits (10000000b)
pz[i]=a;
}
return z;
}
//---------------------------------------------------------------------------
DWORD AddReverseKS(DWORD x,DWORD y) // https://en.wikipedia.org/wiki/Kogge–Stone_adder
{ // Adder from harold's answer ported to 32bit
    DWORD p = x ^ y;   // carry propagate
    DWORD g = x & y;   // carry generate
    // Parallel-prefix carry computation with the shift direction reversed;
    // log2(32) = 5 rounds, span doubling each time: 1, 2, 4, 8, 16.
    g |= p & (g >> 1);
    p &= p >> 1;
    g |= p & (g >> 2);
    p &= p >> 2;
    g |= p & (g >> 4);
    p &= p >> 4;
    g |= p & (g >> 8);
    p &= p >> 8;
    g |= p & (g >> 16);
    // g holds every carry-out; shift it onto the next bit and combine.
    return x ^ y ^ (g >> 1);
}
//---------------------------------------------------------------------------
The AddReverse8_LUT
is initialized like this:
for (DWORD x=0;x<256;x++)
for (DWORD y=0;y<256;y++)
AddReverse8_LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1));
Here measured times on mine setup:
8M x BitReverseSlow [ 506.585 ms]
8M x BitReverse1 [ 59.769 ms]
8M x BitReverse8 [ 72.355 ms]
16M x AddReverseSlow [ 611.677 ms]
16M x AddReverse4 [ 491.546 ms]
16M x AddReverse8 [ 347.293 ms]
16M x AddReverseKS [ 142.149 ms]
As you can see the bigger LUT the better speed at the cost of space. On top of that if the resolution is 8/16/32bit you can use BYTE access by pointer instead of slow bit shift/mask speeding up even more.
To be clear, the addition LUT[][]
table is encoded such that the sum is shifted left by 1 bit and the vacated bit holds the Carry flag for the next nibble ...
In the additions, when I am doing the adc
(incrementing by the carry flag), the operand for the LUT
is the value 1
bit-reversed at the LUT's bit resolution, so
1000bin = 8dec // 4 bits
10000000bin = 128dec // 8 bits
To port this to 64 bit just change DWORD
to your datatype and change the bit masks and iterations count to match 64 bit ...
Anyway, this reminds me of my old answer to:
However it looks like non LUT versions of BitReverse
are faster
After some digging, it looks like LUT usage with temp variables forces my compiler to compile my functions differently, with a great speed decrease (more than twice as slow). Once all functions are forced to use the same calling style, the LUT versions are slightly faster:
8M x BitReverseSlow [ 537.534 ms]
8M x BitReverse1 [ 99.804 ms] <- this is slowed down due to changing call style to the same as the BitReverse8
8M x BitReverse8 [ 80.725 ms]
so I suspect that for reasonably simple functions my compiler uses a register-based calling style (not using the heap/stack for operands and the return value).
So the result is that:
using BitReverse1
is faster. Also, harold's answer beats mine by far for the same reasons, plus it needs fewer operations ...
However, if you were to hardwire this (FPGA, on-die, or using gates), I suspect the LUT versions of BitReverse
would be faster (although you can hardwire the bit reversal directly, which is unbeatable).
Upvotes: 1
Reputation: 64904
Emulating a Kogge-Stone adder, with the shift direction reversed, gives a nice algorithm,
// Reversed-carry 64-bit addition, emulating a Kogge-Stone adder with the
// shift direction reversed: carries propagate from bit 63 (the "least
// significant" bit in the reversed view) toward bit 0. The result equals
// BitReverse(BitReverse(x) + BitReverse(y)) without any bit reversing.
// A carry out of bit 0 is discarded, matching 64-bit wraparound.
uint64_t addreverse(uint64_t x, uint64_t y)
{
    uint64_t p = x ^ y;  // carry propagate
    uint64_t g = x & y;  // carry generate
    // Parallel-prefix carry computation: log2(64) = 6 rounds,
    // span doubling each time (1, 2, 4, 8, 16, 32).
    g |= p & (g >> 1);
    p &= p >> 1;
    g |= p & (g >> 2);
    p &= p >> 2;
    g |= p & (g >> 4);
    p &= p >> 4;
    g |= p & (g >> 8);
    p &= p >> 8;
    g |= p & (g >> 16);
    p &= p >> 16;
    g |= p & (g >> 32);
    // g holds every carry-out; shift it onto the next (lower) bit and combine.
    return x ^ y ^ (g >> 1);
}
Upvotes: 3
Reputation: 27567
You want to do it bit-wise and keep track of carry. Something like:
// Reversed-carry addition: add bit-by-bit from bit 63 (the "least
// significant" bit in the reversed view) down to bit 0, propagating the
// carry toward the LSB. Equivalent to
// BitReverse(BitReverse(a) + BitReverse(b)); a carry out of bit 0 is
// discarded (64-bit wraparound in the reversed domain).
uint64_t addreverse(uint64_t a, uint64_t b) {
    uint64_t current_bit = 0x8000000000000000ull;
    unsigned int shift = 63;
    unsigned int carry = 0;
    uint64_t sum = 0;
    while (current_bit) {
        // Sum of the two input bits plus the incoming carry: 0..3.
        uint64_t bit_sum = ((a & current_bit) >> shift) +
                           ((b & current_bit) >> shift) + carry;
        // Low bit of bit_sum is the result bit at this position.
        sum |= (bit_sum & 1) << shift;
        // High bit is the carry into the NEXT (lower) position.
        // Bug fix: the original computed the carry only from the two input
        // bits and added ((bit_sum & 1) + carry) << shift, which pushed a
        // combined carry into the already-finalized HIGHER bit (2 << shift)
        // instead of carrying it down to the next lower bit.
        carry = (unsigned int)(bit_sum >> 1);
        current_bit >>= 1;
        --shift;
    }
    return sum;
}
Upvotes: 0