Reputation: 1138
In my spare time I've been working on a utility library that among other things, supports signed/unsigned 128-bit integers. This library uses cpu-dispatching to utilize simd instructions in some cases, but requires a portable fallback so it will run everywhere else. Most recently I implemented the portable fallback for 128-bit shifting. It works correctly and runs reasonably fast, but it's not as fast as I'd like it to be, especially on 32-bit architectures.
Here's a stripped version with all relevant types and functionality(64-bit version included for completeness):
typedef uint32_t UInt32;
typedef int32_t Int32;
typedef uint64_t UInt64;
typedef int64_t Int64;
// Returns 0xFFFFFFFF if value != 0, otherwise returns 0.
UInt32 AllOrNothingMask32(Int32 value)
{
return UInt32(-Int32(value != 0));
}
struct alignas(16) UInt128
{
// Ensure the layout matches the architecture.
// LE = little endian
// BE = big endian
#if CPU_TYPE == CPU_LE32
UInt32 mLow;
UInt32 mLowMid;
UInt32 mHighMid;
UInt32 mHigh;
#elif CPU_TYPE == CPU_BE32
UInt32 mHigh;
UInt32 mHighMid;
UInt32 mLowMid;
UInt32 mLow;
#elif CPU_TYPE == CPU_LE64
UInt64 mLow;
UInt64 mHigh;
#elif CPU_TYPE == CPU_BE64
UInt64 mHigh;
UInt64 mLow;
#endif
UInt128() = default;
UInt128& operator=(const UInt128& other) = default;
inline
UInt128(UInt32 high, UInt32 highMid, UInt32 lowMid, UInt32 low) :
#if CPU_SIZE == CPU_32BIT
mLow(low),
mLowMid(lowMid),
mHighMid(highMid),
mHigh(high) { }
#elif CPU_SIZE == CPU_64BIT
mLow((UInt64(lowMid) << 32) | low),
mHigh((UInt64(high) << 32) | highMid) { }
#endif
inline
UInt128(UInt64 high, UInt64 low) :
#if CPU_SIZE == CPU_32BIT
mLow(UInt32(low)),
mLowMid(UInt32(low >> 32)),
mHighMid(UInt32(high)),
mHigh(UInt32(high >> 32)) { }
#elif CPU_SIZE == CPU_64BIT
mLow(low),
mHigh(high) { }
#endif
inline
bool UInt128::operator==(const UInt128& other) const noexcept
{
#if CPU_TYPE == CPU_32BIT
return mLow == other.mLow &&
mLowMid == other.mLowMid &&
mHighMid == other.mHighMid &&
mHigh == other.mHigh;
#elif CPU_TYPE == CPU_64BIT
return mLow == other.mLow &&
mHigh == other.mHigh;
#endif
}
inline
UInt128& UInt128::operator<<=(Int32 shift) noexcept
{
// Shift is modulo 128, effectively clamping it between 0-127.
shift &= 0x7F;
#if CPU_SIZE == CPU_32BIT
auto low = mLow;
auto lowMid = mLowMid;
auto highMid = mHighMid;
auto high = mHigh;
if (shift == 0) {
return *this;
} else if (shift < 32) {
auto rshift = 32 - shift;
mLow = (low << shift);
mLowMid = (lowMid << shift) | (low >> rshift);
mHighMid = (highMid << shift) | (lowMid >> rshift);
mHigh = (high << shift) | (highMid >> rshift);
} else if (shift < 64) {
auto lshift = (shift - 32);
auto rshift = (32 - lshift) & 0x1F;
auto rshiftMask = AllOrNothingMask32(rshift);
mLow = 0;
mLowMid = (low << lshift);
mHighMid = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
mHigh = (highMid << lshift) | ((lowMid >> rshift) & rshiftMask);
} else if (shift < 96) {
auto lshift = (shift - 64);
auto rshift = (64 - lshift) & 0x1F;
auto rshiftMask = AllOrNothingMask32(rshift);
mLow = 0;
mLowMid = 0;
mHighMid = (low << lshift);
mHigh = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
} else {
mLow = 0;
mLowMid = 0;
mHighMid = 0;
mHigh = (low << (shift - 96));
}
#elif CPU_SIZE == CPU_64BIT
auto low = mLow,
high = mHigh;
if (shift == 0) {
return *this;
} else if (shift < 64) {
mLow = (low << shift);
mHigh = (high << shift) | (low >> (64 - shift));
} else {
mLow = 0;
mHigh = (low << (shift - 64));
}
#endif
return *this;
}
inline
UInt128& UInt128::operator>>=(Int32 shift) noexcept
{
// Shift is modulo 128, effectively clamping it between 0-127.
shift &= 0x7F;
#if CPU_SIZE == CPU_32BIT
auto low = mLow,
lowMid = mLowMid,
highMid = mHighMid,
high = mHigh;
if (shift == 0) {
return *this;
} else if (shift < 32) {
auto rshift = 32 - shift;
mLow = (low >> shift) | (lowMid << rshift);
mLowMid = (lowMid >> shift) | (highMid << rshift);
mHighMid = (highMid >> shift) | (high << rshift);
mHigh = (high >> shift);
} else if (shift < 64) {
auto rshift = (shift - 32);
auto lshift = (32 - rshift) & 0x1F;
auto lshiftMask = AllOrNothingMask32(lshift);
mLow = (lowMid >> rshift) | ((highMid << lshift) & lshiftMask);
mLowMid = (highMid >> rshift) | ((high << lshift) & lshiftMask);
mHighMid = (high >> rshift);
mHigh = 0;
} else if (shift < 96) {
auto rshift = (shift - 64);
auto lshift = (64 - rshift) & 0x1F;
auto lshiftMask = AllOrNothingMask32(lshift);
mLow = (highMid >> rshift) | ((high << lshift) & lshiftMask);
mLowMid = (high >> rshift);
mHighMid = 0;
mHigh = 0;
} else {
mLow = (high >> (shift - 96));
mLowMid = 0;
mHighMid = 0;
mHigh = 0;
}
#elif CPU_SIZE == CPU_64BIT
auto low = mLow,
high = mHigh;
if (shift == 0) {
return *this;
} else if (shift < 64) {
mLow = (low >> shift) | (high << (64 - shift));
mHigh = (high >> shift);
} else {
mLow = (high >> (shift - 64));
mHigh = 0;
}
#endif
return *this;
}
};
The relevant 32-bit assembly output is rather long, so unless requested I'm omitting it.
The main bottleneck is obviously branching when the shift argument isn't known at compile time. What can be done to eliminate branching, or for that matter what portable trickery can be utilized to speed this up?
Update 1
Added the copy assignment operator that was missing in the above example. For those interested, here are the unit tests. I'm using Catch for it's simplicity.
// Left shift lookup table, from 0-127.
const UInt128 gLeftShiftLut128[] = {
UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
UInt128(0xFFDDBB9977553310, 0xEECCAA8866442200),
UInt128(0xFFBB7732EEAA6621, 0xDD995510CC884400),
UInt128(0xFF76EE65DD54CC43, 0xBB32AA2199108800),
UInt128(0xFEEDDCCBBAA99887, 0x7665544332211000),
UInt128(0xFDDBB9977553310E, 0xECCAA88664422000),
UInt128(0xFBB7732EEAA6621D, 0xD995510CC8844000),
UInt128(0xF76EE65DD54CC43B, 0xB32AA21991088000),
UInt128(0xEEDDCCBBAA998877, 0x6655443322110000),
UInt128(0xDDBB9977553310EE, 0xCCAA886644220000),
UInt128(0xBB7732EEAA6621DD, 0x995510CC88440000),
UInt128(0x76EE65DD54CC43BB, 0x32AA219910880000),
UInt128(0xEDDCCBBAA9988776, 0x6554433221100000),
UInt128(0xDBB9977553310EEC, 0xCAA8866442200000),
UInt128(0xB7732EEAA6621DD9, 0x95510CC884400000),
UInt128(0x6EE65DD54CC43BB3, 0x2AA2199108800000),
UInt128(0xDDCCBBAA99887766, 0x5544332211000000),
UInt128(0xBB9977553310EECC, 0xAA88664422000000),
UInt128(0x7732EEAA6621DD99, 0x5510CC8844000000),
UInt128(0xEE65DD54CC43BB32, 0xAA21991088000000),
UInt128(0xDCCBBAA998877665, 0x5443322110000000),
UInt128(0xB9977553310EECCA, 0xA886644220000000),
UInt128(0x732EEAA6621DD995, 0x510CC88440000000),
UInt128(0xE65DD54CC43BB32A, 0xA219910880000000),
UInt128(0xCCBBAA9988776655, 0x4433221100000000),
UInt128(0x9977553310EECCAA, 0x8866442200000000),
UInt128(0x32EEAA6621DD9955, 0x10CC884400000000),
UInt128(0x65DD54CC43BB32AA, 0x2199108800000000),
UInt128(0xCBBAA99887766554, 0x4332211000000000),
UInt128(0x977553310EECCAA8, 0x8664422000000000),
UInt128(0x2EEAA6621DD99551, 0xCC8844000000000),
UInt128(0x5DD54CC43BB32AA2, 0x1991088000000000),
UInt128(0xBBAA998877665544, 0x3322110000000000),
UInt128(0x77553310EECCAA88, 0x6644220000000000),
UInt128(0xEEAA6621DD995510, 0xCC88440000000000),
UInt128(0xDD54CC43BB32AA21, 0x9910880000000000),
UInt128(0xBAA9988776655443, 0x3221100000000000),
UInt128(0x7553310EECCAA886, 0x6442200000000000),
UInt128(0xEAA6621DD995510C, 0xC884400000000000),
UInt128(0xD54CC43BB32AA219, 0x9108800000000000),
UInt128(0xAA99887766554433, 0x2211000000000000),
UInt128(0x553310EECCAA8866, 0x4422000000000000),
UInt128(0xAA6621DD995510CC, 0x8844000000000000),
UInt128(0x54CC43BB32AA2199, 0x1088000000000000),
UInt128(0xA998877665544332, 0x2110000000000000),
UInt128(0x53310EECCAA88664, 0x4220000000000000),
UInt128(0xA6621DD995510CC8, 0x8440000000000000),
UInt128(0x4CC43BB32AA21991, 0x880000000000000),
UInt128(0x9988776655443322, 0x1100000000000000),
UInt128(0x3310EECCAA886644, 0x2200000000000000),
UInt128(0x6621DD995510CC88, 0x4400000000000000),
UInt128(0xCC43BB32AA219910, 0x8800000000000000),
UInt128(0x9887766554433221, 0x1000000000000000),
UInt128(0x310EECCAA8866442, 0x2000000000000000),
UInt128(0x621DD995510CC884, 0x4000000000000000),
UInt128(0xC43BB32AA2199108, 0x8000000000000000),
UInt128(0x8877665544332211, 0x0),
UInt128(0x10EECCAA88664422, 0x0),
UInt128(0x21DD995510CC8844, 0x0),
UInt128(0x43BB32AA21991088, 0x0),
UInt128(0x8776655443322110, 0x0),
UInt128(0xEECCAA886644220 , 0x0),
UInt128(0x1DD995510CC88440, 0x0),
UInt128(0x3BB32AA219910880, 0x0),
UInt128(0x7766554433221100, 0x0),
UInt128(0xEECCAA8866442200, 0x0),
UInt128(0xDD995510CC884400, 0x0),
UInt128(0xBB32AA2199108800, 0x0),
UInt128(0x7665544332211000, 0x0),
UInt128(0xECCAA88664422000, 0x0),
UInt128(0xD995510CC8844000, 0x0),
UInt128(0xB32AA21991088000, 0x0),
UInt128(0x6655443322110000, 0x0),
UInt128(0xCCAA886644220000, 0x0),
UInt128(0x995510CC88440000, 0x0),
UInt128(0x32AA219910880000, 0x0),
UInt128(0x6554433221100000, 0x0),
UInt128(0xCAA8866442200000, 0x0),
UInt128(0x95510CC884400000, 0x0),
UInt128(0x2AA2199108800000, 0x0),
UInt128(0x5544332211000000, 0x0),
UInt128(0xAA88664422000000, 0x0),
UInt128(0x5510CC8844000000, 0x0),
UInt128(0xAA21991088000000, 0x0),
UInt128(0x5443322110000000, 0x0),
UInt128(0xA886644220000000, 0x0),
UInt128(0x510CC88440000000, 0x0),
UInt128(0xA219910880000000, 0x0),
UInt128(0x4433221100000000, 0x0),
UInt128(0x8866442200000000, 0x0),
UInt128(0x10CC884400000000, 0x0),
UInt128(0x2199108800000000, 0x0),
UInt128(0x4332211000000000, 0x0),
UInt128(0x8664422000000000, 0x0),
UInt128(0xCC8844000000000 , 0x0),
UInt128(0x1991088000000000, 0x0),
UInt128(0x3322110000000000, 0x0),
UInt128(0x6644220000000000, 0x0),
UInt128(0xCC88440000000000, 0x0),
UInt128(0x9910880000000000, 0x0),
UInt128(0x3221100000000000, 0x0),
UInt128(0x6442200000000000, 0x0),
UInt128(0xC884400000000000, 0x0),
UInt128(0x9108800000000000, 0x0),
UInt128(0x2211000000000000, 0x0),
UInt128(0x4422000000000000, 0x0),
UInt128(0x8844000000000000, 0x0),
UInt128(0x1088000000000000, 0x0),
UInt128(0x2110000000000000, 0x0),
UInt128(0x4220000000000000, 0x0),
UInt128(0x8440000000000000, 0x0),
UInt128(0x880000000000000 , 0x0),
UInt128(0x1100000000000000, 0x0),
UInt128(0x2200000000000000, 0x0),
UInt128(0x4400000000000000, 0x0),
UInt128(0x8800000000000000, 0x0),
UInt128(0x1000000000000000, 0x0),
UInt128(0x2000000000000000, 0x0),
UInt128(0x4000000000000000, 0x0),
UInt128(0x8000000000000000, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0)
};
// Right shift lookup table, from 0-127.
const UInt128 gRightShiftLut128[] = {
UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
UInt128(0x7FF76EE65DD54CC4, 0x3BB32AA219910880),
UInt128(0x3FFBB7732EEAA662, 0x1DD995510CC88440),
UInt128(0x1FFDDBB997755331, 0xEECCAA886644220),
UInt128(0xFFEEDDCCBBAA998, 0x8776655443322110),
UInt128(0x7FF76EE65DD54CC, 0x43BB32AA21991088),
UInt128(0x3FFBB7732EEAA66, 0x21DD995510CC8844),
UInt128(0x1FFDDBB99775533, 0x10EECCAA88664422),
UInt128(0xFFEEDDCCBBAA99, 0x8877665544332211),
UInt128(0x7FF76EE65DD54C, 0xC43BB32AA2199108),
UInt128(0x3FFBB7732EEAA6, 0x621DD995510CC884),
UInt128(0x1FFDDBB9977553, 0x310EECCAA8866442),
UInt128(0xFFEEDDCCBBAA9, 0x9887766554433221),
UInt128(0x7FF76EE65DD54, 0xCC43BB32AA219910),
UInt128(0x3FFBB7732EEAA, 0x6621DD995510CC88),
UInt128(0x1FFDDBB997755, 0x3310EECCAA886644),
UInt128(0xFFEEDDCCBBAA, 0x9988776655443322),
UInt128(0x7FF76EE65DD5, 0x4CC43BB32AA21991),
UInt128(0x3FFBB7732EEA, 0xA6621DD995510CC8),
UInt128(0x1FFDDBB99775, 0x53310EECCAA88664),
UInt128(0xFFEEDDCCBBA, 0xA998877665544332),
UInt128(0x7FF76EE65DD, 0x54CC43BB32AA2199),
UInt128(0x3FFBB7732EE, 0xAA6621DD995510CC),
UInt128(0x1FFDDBB9977, 0x553310EECCAA8866),
UInt128(0xFFEEDDCCBB, 0xAA99887766554433),
UInt128(0x7FF76EE65D, 0xD54CC43BB32AA219),
UInt128(0x3FFBB7732E, 0xEAA6621DD995510C),
UInt128(0x1FFDDBB997, 0x7553310EECCAA886),
UInt128(0xFFEEDDCCB, 0xBAA9988776655443),
UInt128(0x7FF76EE65, 0xDD54CC43BB32AA21),
UInt128(0x3FFBB7732, 0xEEAA6621DD995510),
UInt128(0x1FFDDBB99, 0x77553310EECCAA88),
UInt128(0xFFEEDDCC, 0xBBAA998877665544),
UInt128(0x7FF76EE6, 0x5DD54CC43BB32AA2),
UInt128(0x3FFBB773, 0x2EEAA6621DD99551),
UInt128(0x1FFDDBB9, 0x977553310EECCAA8),
UInt128(0xFFEEDDC, 0xCBBAA99887766554),
UInt128(0x7FF76EE, 0x65DD54CC43BB32AA),
UInt128(0x3FFBB77, 0x32EEAA6621DD9955),
UInt128(0x1FFDDBB, 0x9977553310EECCAA),
UInt128(0xFFEEDD, 0xCCBBAA9988776655),
UInt128(0x7FF76E, 0xE65DD54CC43BB32A),
UInt128(0x3FFBB7, 0x732EEAA6621DD995),
UInt128(0x1FFDDB, 0xB9977553310EECCA),
UInt128(0xFFEED, 0xDCCBBAA998877665),
UInt128(0x7FF76, 0xEE65DD54CC43BB32),
UInt128(0x3FFBB, 0x7732EEAA6621DD99),
UInt128(0x1FFDD, 0xBB9977553310EECC),
UInt128(0xFFEE, 0xDDCCBBAA99887766),
UInt128(0x7FF7, 0x6EE65DD54CC43BB3),
UInt128(0x3FFB, 0xB7732EEAA6621DD9),
UInt128(0x1FFD, 0xDBB9977553310EEC),
UInt128(0xFFE, 0xEDDCCBBAA9988776),
UInt128(0x7FF, 0x76EE65DD54CC43BB),
UInt128(0x3FF, 0xBB7732EEAA6621DD),
UInt128(0x1FF, 0xDDBB9977553310EE),
UInt128(0xFF, 0xEEDDCCBBAA998877),
UInt128(0x7F, 0xF76EE65DD54CC43B),
UInt128(0x3F, 0xFBB7732EEAA6621D),
UInt128(0x1F, 0xFDDBB9977553310E),
UInt128(0xF, 0xFEEDDCCBBAA99887),
UInt128(0x7, 0xFF76EE65DD54CC43),
UInt128(0x3, 0xFFBB7732EEAA6621),
UInt128(0x1, 0xFFDDBB9977553310),
UInt128(0x0, 0xFFEEDDCCBBAA9988),
UInt128(0x0, 0x7FF76EE65DD54CC4),
UInt128(0x0, 0x3FFBB7732EEAA662),
UInt128(0x0, 0x1FFDDBB997755331),
UInt128(0x0, 0xFFEEDDCCBBAA998),
UInt128(0x0, 0x7FF76EE65DD54CC),
UInt128(0x0, 0x3FFBB7732EEAA66),
UInt128(0x0, 0x1FFDDBB99775533),
UInt128(0x0, 0xFFEEDDCCBBAA99),
UInt128(0x0, 0x7FF76EE65DD54C),
UInt128(0x0, 0x3FFBB7732EEAA6),
UInt128(0x0, 0x1FFDDBB9977553),
UInt128(0x0, 0xFFEEDDCCBBAA9),
UInt128(0x0, 0x7FF76EE65DD54),
UInt128(0x0, 0x3FFBB7732EEAA),
UInt128(0x0, 0x1FFDDBB997755),
UInt128(0x0, 0xFFEEDDCCBBAA),
UInt128(0x0, 0x7FF76EE65DD5),
UInt128(0x0, 0x3FFBB7732EEA),
UInt128(0x0, 0x1FFDDBB99775),
UInt128(0x0, 0xFFEEDDCCBBA),
UInt128(0x0, 0x7FF76EE65DD),
UInt128(0x0, 0x3FFBB7732EE),
UInt128(0x0, 0x1FFDDBB9977),
UInt128(0x0, 0xFFEEDDCCBB),
UInt128(0x0, 0x7FF76EE65D),
UInt128(0x0, 0x3FFBB7732E),
UInt128(0x0, 0x1FFDDBB997),
UInt128(0x0, 0xFFEEDDCCB),
UInt128(0x0, 0x7FF76EE65),
UInt128(0x0, 0x3FFBB7732),
UInt128(0x0, 0x1FFDDBB99),
UInt128(0x0, 0xFFEEDDCC),
UInt128(0x0, 0x7FF76EE6),
UInt128(0x0, 0x3FFBB773),
UInt128(0x0, 0x1FFDDBB9),
UInt128(0x0, 0xFFEEDDC),
UInt128(0x0, 0x7FF76EE),
UInt128(0x0, 0x3FFBB77),
UInt128(0x0, 0x1FFDDBB),
UInt128(0x0, 0xFFEEDD),
UInt128(0x0, 0x7FF76E),
UInt128(0x0, 0x3FFBB7),
UInt128(0x0, 0x1FFDDB),
UInt128(0x0, 0xFFEED),
UInt128(0x0, 0x7FF76),
UInt128(0x0, 0x3FFBB),
UInt128(0x0, 0x1FFDD),
UInt128(0x0, 0xFFEE),
UInt128(0x0, 0x7FF7),
UInt128(0x0, 0x3FFB),
UInt128(0x0, 0x1FFD),
UInt128(0x0, 0xFFE),
UInt128(0x0, 0x7FF),
UInt128(0x0, 0x3FF),
UInt128(0x0, 0x1FF),
UInt128(0x0, 0xFF),
UInt128(0x0, 0x7F),
UInt128(0x0, 0x3F),
UInt128(0x0, 0x1F),
UInt128(0x0, 0xF),
UInt128(0x0, 0x7),
UInt128(0x0, 0x3),
UInt128(0x0, 0x1)
};
TEST_CASE("UInt128 left shift produces correct results.") {
auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100);
for (auto i = 1; i <= 127; i++) {
auto sample = base;
sample <<= i;
INFO("i = " << i);
REQUIRE(sample == gLeftShiftLut128[i]);
}
}
TEST_CASE("UInt128 right shift produces correct results.") {
auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100);
for (auto i = 0; i <= 127; i++) {
auto sample = base;
sample >>= i;
INFO("i = " << i);
REQUIRE(sample == gRightShiftLut128[i]);
}
}
Upvotes: 5
Views: 804
Reputation: 22378
I would think a UInt128
would be better implemented using an array, where the endianness is not an issue, e.g.,
alignas (16) uint32_t data[4];
or: alignas (16) uint64_t data[2];
Note that alignment will not be guaranteed for objects created on the heap; though some ABIs do have 16 byte minimum alignment. You can check with alignof(std::max_align_t)
. If not, you will need to replace the global operator new and delete functions for SIMD (e.g., SSE).
For uint32_t
implementation, you split the shift into 'word' and 'bit' shifts - also, it doesn't make sense to have a signed Int32 as a shift count...
inline UInt128 &
UInt128::operator <<= (uint32_t shift) noexcept
{
shift &= 0x7f;
auto shw = shift / (32); // or (shift >> 5)
auto shl = shift % (32); // or (shift & 1f)
// branch-free shift masking:
uint32_t shm = shl - 1;
uint32_t shr = (- shl) & (32 - 1);
shm = (shm >> (32 - 1)) - 1; // 0xffffffff or 0x0
switch (shw)
{
case (3) :
data[3] = (data[0] << shl);
data[2] = 0, data[1] = 0, data[0] = 0;
break;
case (2) :
data[3] = (data[1] << shl) | ((data[0] >> shr) & shm);
data[2] = (data[0] << shl);
data[1] = 0, data[0] = 0;
break;
case (1) :
data[3] = (data[2] << shl) | ((data[1] >> shr) & shm);
data[2] = (data[1] << shl) | ((data[0] >> shr) & shm);
data[1] = (data[0] << shl);
data[0] = 0;
break;
case (0) : // default:
data[3] = (data[3] << shl) | ((data[2] >> shr) & shm);
data[2] = (data[2] << shl) | ((data[1] >> shr) & shm);
data[1] = (data[1] << shl) | ((data[0] >> shr) & shm);
data[0] = (data[0] << shl);
// break;
}
return *this;
}
I'm pretty sure I have the data indices correct here. If shift
is a compile time constant, the compiler should be able to optimize this code pretty aggressively.
I'll leave right shift to you, except you will have to update data
from low to high words, so as to not overwrite the words before they are read. Otherwise, it should mostly be a simple exercise in swapping the roles of shl
and shr
. A uint64_t
data version should be pretty straightforward.
Upvotes: 1
Reputation: 204994
I haven't benchmarked it, but something like this is branchless:
inline
UInt128& UInt128::operator<<=(Int32 shift) noexcept
{
auto lshift = shift & 31;
auto rshift = 31 - lshift;
UInt32 parts[8] = {
#if CPU_TYPE == CPU_LE32
0, 0, 0, 0,
mLow << lshift,
mLowMid << lshift | mLow >> 1 >> rshift,
mHighMid << lshift | mLowMid >> 1 >> rshift,
mHigh << lshift | mHighMid >> 1 >> rshift
#elif CPU_TYPE == CPU_BE32
mHigh << lshift | mHighMid >> 1 >> rshift,
mHighMid << lshift | mLowMid >> 1 >> rshift,
mLowMid << lshift | mLow >> 1 >> rshift,
mLow << lshift,
0, 0, 0, 0
#endif
};
memcpy(this, &parts[
#if CPU_TYPE == CPU_LE32
4 -
#endif
(shift >> 5 & 3)], 16);
return *this;
}
inline
UInt128& UInt128::operator>>=(Int32 shift) noexcept
{
auto rshift = shift & 31;
auto lshift = 31 - rshift;
UInt32 parts[8] = {
#if CPU_TYPE == CPU_LE32
mLow >> rshift | mMidLow << lshift << 1,
mMidLow >> rshift | mMidHigh << lshift << 1,
mMidHigh >> rshift | mHigh << lshift << 1,
mHigh >> rshift,
0, 0, 0, 0
#elif CPU_TYPE == CPU_BE32
0, 0, 0, 0,
mHigh >> rshift,
mMidHigh >> rshift | mHigh << lshift << 1,
mMidLow >> rshift | mMidHigh << lshift << 1,
mLow >> rshift | mMidLow << lshift << 1
#endif
};
memcpy(this, &parts[
#if CPU_TYPE == CPU_BE32
4 -
#endif
(shift >> 5 & 3)], 16);
return *this;
}
Upvotes: 1