Reputation: 433
I have an __n128 that I want to use as input for the vtbl2_u8 intrinsic, and it doesn't like it. vreinterpret doesn't seem to have a variant that works on __n128 as far as I can tell, and these types seem to be finicky about reinterpret_cast. I'm much more used to SSE2, so... any guidance for an ARM NEON noob?
Edit:
More specifically, could I get an idea as to why:
static __forceinline __n128 byteshuffle(
    _In_ const __n128& x,
    _In_ const __n128& mask)
{
    uint8x8x2_t in =
    {
        x.n128_u64[0],
        x.n128_u64[1]
    };

    __n128 out;
    out.n128_u64[0] = vtbl2_u8(in, mask.n128_u64[0]);
    out.n128_u64[1] = vtbl2_u8(in, mask.n128_u64[1]);
    return out;
}
doesn't compile? Error is 'no suitable constructor exists to convert "const unsigned long long" to "__n64"' on both vtbl lines.
Upvotes: 1
Views: 713
Reputation: 41077
The vreinterpret_X_Y macro is used for taking an existing register and 'casting' the type to some other form to pass along to another intrinsic. For example, this code loads up two 16-bit signed shorts in a single load as a 32-bit unsigned int, but then I have to use vreinterpret_s16_u32 because I don't actually want to treat the data as a uint32x2_t; instead I want an int16x4_t, which is exactly the same size in bytes (i.e. they both map to __n64 values).
// ptr is an input pointer to two int16_t values
uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast<const uint32_t*>(ptr) );
int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) );
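If you need the same trick on a full 128-bit register (which is what __n128 maps to), the q-suffixed forms do the same zero-cost cast. A minimal sketch, not from the code above, with a made-up helper name:

#include <arm_neon.h>

// Hypothetical helper: load eight 16-bit values in one 128-bit load typed as
// four uint32_t lanes, then view the same register as eight signed 16-bit lanes.
inline int16x8_t load8_s16_via_u32(const uint16_t* ptr)
{
    uint32x4_t vU32 = vld1q_u32( reinterpret_cast<const uint32_t*>(ptr) );
    return vreinterpretq_s16_u32( vU32 );
}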
Note: vreinterpret_X_Y does exactly what _mm_castX_Y does for SSE. I.e., nothing. It doesn't emit any code; it just makes the compiler happier about the type change. It's worth noting that Visual Studio's ARM C++ compiler doesn't really do much type checking in this regard, since everything is really treated as a __n64 or __n128 type anyhow. As such, vreinterpret_X_Y is mostly a matter of code portability.
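To make the "it emits nothing" point concrete, here is an illustration-only pair (the SSE line is left as a comment since it targets a different architecture):

#include <arm_neon.h>

// Neither of these produces an instruction; they only change how the compiler
// types the register bits.
inline float32x4_t as_float_lanes(uint32x4_t bits)
{
    return vreinterpretq_f32_u32( bits );        // NEON: uint32x4_t -> float32x4_t
}
// SSE analogue: __m128 f = _mm_castsi128_ps( i );   // __m128i -> __m128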
The table look-up intrinsics, however, are a bit of a special case: you have to load up the uint8x8x2_t type and can't just cast an existing variable to it.
Note: This also applies to the vtbx, vtrn, vzip, vuzp, vld2+, and vst2+ intrinsics.
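Applied to the byteshuffle from your question, that means building the uint8x8x2_t from the two halves of the vector rather than assigning raw 64-bit integers to its __n64 members. A sketch, assuming the Visual Studio arm_neon.h where __n128 converts freely to uint8x16_t:

#include <arm_neon.h>

static __forceinline __n128 byteshuffle(
    _In_ const __n128& x,
    _In_ const __n128& mask)
{
    // Build the two-register table from the halves of x.
    uint8x8x2_t in;
    in.val[0] = vget_low_u8(x);
    in.val[1] = vget_high_u8(x);

    // Look up each half of the mask separately, then recombine.
    uint8x8_t lo = vtbl2_u8(in, vget_low_u8(mask));
    uint8x8_t hi = vtbl2_u8(in, vget_high_u8(mask));
    return vcombine_u8(lo, hi);
}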
For example, in DirectXMath I implemented the ARM-NEON version of the general XMVectorSwizzle using two vtbl2_u8 lookups:
// DirectXMathVector.inl
inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V,
    uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3)
{
    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );

    static const uint32_t ControlElement[ 4 ] =
    {
        0x03020100, // XM_SWIZZLE_X
        0x07060504, // XM_SWIZZLE_Y
        0x0B0A0908, // XM_SWIZZLE_Z
        0x0F0E0D0C, // XM_SWIZZLE_W
    };

    int8x8x2_t tbl;
    tbl.val[0] = vget_low_f32(V);
    tbl.val[1] = vget_high_f32(V);

    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[E0])
                                  | (((uint64_t)ControlElement[E1]) << 32) );
    const uint8x8_t rL = vtbl2_u8( tbl, idx );

    idx = vcreate_u32( ((uint64_t)ControlElement[E2])
                       | (((uint64_t)ControlElement[E3]) << 32) );
    const uint8x8_t rH = vtbl2_u8( tbl, idx );

    return vcombine_f32( rL, rH );
}
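A usage sketch (not part of the library source): with the runtime-index form above, even a simple reversal goes through the full table lookup.

#include <DirectXMath.h>
using namespace DirectX;

inline XMVECTOR ReverseComponents(FXMVECTOR v)
{
    // (x, y, z, w) -> (w, z, y, x) via the general vtbl2_u8 path.
    return XMVectorSwizzle(v, 3, 2, 1, 0);
}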
Similarly, I used vtbl4_u8 for XMVectorPermute.
Note that while vtbl is pretty powerful, it's a bit complicated to use. For 'common' swizzle patterns, I implemented a template form of both XMVectorSwizzle and XMVectorPermute so I could specialize cases that didn't require the full table lookup:
// General swizzle template
template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
inline XMVECTOR XMVectorSwizzle(FXMVECTOR V)
{
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

    return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW );
}
// Specialized swizzles
template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V)
{ return V; }
template<> inline XMVECTOR XMVectorSwizzle<0,0,0,0>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_low_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<1,1,1,1>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_low_f32(V), 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,2,2,2>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_high_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<3,3,3,3>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_high_f32(V), 1); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,3,2>(FXMVECTOR V)
{ return vrev64q_f32(V); }
template<> inline XMVECTOR XMVectorSwizzle<0,1,0,1>(FXMVECTOR V)
{ float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,2,3>(FXMVECTOR V)
{ float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,1,0>(FXMVECTOR V)
{ float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,3,2>(FXMVECTOR V)
{ float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<0,1,3,2>(FXMVECTOR V)
{ return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,2,3>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,1,0>(FXMVECTOR V)
{ return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,0,1>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,1,0>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<0,0,2,2>(FXMVECTOR V)
{ return vtrnq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,1,3,3>(FXMVECTOR V)
{ return vtrnq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,0,1,1>(FXMVECTOR V)
{ return vzipq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<2,2,3,3>(FXMVECTOR V)
{ return vzipq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,2,0,2>(FXMVECTOR V)
{ return vuzpq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,3,1,3>(FXMVECTOR V)
{ return vuzpq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<1,2,3,0>(FXMVECTOR V)
{ return vextq_f32(V, V, 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,0,1>(FXMVECTOR V)
{ return vextq_f32(V, V, 2); }
template<> inline XMVECTOR XMVectorSwizzle<3,0,1,2>(FXMVECTOR V)
{ return vextq_f32(V, V, 3); }
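A usage sketch (again, not part of the library source): using the specialized template lets the compiler emit the cheap instruction instead of the table lookup.

#include <DirectXMath.h>
using namespace DirectX;

inline XMVECTOR RotateLeftOne(FXMVECTOR v)
{
    // <1,2,3,0> hits the vextq_f32 specialization above rather than vtbl2_u8.
    return XMVectorSwizzle<1, 2, 3, 0>(v);
}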
Upvotes: 1