Reputation: 581
I don't really know whether the origin of the problem actually has anything to do with POD types, but at least that's where I can see the difference. What I would like to achieve is that my template class Pod
behaves like a simple built-in type (e.g. float) in regard to performance.
So what I currently have is a templated class with a tag and value type:
// Strong-typedef ("tagged") wrapper: pairs a value of type TValue with a
// phantom tag type TTag, so semantically different quantities become
// distinct, non-interchangeable types while keeping the cost model of the
// underlying value type.
template<typename TTag, typename TValue>
class Pod
{
public:
    // Value-initializes m_value via the member initializer below.
    Pod() = default;
    // Implicit on purpose: allows a raw TValue (e.g. an arithmetic result)
    // to convert into the tagged type. Fixed: removed the stray ';' after
    // the body and marked it noexcept, consistent with the other members.
    Pod( TValue value ) noexcept : m_value{ value } {}
    // Mutable access to the wrapped value.
    TValue& value() noexcept
    {
        return m_value;
    }
    // Read-only access to the wrapped value.
    TValue const& value() const noexcept
    {
        return m_value;
    }
    // Compound addition of two identically-tagged Pods.
    Pod& operator+=( Pod const& src ) noexcept
    {
        m_value += src.m_value;
        return *this;
    }
    // Compound scaling by a raw TValue (tagged quantity times scalar).
    Pod& operator*=( TValue const& src ) noexcept
    {
        m_value *= src;
        return *this;
    }
private:
    TValue m_value{};  // value-initialized (zero for arithmetic types)
};

// Non-member addition: lhs is taken by value so the compound += can work
// on the copy, which is then returned (NRVO/implicit move applies).
template<typename TTag, typename TValue>
Pod<TTag, TValue>
operator+( Pod<TTag, TValue> lhs, Pod<TTag, TValue> const& rhs ) noexcept
{
    lhs += rhs;
    return lhs;
}

// Non-member scaling by a raw TValue; same copy-and-modify pattern.
template<typename TTag, typename TValue>
Pod<TTag, TValue> operator*( Pod<TTag, TValue> lhs, TValue const& rhs ) noexcept
{
    lhs *= rhs;
    return lhs;
}
Now let's take a simple test operation like: (A)
// Variant (A): element-wise addition over vectors of the tagged wrapper
// Pod<A_tag, float>.
std::vector<Pod<A_tag, float>> lhs( size );
std::vector<Pod<A_tag, float>> rhs( size );
std::vector<Pod<A_tag, float>> res( size );
for ( std::size_t i = 0; i < size; ++i )
{
res[ i ] = lhs[ i ] + rhs[ i ];
}
and compare it to: (B)
// Variant (B): the plain-float baseline — same loop over raw floats.
std::vector<float> lhs( size );
std::vector<float> rhs( size );
std::vector<float> res( size );
for ( std::size_t i = 0; i < size; ++i )
{
res[ i ] = lhs[ i ] + rhs[ i ];
}
What I noticed with my google-benchmark
test cases is that (B) is around 3 times faster than (A).
If I change (A) to: (C)
// Variant (C): reads/writes the floats through raw float* obtained from
// the first element's value().
// WARNING: the accepted answer below points out this is technically UB —
// a strict-aliasing/pointer-arithmetic violation for any i >= 1, and it
// also assumes Pod has no trailing padding.
std::vector<Pod<A_tag, float>> lhs( size );
std::vector<Pod<A_tag, float>> rhs( size );
std::vector<Pod<A_tag, float>> res( size );
auto plhs = &( lhs[ 0 ].value() );
auto prhs = &( rhs[ 0 ].value() );
for ( std::size_t i = 0; i < size; ++i )
{
res[ i ].value() = plhs[ i ] + prhs[ i ];
}
I can achieve similar performance for (C) as for (B).
There are now two questions that came to my mind:
1. Is variant (C) legal according to the standard, or is it actually UB or something similar?
2. Is there any way to define the template class Pod so that it behaves similarly to the underlying value type in regard to performance?
I can use C++17, if that helps. I tested everything with Visual Studio 2019, Release mode, /O2.
Thanks for help!
Upvotes: 3
Views: 96
Reputation: 85462
Is variant (C) legal regarding standard or is it actually UB or something similar.
Variant (C) is technically UB — a strict aliasing violation for any i >= 1. You're iterating over a pointer to a float
residing inside a Pod
as if it were in an array by itself. But Pod<...>
and float
are incompatible types.
In addition it's not portable - it may be OK in VC++, but technically there can be padding at the end of Pod
, so it's not guaranteed that the float
s will be arranged with no gaps between them.
Is there any way to define the template class
Pod
that it behaves similar to the underlying value type in regards of performance.
Yes, you can disable auto-vectorization in the first version, then all versions will perform similarly :)
#pragma loop( no_vector )
Basically MSVC is able to auto-vectorize only very simple loops currently. We can see here that it auto-vectorized the first version (if we can call it that):
$LL25@Baseline:
movss xmm0, DWORD PTR [rax+r9*4]
addss xmm0, DWORD PTR [r10+r9*4]
movss DWORD PTR [rdx+r9*4], xmm0
movss xmm1, DWORD PTR [r10+r9*4+4]
addss xmm1, DWORD PTR [rax+r9*4+4]
movss DWORD PTR [rdx+r9*4+4], xmm1
movss xmm0, DWORD PTR [rax+r9*4+8]
addss xmm0, DWORD PTR [r10+r9*4+8]
movss DWORD PTR [rdx+r9*4+8], xmm0
movss xmm1, DWORD PTR [rax+r9*4+12]
addss xmm1, DWORD PTR [r10+r9*4+12]
movss DWORD PTR [rdx+r9*4+12], xmm1
add r9, 4
cmp r9, 9997 ; 0000270dH
jb SHORT $LL25@Baseline
cmp r9, 10000 ; 00002710H
jae $LN23@Baseline
But it just goes nuts on the Pod
version, unfortunately:
$LL4@PodVec:
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax-12]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax-12]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax-12], xmm0
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax-8]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax-8]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax-8], xmm0
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax-4]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax-4]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax-4], xmm0
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax], xmm0
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax+4]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax+4]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax+4], xmm0
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax+8]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax+8]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax+8], xmm0
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax+12]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax+12]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax+12], xmm0
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax+16]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax+16]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax+16], xmm0
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax+20]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax+20]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax+20], xmm0
mov rax, QWORD PTR [rdx]
movss xmm0, DWORD PTR [r9+rax+24]
mov rax, QWORD PTR [rcx]
addss xmm0, DWORD PTR [r9+rax+24]
mov rax, QWORD PTR [r8]
movss DWORD PTR [r9+rax+24], xmm0
add r9, 40 ; 00000028H
cmp r9, 40012 ; 00009c4cH
jb $LL4@PodVec
I tried to help it out in various legal ways by simplifying the code to make it maximally easy to optimize:
#include <vector>
#include <benchmark/benchmark.h>
// Minimal reduction of the question's Pod wrapper used for the benchmarks
// below. NOTE: the exact syntactic form of this class and the loops below
// is the subject of the measurement — do not "clean it up", or the quoted
// assembly/benchmark numbers no longer correspond.
template<typename TValue>
struct Pod {
public:
// m_value is still value-initialized via the member initializer below.
Pod() noexcept {}
// Implicit by design: lets the TValue result of operator+ below convert
// back into a Pod on return.
Pod(TValue value) noexcept : m_value{ value } {}
// By-value parameter and by-value return: the simplest possible shape
// for the optimizer to see through.
Pod operator+ (Pod src) const noexcept {
return m_value + src.m_value;
}
private:
TValue m_value{};
};
// Number of elements processed by each benchmarked loop.
constexpr std::size_t size = 10000;
// Reference loop over plain floats — the version MSVC auto-vectorizes
// (see the first assembly dump above).
inline void Baseline(std::vector<float>& lhs, std::vector<float>& rhs, std::vector<float>& res) {
for (std::size_t i = 0; i < size; ++i)
{
res[i] = lhs[i] + rhs[i];
}
}
// Identical loop, but over vector<Pod<float>> — the version MSVC fails to
// auto-vectorize (see the second assembly dump above).
inline void PodVec(std::vector<Pod<float>>& lhs, std::vector<Pod<float>>& rhs, std::vector<Pod<float>>& res) {
for (std::size_t i = 0; i < size; ++i)
{
res[i] = lhs[i] + rhs[i];
}
}
// Attempt to help the vectorizer: hand the loop raw Pod* pointers to take
// the vector indexing machinery out of the picture.
inline void PodPtr(Pod<float>* lhs, Pod<float>* rhs, Pod<float>* res) {
for (std::size_t i = 0; i < size; ++i)
{
res[i] = lhs[i] + rhs[i];
}
}
// Attempt to help the vectorizer: iterator-based traversal instead of
// indexing.
using Iter = std::vector<Pod<float>>::iterator;
inline void PodIter(Iter lhs, Iter rhs, Iter res, Iter endres) {
for (; res != endres; lhs++, rhs++, res++)
{
*res = *lhs + *rhs;
}
}
// google-benchmark harness for the plain-float loop.
void BaselineTest(benchmark::State& state) {
std::vector<float> lhs(size), rhs(size), res(size);
for (auto _ : state) {
Baseline(lhs, rhs, res);
// Prevents the optimizer from discarding res (and the loop) as dead code.
benchmark::DoNotOptimize(res);
}
}
BENCHMARK(BaselineTest);
// google-benchmark harness for the vector<Pod<float>> loop.
void PodVecTest(benchmark::State& state) {
std::vector<Pod<float>> lhs(size), rhs(size), res(size);
for (auto _ : state) {
PodVec(lhs, rhs, res);
// Prevents the optimizer from discarding res (and the loop) as dead code.
benchmark::DoNotOptimize(res);
}
}
BENCHMARK(PodVecTest);
// google-benchmark harness for the raw-pointer Pod loop.
void PodPtrTest(benchmark::State& state) {
std::vector<Pod<float>> lhs(size), rhs(size), res(size);
for (auto _ : state) {
PodPtr(&lhs[0], &rhs[0], &res[0]);
// Prevents the optimizer from discarding res (and the loop) as dead code.
benchmark::DoNotOptimize(res);
}
}
BENCHMARK(PodPtrTest);
// google-benchmark harness for the iterator-based Pod loop.
void PodIterTest(benchmark::State& state) {
std::vector<Pod<float>> lhs(size), rhs(size), res(size);
for (auto _ : state) {
PodIter(begin(lhs), begin(rhs), begin(res), end(res));
// Prevents the optimizer from discarding res (and the loop) as dead code.
benchmark::DoNotOptimize(res);
}
}
BENCHMARK(PodIterTest);
But no luck, unfortunately (curiously though, every version performs differently):
-------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------
BaselineTest 1824 ns 1842 ns 373333
PodVecTest 6166 ns 6278 ns 112000
PodPtrTest 5185 ns 5162 ns 112000
PodIterTest 4663 ns 4604 ns 149333
At the same time, GCC 10 has no problem with either version:
----------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------
BaselineTest 1469 ns 1469 ns 422409
PodVecTest 1464 ns 1464 ns 484971
PodPtrTest 1424 ns 1424 ns 491200
PodIterTest 1420 ns 1420 ns 495973
And this is what the GCC loop assembly looks like:
.L6:
movss xmm0, DWORD PTR [rcx+rax*4]
addss xmm0, DWORD PTR [rsi+rax*4]
movss DWORD PTR [rdx+rax*4], xmm0
add rax, 1
cmp rax, 10000
jne .L6
Interestingly, Clang 11 is having a bit of difficulty with the vector<Pod>
version, too (link), which was unexpected.
The moral of the story: there are no zero-cost abstractions. In this example if you want to benefit from vectorization, I see no way other than to use "raw" vector<float>
s (or switch to GCC).
Upvotes: 1