Reputation: 153
I wanted to write a dynamic array for trivial types (so that I can use memcpy or similar to optimize it), but when I compared its efficiency to std::vector, I found that its push_back function is twice as fast as std::vector's.
That seemed so strange that I read the source code of the MSVC STL to find out why, but in vain.
My code:
#include <cstdlib>      // malloc, realloc, free
#include <cstring>      // memcpy
#include <type_traits>  // is_trivial_v

// ASSERT is an assertion macro defined elsewhere (not shown).
template<typename T>
class Array {
    static_assert(std::is_trivial_v<T>, "Array requires type to be trivial.");
    T* m_p;
    size_t m_cap, m_siz;

    void increase(size_t new_cap) {
        ASSERT(new_cap > m_cap);
        if (new_cap < m_cap + m_cap / 2) new_cap = m_cap + m_cap / 2; // grow by at least 1.5x
        T* new_p = (T*)realloc(m_p, sizeof(T) * new_cap);
        ASSERT(new_p);
        m_p = new_p;
        m_cap = new_cap;
    }
public:
    Array() : m_p(nullptr), m_cap(0), m_siz(0) {}
    ~Array() { if (m_p) free(m_p); }
    Array(const Array& x) {
        m_cap = m_siz = x.m_siz;
        m_p = (T*)malloc(sizeof(T) * m_siz);
        memcpy(m_p, x.m_p, sizeof(T) * m_siz);
    }
    Array(Array&& x) {
        m_cap = x.m_cap;
        m_siz = x.m_siz;
        m_p = x.m_p;
        x.m_p = nullptr;
        x.m_cap = x.m_siz = 0;
    }
    Array& operator=(const Array& x) {
        if (this == &x) return *this;
        free(m_p);
        m_cap = m_siz = x.m_siz;
        m_p = (T*)malloc(sizeof(T) * m_siz);
        memcpy(m_p, x.m_p, sizeof(T) * m_siz);
        return *this;
    }
    Array& operator=(Array&& x) {
        if (this == &x) return *this;
        free(m_p);
        m_cap = x.m_cap;
        m_siz = x.m_siz;
        m_p = x.m_p;
        x.m_p = nullptr;
        x.m_cap = x.m_siz = 0;
        return *this;
    }
    void push_back(const T& x) {
        if (m_siz == m_cap) increase(m_cap + 1);
        m_p[m_siz++] = x;
    }
    void reserve(size_t cap) {
        if (cap > m_cap)
            increase(cap);
    }
    void resize(size_t siz, const T& val = T{}) {
        if (siz > m_siz) {
            if (siz > m_cap) increase(siz);
            for (size_t i = m_siz; i < siz; i++)
                m_p[i] = val;
        }
        m_siz = siz;
    }
    void shrink_to_fit() {
        m_p = (T*)realloc(m_p, m_siz * sizeof(T));
        m_cap = m_siz;
    }
    T& operator[](size_t pos) { ASSERT(pos < m_siz); return m_p[pos]; }
    const T& operator[](size_t pos) const { ASSERT(pos < m_siz); return m_p[pos]; }
    size_t size() const { return m_siz; }
    size_t capacity() const { return m_cap; }
    void clear() { m_siz = 0; }
};
#include <cstdio>   // printf
#include <ctime>    // clock
#include <vector>

#define N 10000
#define M 100000

template<typename T>
int test() {
    int t = clock();
    T v;
    v.reserve(M);
    for (int n = 0; n < N; n++) {
        for (int i = 0; i < M; i++)
            v.push_back(i);
        //v.resize(M);
        //for (int i = 0; i < M; i++)
        //    v[i] = -i;
        v.clear();
    }
    return clock() - t;
}

int main() {
    int t1 = test<std::vector<int>>();
    int t2 = test<Array<int>>();
    printf("vector:%dms.\nArray:%dms.\n", t1, t2);
}
Result (Release build in VS2019):
vector:1043ms.
Array:547ms.
This is the call chain of std::vector::push_back (MSVC):
void push_back(const _Ty& _Val) { // insert by moving into element at end, provide strong guarantee
    emplace_back(_STD move(_Val));
}

template <class... _Valty>
decltype(auto) emplace_back(_Valty&&... _Val) {
    // insert by perfectly forwarding into element at end, provide strong guarantee
    auto& _My_data   = _Mypair._Myval2;
    pointer& _Mylast = _My_data._Mylast;
    if (_Mylast != _My_data._Myend) {
        return _Emplace_back_with_unused_capacity(_STD forward<_Valty>(_Val)...);
    }
    _Ty& _Result = *_Emplace_reallocate(_Mylast, _STD forward<_Valty>(_Val)...);
#if _HAS_CXX17
    return _Result;
#else // ^^^ _HAS_CXX17 ^^^ // vvv !_HAS_CXX17 vvv
    (void) _Result;
#endif // _HAS_CXX17
}

template <class... _Valty>
decltype(auto) _Emplace_back_with_unused_capacity(_Valty&&... _Val) {
    // insert by perfectly forwarding into element at end, provide strong guarantee
    auto& _My_data   = _Mypair._Myval2;
    pointer& _Mylast = _My_data._Mylast;
    _STL_INTERNAL_CHECK(_Mylast != _My_data._Myend); // check that we have unused capacity
    _Alty_traits::construct(_Getal(), _Unfancy(_Mylast), _STD forward<_Valty>(_Val)...);
    _Orphan_range(_Mylast, _Mylast);
    _Ty& _Result = *_Mylast;
    ++_Mylast;
#if _HAS_CXX17
    return _Result;
#else // ^^^ _HAS_CXX17 ^^^ // vvv !_HAS_CXX17 vvv
    (void) _Result;
#endif // _HAS_CXX17
}
After inlining and other optimizations, the STL code doesn't seem to differ much from mine.
Could anyone tell me why my push_back is more efficient (or why the STL's is slower)?
EDIT: Sorry, my question seems to have caused a misunderstanding.
I know that std::vector is general, so it might do more work than mine.
But for a push_back of an int, I don't think std::vector does anything special, so why is it slower?
Upvotes: 0
Views: 764
Reputation: 38260
It is more that MSVC's STL has a suboptimal implementation, or that the Visual Studio compiler's optimizer is less capable here.
See https://godbolt.org/z/rT5Kaao7G for a small test case:
#include <vector>
using namespace std;

void foo(vector<int> &vec, const vector<int> &input) {
    vec.reserve(10000);
    for (int i : input) {
        vec.emplace_back(i);
    }
}
gcc optimizes it better than MSVC. It is so clever that it converts the entire for loop into a memmove.
.L34:
        mov     rsi, r14
        mov     rdi, rax
        call    memmove
        mov     rsi, QWORD PTR [rbp+16]
        sub     rsi, r14
MSVC keeps the emplace_back implementation (with its call to _Emplace_reallocate) inside the loop, which is not something I'd expect from a modern optimizing compiler.
$LL4@foo:
        mov     rdx, QWORD PTR [rdi+8]
        mov     eax, DWORD PTR [rbx]
        mov     DWORD PTR i$1[rsp], eax
        cmp     rdx, QWORD PTR [rdi+16]
        je      SHORT $LN9@foo
        mov     DWORD PTR [rdx], eax
        add     QWORD PTR [rdi+8], 4
        jmp     SHORT $LN2@foo
$LN9@foo:
        lea     r8, QWORD PTR i$1[rsp]
        mov     rcx, rdi
        call    int * std::vector<int,std::allocator<int> >::_Emplace_reallocate<int &>(int * const,int &) ; std::vector<int,std::allocator<int> >::_Emplace_reallocate<int &>
$LN2@foo:
        add     rbx, 4
        cmp     rbx, rsi
        jne     SHORT $LL4@foo
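For comparison, a range insert sidesteps the per-element capacity check entirely. The sketch below is not part of the godbolt test case above (foo_range is a hypothetical name), but for a trivially copyable element type both libstdc++ and MSVC's STL can typically turn it into a single memmove-style copy:
#include <vector>
using namespace std;

// Hypothetical variant of foo(): append the whole range at once instead of
// calling emplace_back per element, so the capacity check is done only once.
void foo_range(vector<int> &vec, const vector<int> &input) {
    vec.insert(vec.end(), input.begin(), input.end());
}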
Upvotes: 0
Reputation: 85541
The difference is mainly due to less favorable codegen in the std::vector version. We can see it if we compare the generated assembly (godbolt link).
Your loop (skipping the reallocation part):
$LL4@test:
        xor     esi, esi
        xor     edi, edi
$LL7@test:
        mov     r15, rbx
        cmp     rdi, rbx
        jne     SHORT $LN27@test
        <...skip...>
$LN27@test:
        mov     DWORD PTR [r14+rdi*4], esi
        inc     rdi
        inc     esi
        cmp     esi, 100000             ; 000186a0H
        jl      $LL7@test
        sub     r12, r13
        jne     $LL4@test
std::vector::push_back loop (again, skipping the reallocation part):
$LL4@test:
        xor     ebx, ebx
        mov     DWORD PTR i$1[rsp], ebx
$LL7@test:
        cmp     rcx, QWORD PTR v$[rsp+16]
        je      SHORT $LN26@test
        mov     DWORD PTR [rcx], ebx
        mov     rcx, QWORD PTR v$[rsp+8]
        add     rcx, 4
        mov     QWORD PTR v$[rsp+8], rcx
        jmp     SHORT $LN5@test
$LN26@test:
        <...skip...>
$LN5@test:
        inc     ebx
        mov     DWORD PTR i$1[rsp], ebx
        cmp     ebx, 100000             ; 000186a0H
        jl      SHORT $LL7@test
        mov     rcx, QWORD PTR v$[rsp]
        mov     QWORD PTR v$[rsp+8], rcx
        sub     rdi, 1
        jne     SHORT $LL4@test
Clearly we can see more code (11 vs. 8 instructions in the hot path) and more indirection to memory (5 vs. 1). So it's no surprise that it's slower.
More generally, more complex code == slower code.
Could the two versions be optimized identically? I see no reason why not. MSVC 19.28 just can't do it.
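As a cross-check, the commented-out lines in the question's benchmark already hint at a variant that avoids the problem: resize up front and assign by index, so the hot loop contains no capacity check on either implementation. A sketch (test_indexed is a hypothetical name that reuses N, M and the containers from the question):
// Sketch: same benchmark shape, but the inner loop is a plain store with no
// per-element capacity check.
template<typename T>
int test_indexed() {
    int t = clock();
    T v;
    for (int n = 0; n < N; n++) {
        v.resize(M);
        for (int i = 0; i < M; i++)
            v[i] = -i;
        v.clear();
    }
    return clock() - t;
}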
Upvotes: 1
Reputation: 238491
It is entirely possible that the measured difference is incidental, due to a different layout in memory. The layout has a huge effect on the performance, and it can be affected by many things besides the implementation of the vector.
I ran your benchmark on another system, and your vector took 31% longer to complete than the standard one. Admittedly, the vector implementation was different (libstdc++). But this doesn't mean that libstdc++'s vector is faster than your vector, which in turn is faster than MSVC's vector; the benchmark simply isn't rigorous enough to determine that with confidence.
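One way to reduce (though not eliminate) such incidental effects is to interleave repeated runs of both tests and keep the best time of each. This is only a sketch of a tightened measurement, not something taken from the benchmark above; it reuses test<>() and Array<> from the question:
#include <algorithm>
#include <climits>
#include <cstdio>
#include <vector>

// Hypothetical replacement main(): alternate the two tests several times and
// report the minimum of each, so heap layout and warm-up effects do not
// systematically favour whichever test happens to run first.
int main() {
    int best_vector = INT_MAX, best_array = INT_MAX;
    for (int run = 0; run < 5; ++run) {
        best_vector = std::min(best_vector, test<std::vector<int>>());
        best_array = std::min(best_array, test<Array<int>>());
    }
    printf("vector:%dms.\nArray:%dms.\n", best_vector, best_array);
}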
Upvotes: 2
Reputation: 63402
Your version is only valid for trivially copyable types.
Because reallocation may involve bytewise copying (regardless of whether it's to expand or to contract), only the objects of TriviallyCopyable types are safe to access in the preserved part of the memory block after a call to realloc.
The ordinary std::vector has much laxer requirements in comparison: merely Destructible in general, and one of CopyInsertable or MoveInsertable for push_back.
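A small illustration of the difference in requirements (a sketch, using std::string merely as a convenient non-trivially-copyable type):
#include <string>
#include <vector>

std::vector<std::string> ok;    // fine: std::string is CopyInsertable/MoveInsertable
// Array<std::string> rejected; // fails the static_assert: std::string is not trivial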
Upvotes: 2