Reputation: 3761
I want to copy a vector while setting an elements property to zero. I have a std::vector<PLY>
vector that holds a specific number of the following struct elements:
struct PLY{
float x;
float y;
float z;
}
What is the fastest way to create a copy of this vector where each PLY element has a z-value
of 0
? Is there a faster way then creating a copy of the vector and then iterating over each element to set the new z-value?
Upvotes: 6
Views: 5003
Reputation: 15837
There are two problems with the vector with the default allocator:
To get rid of this as discussed here a custom allocator can be used to refuse to do any initialization. When a vector is created with the desired size memcpy
or for loop can be used to copy the data:
#include <vector>
#include <cstring>
template <class T>
class no_init_alloc
: public std::allocator<T>
{
public:
using std::allocator<T>::allocator;
template <class U, class... Args> void construct(U*, Args&&...) {}
};
struct PLY
{
float x, y , z;
};
int main()
{
std::vector<PLY> source(1000000);
//create a vector with the custom allocator refusing any initialization
std::vector<PLY, no_init_alloc<PLY>> target(source.size());
//then we can use memcpy approach
{
memcpy(target.data(), source.data(), source.size() * sizeof(source.front()));
for(auto& t : target) t.z = 0.0f;
}
// or simple for loop approach
{
size_t sz = target.size();
for(size_t i = 0; i < sz; ++i) {
target[i].x = source[i].x;
target[i].y = source[i].y;
target[i].z = 0.0f;
}
}
}
using @Richard Hodges 's benchmark with -O2 optimization the result is:
CLNAG:
testing transform
sample size : 1000000
time taken : 8.363995ms
time per sample : 8.363995ns
samples per second : 119560090.602637
testing assign through custom iterator
sample size : 1000000
time taken : 7.162974ms
time per sample : 7.162974ns
samples per second : 139606816.945029
testing no_init_alloc_memcpy
sample size : 1000000
time taken : 6.918533ms
time per sample : 6.918533ns
samples per second : 144539312.018892
testing no_init_alloc_for
sample size : 1000000
time taken : 6.383721ms
time per sample : 6.383721ns
samples per second : 156648450.018414
GCC:
testing transform
sample size : 1000000
time taken : 12.083038ms
time per sample : 12.083038ns
samples per second : 82760643.473934
testing assign through custom iterator
sample size : 1000000
time taken : 6.188324ms
time per sample : 6.188324ns
samples per second : 161594641.780230
testing no_init_alloc_memcpy
sample size : 1000000
time taken : 3.000699ms
time per sample : 3.000699ns
samples per second : 333255684.758785
testing no_init_alloc_for
sample size : 1000000
time taken : 1.979482ms
time per sample : 1.979482ns
samples per second : 505182669.001284
final answer:
Use a custom non initializing allocator with simple for loop.
Upvotes: 3
Reputation: 69864
what is the fastest way to... ?
Test it. Memory architectures do surprising things.
#include <iostream>
#include <chrono>
#include <vector>
#include <iomanip>
#include <algorithm>
struct PLY
{
PLY() : x(0), y(0), z(0) {}
PLY(float x, float y, float z) : x(x), y(y), z(z) {}
float x, y , z;
};
template<class F>
std::vector<PLY> test(const char* name, std::vector<PLY> samples, F f)
{
using namespace std::literals;
std::vector<PLY> result;
result.reserve(samples.size());
auto start = std::chrono::high_resolution_clock::now();
f(result, samples);
auto end = std::chrono::high_resolution_clock::now();
using fns = std::chrono::duration<long double, std::chrono::nanoseconds::period>;
using fms = std::chrono::duration<long double, std::chrono::milliseconds::period>;
using fs = std::chrono::duration<long double, std::chrono::seconds::period>;
auto interval = fns(end - start);
auto time_per_sample = interval / samples.size();
auto samples_per_second = 1s / time_per_sample;
std::cout << "testing " << name << '\n';
std::cout << " sample size : " << samples.size() << '\n';
std::cout << " time taken : " << std::fixed << fms(interval).count() << "ms\n";
std::cout << " time per sample : " << std::fixed << (interval / samples.size()).count() << "ns\n";
std::cout << " samples per second : " << std::fixed << samples_per_second << "\n";
return result;
}
struct zero_z_iterator : std::vector<PLY>::const_iterator
{
using base_class = std::vector<PLY>::const_iterator;
using value_type = PLY;
using base_class::base_class;
value_type operator*() const {
auto const& src = base_class::operator*();
return PLY{ src.x, src.y, 0.0 };
}
};
int main()
{
test("transform", std::vector<PLY>(1000000), [](auto& target, auto& source)
{
std::transform(source.begin(), source.end(),
std::back_inserter(target),
[](auto& ply) {
return PLY { ply.x, ply.y, ply.z };
});
});
test("copy and reset z", std::vector<PLY>(1000000), [](auto& target, auto& source)
{
std::copy(source.begin(), source.end(),
std::back_inserter(target));
for (auto& x : target)
{
x.z = 0;
}
});
test("hand_roll", std::vector<PLY>(1000000), [](auto& target, auto& source)
{
for(auto& x : source) {
target.emplace_back(x.x, x.y, 0.0);
}
});
test("assign through custom iterator", std::vector<PLY>(1000000), [](auto& target, auto& source)
{
target.assign(zero_z_iterator(source.begin()),
zero_z_iterator(source.end()));
});
test("transform", std::vector<PLY>(100000000), [](auto& target, auto& source)
{
std::transform(source.begin(), source.end(),
std::back_inserter(target),
[](auto& ply) {
return PLY { ply.x, ply.y, ply.z };
});
});
test("copy and reset z", std::vector<PLY>(100000000), [](auto& target, auto& source)
{
std::copy(source.begin(), source.end(),
std::back_inserter(target));
for (auto& x : target)
{
x.z = 0;
}
});
test("hand_roll", std::vector<PLY>(100000000), [](auto& target, auto& source)
{
for(auto& x : source) {
target.emplace_back(x.x, x.y, 0.0);
}
});
test("assign through custom iterator", std::vector<PLY>(100000000), [](auto& target, auto& source)
{
target.assign(zero_z_iterator(source.begin()),
zero_z_iterator(source.end()));
});
}
testing transform
sample size : 1000000
time taken : 7.495685ms
time per sample : 7.495685ns
samples per second : 133410088.604310
testing copy and reset z
sample size : 1000000
time taken : 3.436614ms
time per sample : 3.436614ns
samples per second : 290984090.735823
testing hand_roll
sample size : 1000000
time taken : 3.289287ms
time per sample : 3.289287ns
samples per second : 304017253.587176
testing assign through custom iterator
sample size : 1000000
time taken : 2.563334ms
time per sample : 2.563334ns
samples per second : 390116933.649692
testing transform
sample size : 100000000
time taken : 768.941767ms
time per sample : 7.689418ns
samples per second : 130048859.733744
testing copy and reset z
sample size : 100000000
time taken : 880.893920ms
time per sample : 8.808939ns
samples per second : 113521046.892911
testing hand_roll
sample size : 100000000
time taken : 769.276240ms
time per sample : 7.692762ns
samples per second : 129992315.894223
testing assign through custom iterator
sample size : 100000000
time taken : 689.493098ms
time per sample : 6.894931ns
samples per second : 145034084.155546
Assign through a custom transform iterator.
template<class Container, class Iter, class TransformFunction>
void assign_transform(Container& target, Iter first, Iter last, TransformFunction func)
{
struct transform_iterator : Iter
{
using base_class = Iter;
using value_type = typename Iter::value_type;
transform_iterator(Iter base, TransformFunction& f)
: base_class(base), func(std::addressof(f))
{}
value_type operator*() const {
auto const& src = base_class::operator*();
return (*func)(src);
}
TransformFunction* func;
};
target.assign(transform_iterator(first, func),
transform_iterator(last, func));
}
use like so:
assign_transform(target, source.begin(), source.end(),
[](auto& from)
{
return PLY(from.x, from.y, 0.0);
});
Upvotes: 8
Reputation: 65610
You could use std::transform
:
std::vector<PLY> zeroed{};
zeroed.reserve(other_vec.size()); //pre-allocate the storage
std::transform(other_vec.begin(), other_vec.end(), std::back_inserter(zeroed),
[](auto e){ e.z = 0.f; return e; });
Upvotes: 8
Reputation: 182763
If there is, your compiler will probably find it. Write the code as simply and clearly as you possibly can. That will give the compiler the best chance to optimize the copy and the loop together, if that makes sense on your platform.
Upvotes: 3
Reputation: 31467
Sounds like a job for std::transform with a little lambda to do the transformation on each element.
Upvotes: 2