Reputation: 1809
I have a std::vector<std::vector<double>>
that I am trying to convert to a single contiguous vector as fast as possible. My vector has a shape of roughly 4000 x 50
.
The problem is, sometimes I need my output vector in column-major contiguous order (just concatenating the interior vectors of my 2d input vector), and sometimes I need my output vector in row-major contiguous order, effectively requiring a transpose.
I have found that a naive for loop is quite fast for conversion to a column-major vector:
// Flattens a column-major vector-of-vectors into one contiguous buffer in
// column-major order (i.e. the inner vectors are simply concatenated).
// Assumes every inner vector has the same length as vec[0].
// Returns an empty vector for empty input.
auto to_dense_column_major_naive(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    if (vec.empty()) return {};  // guard: vec[0] below would be UB on empty input
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    for (size_t i = 0; i < n_col; ++i)
        for (size_t j = 0; j < n_row; ++j)
            out_vec[i * n_row + j] = vec[i][j];
    return out_vec;
}
But obviously a similar approach is very slow for row-wise conversion, because of all of the cache misses. So for row-wise conversion, I thought a blocking strategy to promote cache locality might be my best bet:
// Flattens a column-major vector-of-vectors into one contiguous buffer in
// ROW-major order (effectively a transpose), using cache blocking: elements
// are copied tile-by-tile so that both the source column and the touched
// destination rows stay cache-resident while a tile is processed.
// Assumes every inner vector has the same length as vec[0].
// Returns an empty vector for empty input.
auto to_dense_row_major_blocking(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    if (vec.empty()) return {};  // guard: vec[0] below would be UB on empty input
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    constexpr size_t block_side = 8;  // 8 doubles = 64 bytes, a typical cache line
    for (size_t l = 0; l < n_col; l += block_side) {
        for (size_t k = 0; k < n_row; k += block_side) {
            // Copy one tile: columns [l, l+block_side) x rows [k, k+block_side),
            // clamped at the matrix edges.
            for (size_t j = l; j < l + block_side && j < n_col; ++j) {
                auto const &column = vec[j];
                for (size_t i = k; i < k + block_side && i < n_row; ++i)
                    out_vec[i * n_col + j] = column[i];
            }
        }
    }
    return out_vec;
}
This is considerably faster than a naive loop for row-major conversion, but still almost an order of magnitude slower than naive column-major looping on my input size.
My question is, is there a faster approach to converting a (column-major) vector of vectors of doubles to a single contiguous row-major vector? I am struggling to reason about what the limit of speed of this code should be, and am thus questioning whether I'm missing something obvious. My assumption was that blocking would give me a much larger speedup than it appears to actually give.
The chart was generated using QuickBench (and somewhat verified with GBench locally on my machine) with this code: (Clang 7, C++20, -O3)
// Column-major flatten: the output is just the inner vectors laid end to end,
// so each source column is copied into its own contiguous slice of out_vec.
auto to_dense_column_major_naive(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    for (size_t col = 0; col < n_col; ++col) {
        auto const &src = vec[col];
        auto const base = col * n_row;  // start of this column's slice
        for (size_t row = 0; row < n_row; ++row)
            out_vec[base + row] = src[row];
    }
    return out_vec;
}
// Row-major flatten (transpose): element (col, row) of the column-major input
// lands at out_vec[row * n_col + col].  Reads are sequential within each inner
// vector; writes are strided by n_col.
auto to_dense_row_major_naive(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    for (size_t col = 0; col < n_col; ++col) {
        auto const &src = vec[col];
        for (size_t row = 0; row < n_row; ++row)
            out_vec[row * n_col + col] = src[row];
    }
    return out_vec;
}
// Cache-blocked transpose into row-major order.  The two outer loops walk the
// matrix tile by tile; the two inner loops copy one tile, clamping at the
// matrix edges so sizes need not be multiples of the tile width.
auto to_dense_row_major_blocking(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    size_t const tile = 8;  // 8 doubles = 64 bytes, a typical cache line
    for (size_t col0 = 0; col0 < n_col; col0 += tile) {
        for (size_t row0 = 0; row0 < n_row; row0 += tile) {
            for (size_t col = col0; col < col0 + tile && col < n_col; ++col) {
                auto const &src = vec[col];
                for (size_t row = row0; row < row0 + tile && row < n_row; ++row)
                    out_vec[row * n_col + col] = src[row];
            }
        }
    }
    return out_vec;
}
// Cache-blocked column-major flatten.  Same tiling structure as the row-major
// blocking version, but writes go to out_vec[col * n_row + row], so the output
// is a straight concatenation of the inner vectors; included so the benchmark
// can compare like-for-like with the row-major blocking variant.
auto to_dense_column_major_blocking(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    size_t const tile = 8;  // 8 doubles = 64 bytes, a typical cache line
    for (size_t col0 = 0; col0 < n_col; col0 += tile) {
        for (size_t row0 = 0; row0 < n_row; row0 += tile) {
            for (size_t col = col0; col < col0 + tile && col < n_col; ++col) {
                auto const &src = vec[col];
                for (size_t row = row0; row < row0 + tile && row < n_row; ++row)
                    out_vec[col * n_row + row] = src[row];
            }
        }
    }
    return out_vec;
}
// Builds the benchmark fixture: 50 inner vectors ("columns") of 4000 doubles,
// filled with uniform random values in [-1000, 1000).  The fixed seed (2019)
// makes the data reproducible across runs.
auto make_vecvec() -> std::vector<std::vector<double>>
{
    std::vector<std::vector<double>> columns(50, std::vector<double>(4000));
    std::mt19937 engine{2019};
    std::uniform_real_distribution<double> sample(-1000, 1000);
    for (size_t c = 0; c < columns.size(); ++c)
        for (size_t r = 0; r < columns[c].size(); ++r)
            columns[c][r] = sample(engine);
    return columns;
}
// google-benchmark fixture: times the naive column-major flatten on the
// 50 x 4000 test data.  Setup (make_vecvec) happens before the timed loop.
static void NaiveColumnMajor(benchmark::State& state) {
// Code before the loop is not measured
auto vecvec = make_vecvec();
for (auto _ : state) {
// DoNotOptimize keeps the returned vector observable so the call is not elided.
benchmark::DoNotOptimize(to_dense_column_major_naive(vecvec));
}
}
BENCHMARK(NaiveColumnMajor);
// google-benchmark fixture: times the naive row-major (transposing) flatten.
static void NaiveRowMajor(benchmark::State& state) {
// Code before the loop is not measured
auto vecvec = make_vecvec();
for (auto _ : state) {
// DoNotOptimize keeps the returned vector observable so the call is not elided.
benchmark::DoNotOptimize(to_dense_row_major_naive(vecvec));
}
}
BENCHMARK(NaiveRowMajor);
// google-benchmark fixture: times the cache-blocked row-major conversion.
static void BlockingRowMajor(benchmark::State& state) {
// Code before the loop is not measured
auto vecvec = make_vecvec();
for (auto _ : state) {
// DoNotOptimize keeps the returned vector observable so the call is not elided.
benchmark::DoNotOptimize(to_dense_row_major_blocking(vecvec));
}
}
BENCHMARK(BlockingRowMajor);
// google-benchmark fixture: times the cache-blocked column-major conversion.
static void BlockingColumnMajor(benchmark::State& state) {
// Code before the loop is not measured
auto vecvec = make_vecvec();
for (auto _ : state) {
// DoNotOptimize keeps the returned vector observable so the call is not elided.
benchmark::DoNotOptimize(to_dense_column_major_blocking(vecvec));
}
}
BENCHMARK(BlockingColumnMajor);
Upvotes: 12
Views: 1036
Reputation: 17023
First of all, I cringe whenever something is qualified as "obviously". That word is often used to cover up a shortcoming in one's deductions.
But obviously a similar approach is very slow for row-wise conversion, because of all of the cache misses.
I'm not sure which is supposed to be obvious: that the row-wise conversion would be slow, or that it's slow because of cache misses. In either case, I find it not obvious. After all, there are two caching considerations here, aren't there? One for reading and one for writing? Let's look at the code from the reading perspective:
row_major_naive
// Excerpt of the naive row-major loop (from the question): the reads vec[i][j]
// walk each inner vector contiguously, but successive writes to out_vec are
// strided by n_col elements.
for (size_t i = 0; i < n_col; ++i)
for (size_t j = 0; j < n_row; ++j)
out_vec[j * n_col + i] = vec[i][j];
Successive reads from vec
are reads of contiguous memory: vec[i][0]
followed by vec[i][1]
, etc. Very good for caching. So... cache misses? Slow? :) Maybe not so obvious.
Still, there is something to be gleaned from this. The claim is only wrong by claiming "obviously". There are non-locality issues, but they occur on the writing end. (Successive writes are offset by the space for 50 double
values.) And empirical testing confirms the slowness. So maybe a solution is to flip on what is considered "obvious"?
row major flipped
// Same assignment with the loops reversed: writes to out_vec are now
// contiguous, while reads hop between the different inner vectors.
for (size_t j = 0; j < n_row; ++j)
for (size_t i = 0; i < n_col; ++i)
out_vec[j * n_col + i] = vec[i][j];
All I did here was reverse the loops. Literally swap the order of those two lines of code then adjust the indentation. Now successive reads are potentially all over the place, as they read from different vectors. However, successive writes are now to contiguous blocks of memory. In one sense, we are in the same situation as before. But just like before, one should measure performance before assuming "fast" or "slow".
NaiveColumnMajor: 3.4 seconds
NaiveRowMajor: 7.7 seconds
FlippedRowMajor: 4.2 seconds
BlockingRowMajor: 4.4 seconds
BlockingColumnMajor: 3.9 seconds
Still slower than the naive column major conversion. However, this approach is not only faster than naive row major, but it's also faster than blocking row major. At least on my computer (using gcc -O3
and obviously :P iterating thousands of times). Mileage may vary. I don't know what the fancy profiling tools would say. The point is that sometimes simpler is better.
For funsies I did a test where the dimensions are swapped (changing from 50 vectors of 4000 elements to 4000 vectors of 50 elements). All methods got hurt this way, but "NaiveRowMajor" took the biggest hit. Worth noting is that "flipped row major" fell behind the blocking version. So, as one might expect, the best tool for the job depends on what exactly the job is.
NaiveColumnMajor: 3.7 seconds
NaiveRowMajor: 16 seconds
FlippedRowMajor: 5.6 seconds
BlockingRowMajor: 4.9 seconds
BlockingColumnMajor: 4.5 seconds
(By the way, I also tried the flipping trick on the blocking version. The change was small -- around 0.2 -- and opposite of flipping the naive version. That is, "flipped blocking" was slower than "blocking" for the question's 50-of-4000 vectors, but faster for my 4000-of-50 variant. Fine tuning might improve the results.)
Update: I did a little more testing with the flipping trick on the blocking version. This version has four loops, so "flipping" is not as straight-forward as when there are only two loops. It looks like swapping the order of the outer two loops is bad for performance, while swapping the inner two loops is good. (Initially, I had done both and gotten mixed results.) When I swapped just the inner loops, I measured 3.8 seconds (and 4.1 seconds in the 4000-of-50 scenario), making this the best row-major option in my tests.
row major hybrid
// Hybrid blocking: the outer loop picks a band of block_side source columns,
// the middle loop sweeps every row, and the inner loop writes one contiguous
// run of the output row using only those (cached) source vectors.
for (size_t l = 0; l < n_col; l += block_side)
for (size_t i = 0; i < n_row; ++i)
for (size_t j = l; j < l + block_side && j < n_col; ++j)
out_vec[i * n_col + j] = vec[j][i];
(After swapping the inner loops, I merged the middle loops.)
As for the theory behind this, I would guess that this amounts to trying to write one cache block at a time. Once a block is written, try to re-use vectors (the vec[j]
) before they get ejected from the cache. After you exhaust those source vectors, move on to a new group of source vectors, again writing full blocks at a time.
Upvotes: 7
Reputation: 1
I have just added parallel versions of the two naive conversion functions
#include <ppl.h>
// Parallel (MSVC PPL) column-major flatten: every output element is
// independent, so the whole flat index range is handed to parallel_for.
// For flat index f, the source element is vec[f / n_row][f % n_row].
auto ppl_to_dense_column_major_naive(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    size_t total = out_vec.size();
    concurrency::parallel_for(size_t(0), total, [&](size_t flat)
    {
        size_t outer = flat / n_row;  // which inner vector
        size_t inner = flat % n_row;  // position within it
        out_vec[flat] = vec[outer][inner];
    });
    return out_vec;
}
// Parallel (MSVC PPL) row-major flatten: every output element is independent,
// so the whole flat index range is handed to parallel_for.  For flat index f
// in the row-major output, the source element is vec[f % n_col][f / n_col].
auto ppl_to_dense_row_major_naive(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    size_t total = out_vec.size();
    concurrency::parallel_for(size_t(0), total, [&](size_t flat)
    {
        size_t outer = flat % n_col;  // which inner vector
        size_t inner = flat / n_col;  // position within it
        out_vec[flat] = vec[outer][inner];
    });
    return out_vec;
}
and additional benchmark codes for all of them
// Times a single invocation of toCall(args...) with a wall-clock timer,
// prints "<strFnName>: <seconds> s" to stdout, and returns the call's result.
// Fix: the template parameter was renamed from `_Fn` to `Fn` — identifiers
// starting with an underscore followed by an uppercase letter are reserved
// for the implementation ([lex.name]).
template< class Fn, class ... Args >
auto callFncWithPerformance( std::string strFnName, Fn toCall, Args&& ...args )
{
    auto start = std::chrono::high_resolution_clock::now();
    auto toRet = toCall( std::forward<Args>(args)... );
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    std::cout << strFnName << ": " << diff.count() << " s" << std::endl;
    return toRet;
}
// Variant of the timing wrapper that derives the printed label from
// typeid(toCall).name() — note this is an implementation-defined (usually
// mangled) name, not the source-level function name.  Prints without a
// trailing newline, matching the original behavior.
// Fix: template parameter renamed from reserved `_Fn` to `Fn`.
template< class Fn, class ... Args >
auto second_callFncWithPerformance(Fn toCall, Args&& ...args)
{
    std::string strFnName(typeid(toCall).name());
    auto start = std::chrono::high_resolution_clock::now();
    auto toRet = toCall(std::forward<Args>(args)...);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    std::cout << strFnName << ": " << diff.count() << " s";
    return toRet;
}
#define MAKEVEC( FN, ... ) callFncWithPerformance( std::string( #FN ) , FN , __VA_ARGS__ )
// Driver: builds the test data once, then runs each conversion exactly once
// through the MAKEVEC timing macro.  The results are kept alive in an array
// so the converted vectors are not discarded before timing completes.
int main()
{
//prepare vector
auto vec = make_vecvec();
std::vector< double > vecs[]
{
std::vector<double>(MAKEVEC(to_dense_column_major_naive, vec)),
std::vector<double>(MAKEVEC(to_dense_row_major_naive, vec)),
std::vector<double>(MAKEVEC(ppl_to_dense_column_major_naive, vec)),
std::vector<double>(MAKEVEC(ppl_to_dense_row_major_naive, vec)),
std::vector<double>(MAKEVEC(to_dense_row_major_blocking, vec)),
std::vector<double>(MAKEVEC(to_dense_column_major_blocking, vec)),
};
//system("pause");
return 0;
}
and here below result of these
Debug x64
to_dense_column_major_naive: 0.166859 s
to_dense_row_major_naive: 0.192488 s
ppl_to_dense_column_major_naive: 0.0557423 s
ppl_to_dense_row_major_naive: 0.0514017 s
to_dense_column_major_blocking: 0.118465 s
to_dense_row_major_blocking: 0.117732 s
Debug x86
to_dense_column_major_naive: 0.15242 s
to_dense_row_major_naive: 0.158746 s
ppl_to_dense_column_major_naive: 0.0534966 s
ppl_to_dense_row_major_naive: 0.0484076 s
to_dense_column_major_blocking: 0.111217 s
to_dense_row_major_blocking: 0.107727 s
Release x64
to_dense_column_major_naive: 0.000874 s
to_dense_row_major_naive: 0.0011973 s
ppl_to_dense_column_major_naive: 0.0054639 s
ppl_to_dense_row_major_naive: 0.0012034 s
to_dense_column_major_blocking: 0.0008023 s
to_dense_row_major_blocking: 0.0010282 s
Release x86
to_dense_column_major_naive: 0.0007156 s
to_dense_row_major_naive: 0.0012538 s
ppl_to_dense_column_major_naive: 0.0053351 s
ppl_to_dense_row_major_naive: 0.0013022 s
to_dense_column_major_blocking: 0.0008761 s
to_dense_row_major_blocking: 0.0012404 s
You are quite right: this data set is too small for parallelization to pay off — the per-element work is too small as well.
Nonetheless, I am posting these functions so that others can use them as a reference.
Upvotes: 0