Reputation: 1331
Using Armadillo
I wrote a matrix-vector multiplication and a linear system solve. Armadillo is compiled from source and uses OpenBLAS
, also compiled from source. Unfortunately I am getting inconsistent results for single-threaded and multi-threaded runs: the matrix-vector multiplication runs faster on a single thread, while the linear system solve runs faster when multi-threading. I was hoping someone could give me some pointers on what I am doing wrong.
See below:
matmul_armadillo.cpp
#include <armadillo>
using namespace arma;
int main(int argc, char *argv[])
{
const int n = atoi(argv[1]);
mat A = randu<mat>(n, n);
vec x = randu<vec>(n);
A*x;
return 0;
}
solve_armadillo.cpp
#include <armadillo>
using namespace arma;
// Benchmark driver: solve the dense linear system A x = b for a random
// n x n matrix A and random right-hand side b. The solution is folded
// into the exit status so the LAPACK solve cannot be optimized away.
int main(int argc, char *argv[])
{
    // The system dimension must be supplied as the sole argument.
    if (argc < 2) {
        return 1;
    }
    const int n = atoi(argv[1]);
    mat A = randu<mat>(n, n);
    vec b = randu<vec>(n);
    vec x;
    x = solve(A, b);
    // Use the result: a random dense system is almost surely well
    // conditioned enough to yield finite entries, so this returns 0.
    return x.is_finite() ? 0 : 1;
}
benchmark.sh
#!/bin/bash
# Build both benchmark programs with full optimization.
g++ matmul_armadillo.cpp -o matmul_armadillo -O3 -march=native -std=c++11 -larmadillo
g++ solve_armadillo.cpp -o solve_armadillo -O3 -march=native -std=c++11 -larmadillo

# Problem size shared by both benchmarks.
N=7500

# Run both benchmarks once per OpenBLAS thread count. This single loop
# replaces eight copy-pasted, manually-edited sections and makes adding
# or removing thread counts a one-line change.
for threads in 1 2 3 4 5 6 7 8; do
    export OPENBLAS_NUM_THREADS=$threads
    echo "Running matmul_armadillo on $OPENBLAS_NUM_THREADS threads"
    time ./matmul_armadillo $N
    echo ''
    echo "Running solve_armadillo on $OPENBLAS_NUM_THREADS threads"
    time ./solve_armadillo $N
    echo ''
done
Results
$ ./benchmark.sh
Running matmul_armadillo on 1 threads
real 0m0.943s
user 0m0.628s
sys 0m0.159s
Running solve_armadillo on 1 threads
real 0m13.910s
user 0m13.553s
sys 0m0.300s
Running matmul_armadillo on 2 threads
real 0m1.528s
user 0m1.361s
sys 0m0.402s
Running solve_armadillo on 2 threads
real 0m15.815s
user 0m29.097s
sys 0m1.083s
Running matmul_armadillo on 3 threads
real 0m1.534s
user 0m1.480s
sys 0m0.533s
Running solve_armadillo on 3 threads
real 0m11.729s
user 0m31.022s
sys 0m1.290s
Running matmul_armadillo on 4 threads
real 0m1.543s
user 0m1.619s
sys 0m0.674s
Running solve_armadillo on 4 threads
real 0m10.013s
user 0m34.055s
sys 0m1.696s
Running matmul_armadillo on 5 threads
real 0m1.545s
user 0m1.620s
sys 0m0.664s
Running solve_armadillo on 5 threads
real 0m9.945s
user 0m33.803s
sys 0m1.669s
Running matmul_armadillo on 6 threads
real 0m1.543s
user 0m1.607s
sys 0m0.684s
Running solve_armadillo on 6 threads
real 0m10.069s
user 0m34.283s
sys 0m1.699s
Running matmul_armadillo on 7 threads
real 0m1.542s
user 0m1.622s
sys 0m0.661s
Running solve_armadillo on 7 threads
real 0m10.041s
user 0m34.154s
sys 0m1.704s
Running matmul_armadillo on 8 threads
real 0m1.546s
user 0m1.576s
sys 0m0.712s
Running solve_armadillo on 8 threads
real 0m10.123s
user 0m34.492s
sys 0m1.697s
System information
Upvotes: 0
Views: 1698
Reputation: 1183
I suspect that
A*x;
may have been optimized away because you do not do anything with the result. Armadillo's delayed-evaluation expression templates can easily mean that the underlying BLAS routine for the multiplication is never called at all. So if you enable threading, you only measure the overhead of setting it up — hence your program executes more quickly with threading disabled.
With
x = solve(A, b);
it is different as that leads pretty directly to the respective Lapack call, which probably cannot be optimized away as the compiler cannot rule out side effects and you actually assign the result to a variable. The solve
call benefits from multiprocessing for such large matrices.
To fix your benchmark you should do two things: actually use the result of the computation (e.g. print it), and repeat the operation enough times that process startup and setup overhead do not dominate the measurement.
Here is an untested example:
#include <iostream>
#include <armadillo>
using namespace arma;
// Benchmark harness: repeats the matrix-vector product many times and
// prints the final vector, so the work cannot be removed by Armadillo's
// delayed evaluation or by the compiler's dead-code elimination.
int main(int argc, char *argv[])
{
// Matrix dimension from the first command-line argument.
// NOTE(review): assumes argc >= 2 — no validation in this example.
const int n = atoi(argv[1]);
mat A = randu<mat>(n, n);
vec x = randu<vec>(n);
// Feed each product back into x: every iteration depends on the
// previous result, so none of the 100 multiplications can be elided.
for (int i = 0; i < 100; ++i) {
x = A*x;
}
// Printing makes the final value observable, ruling out elimination
// of the entire loop.
x.print(std::cout);
return 0;
}
The print
call may not be necessary.
Upvotes: 2