Reputation: 209
My objective is to investigate the CPU time discrepancies I observe between static and dynamic allocation, depending whether memory is accessed contiguously or not.
In order to make this investigation as sound as possible, I led it with both C++ and Fortran programs. Those are as simple as possible, the core part consists in computing one matrix multiplication from two randomly filled ones. Here is the C++ code:
#include <iostream>
#include <iomanip>
#include <sstream>
#include <random>
#include <string>
#include <chrono>
#include <ctime>
using namespace std;
#ifdef ALLOCATION_DYNAMIC
//
// Use a home made matrix class when dynamically allocating.
//
class matrix
{
private:
int n_;
int m_;
double *data_;
public:
matrix();
~matrix();
double* operator[](int i);
void resize(int n, int m);
double& operator()(int i, int j);
const double& operator()(int i, int j) const;
};
matrix::matrix() : n_(0), m_(0), data_(NULL)
{
return;
}
matrix::~matrix()
{
if (data_) delete[] data_;
return;
}
void matrix::resize(int n, int m)
{
if (data_) delete[] data_;
n_ = n;
m_ = m;
data_ = new double[n_ * m_];
}
inline double& matrix::operator()(int i, int j)
{
return *(data_ + i * m_ + j);
}
inline const double& matrix::operator()(int i, int j) const
{
return *(data_ + i * m_ + j);
}
#endif
// Record the optimization flag we were compiled with.
string optflag = OPTFLAG;
//
// Main program.
//
int main(int argc, char *argv[])
{
cout << "optflag " << optflag;
#ifdef ALLOCATION_DYNAMIC
int n = N;
matrix cc1;
matrix cc2;
matrix cc3;
#else
const int n = N;
// It is necessary to specify the static keyword
// because the default is "automatic", so that
// data is entirely put on the stack which quickly
// get overflowed with greater N values.
static double cc1[N][N];
static double cc2[N][N];
static double cc3[N][N];
#endif
cout << " allocation ";
#ifdef ALLOCATION_DYNAMIC
cout << "dynamic";
if (argc > 1)
{
istringstream iss(argv[1]);
iss >> n;
}
cc1.resize(n, n);
cc2.resize(n, n);
cc3.resize(n, n);
#else
cout << "static";
#endif
cout << " N " << n << flush;
// Init.
string seed = SEED;
std::seed_seq seed_sequence (seed.begin(), seed.end());
// Standard, 64 bit based, Mersenne Twister random engine.
std::mt19937_64 generator (seed_sequence);
// Random number between [0, 1].
std::uniform_real_distribution<double> random_unity(double(0), double(1));
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
{
#ifdef ALLOCATION_DYNAMIC
cc1(i, j) = random_unity(generator);
cc2(i, j) = random_unity(generator);
cc3(i, j) = double(0);
#else
cc1[i][j] = random_unity(generator);
cc2[i][j] = random_unity(generator);
cc3[i][j] = double(0);
#endif
}
clock_t cpu_begin = clock();
auto wall_begin = std::chrono::high_resolution_clock::now();
cout << " transpose ";
#ifdef TRANSPOSE
cout << "yes";
// Transpose.
for (int i = 0; i < n; ++i)
for (int j = 0; j < i; ++j)
{
#ifdef ALLOCATION_DYNAMIC
double tmp = cc2(i, j);
cc2(i, j) = cc2(j, i);
cc2(j, i) = tmp;
#else
double tmp = cc2[i][j];
cc2[i][j] = cc2[j][i];
cc2[j][i] = tmp;
#endif
}
#else
cout << "no";
#endif
cout << flush;
// Work.
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
for (int k = 0; k < n; ++k)
{
#if defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
cc3(i, j) += cc1(i, k) * cc2(j, k);
#elif defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
cc3(i, j) += cc1(i, k) * cc2(k, j);
#elif ! defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
cc3[i][j] += cc1[i][k] * cc2[j][k];
#elif ! defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
cc3[i][j] += cc1[i][k] * cc2[k][j];
#else
#error("Wrong preprocess instructions.");
#endif
}
clock_t cpu_end = clock();
auto wall_end = std::chrono::high_resolution_clock::now();
double sum(0);
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
{
#ifdef ALLOCATION_DYNAMIC
sum += cc3(i, j);
#else
sum += cc3[i][j];
#endif
}
sum /= double(n * n);
cout << " cpu " << setprecision(16) << double(cpu_end - cpu_begin) / double(CLOCKS_PER_SEC)
<< " wall " << setprecision(16) << std::chrono::duration<double>(wall_end - wall_begin).count()
<< " sum " << setprecision(16) << sum << endl;
return 0;
}
Here is the Fortran code :
program Test
#ifdef ALLOCATION_DYNAMIC
integer :: n = N
double precision, dimension(:,:), allocatable :: cc1
double precision, dimension(:,:), allocatable :: cc2
double precision, dimension(:,:), allocatable :: cc3
#else
integer, parameter :: n = N
double precision, dimension(n,n) :: cc1
double precision, dimension(n,n) :: cc2
double precision, dimension(n,n) :: cc3
#endif
character(len = 5) :: optflag = OPTFLAG
character(len = 8) :: time = SEED
#ifdef ALLOCATION_DYNAMIC
character(len = 10) :: arg
#endif
#ifdef TRANSPOSE
double precision :: tmp
#endif
double precision :: sum
double precision :: cpu_start, cpu_end, wall_start, wall_end
integer :: clock_reading, clock_rate, clock_max
integer :: i, j, k, s
double precision, dimension(2) :: harvest
integer, dimension(:), allocatable :: seed
write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
if (iargc().gt.0) then
call getarg(1, arg)
read(arg, '(I8)') n
end if
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n
#ifdef ALLOCATION_DYNAMIC
allocate(cc1(n, n))
allocate(cc2(n, n))
allocate(cc3(n, n))
#endif
! Init.
call random_seed(size = s)
allocate(seed(s))
seed = 0
read(time(1:2), '(I2)') seed(1)
read(time(4:5), '(I2)') seed(2)
read(time(7:8), '(I2)') seed(3)
call random_seed(put = seed)
deallocate(seed)
do i = 1, n
do j = 1, n
call random_number(harvest)
cc1(i, j) = harvest(1)
cc2(i, j) = harvest(2)
cc3(i, j) = dble(0)
enddo
enddo
write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
#ifdef TRANSPOSE
write(6, FMT = '(A)', ADVANCE = 'NO') "yes"
! Transpose.
do j = 1, n
do i = 1, j - 1
tmp = cc1(i, j)
cc1(i, j) = cc1(j, i)
cc1(j, i) = tmp
enddo
enddo
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "no"
#endif
call cpu_time(cpu_start)
call system_clock (clock_reading, clock_rate, clock_max)
wall_start = dble(clock_reading) / dble(clock_rate)
! Work.
do j = 1, n
do i = 1, n
do k = 1, n
#ifdef TRANSPOSE
cc3(i, j) = cc3(i, j) + cc1(k, i) * cc2(k, j)
#else
cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
#endif
enddo
enddo
enddo
sum = dble(0)
do j = 1, n
do i = 1, n
sum = sum + cc3(i, j)
enddo
enddo
sum = sum / (n * n)
call cpu_time(cpu_end)
call system_clock (clock_reading, clock_rate, clock_max)
wall_end = dble(clock_reading) / dble(clock_rate)
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
write(6, FMT = '(A,F23.16)') " sum ", sum
#ifdef ALLOCATION_DYNAMIC
deallocate(cc1)
deallocate(cc2)
deallocate(cc3)
#endif
end program Test
I tried to make both programs as similar as possible, taking into account that C/C++ is row order major whereas Fortran is column order major.
Whenever possible, matrices are read contiguously, the exception is the matrix multiplication because when performing C = A x B, A is usually read by row whereas B is read by column.
Both programs can be compiled either by letting one of the matrix, A or B depending on the language, being accessed no sequentially, or by transposing matrix A or B so that it is then read contiguously during the matrix multiplication, which is achieved by passing the TRANSPOSE preprocess instruction.
The following lines give all the details for the compilation process ((GCC) 4.8.1 ) :
gfortran -o f90-dynamic -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" -DALLOCATION_DYNAMIC src/test.f90
gfortran -o f90-static -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" src/test.f90
g++ -o cpp-dynamic -Wall -pedantic -std=gnu++0x -O3 -DALLOCATION_DYNAMIC -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp
g++ -o cpp-static -Wall -pedantic -std=gnu++0x -O3 -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp
These four lines produce four programs in which A or B matrices are initially transposed.The N preprocess instruction initializes the default matrix size which has to be known at compile time when using static fields. That is to note all programs are compiled with the highest optimization degree (O3) I know so far.
I ran all generated programs for various matrix sizes, from 1000 to 5000. Results are plotted in the following figures, one for each case (transpose or not) :
The host system is
Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz
and the stack size is (ulimit -s) 10240.
For each point, I ran several times the same program until the standard deviation of CPU time becomes negligible compared to its average. Squares and circles stand respectively for Fortran and C++, red and green for dynamic or static.
In the transpose test, computation times are very close, especially the main difference comes from the language (Fortran vs. C++), the dynamic vs. static allocation makes nearly no difference. However, static allocation seems faster, especially for C++.
In the no-transpose test, computation times are significantly greater, which was expected as it is slower to access memory not sequentially, but the CPU time behaves differently than before:
I would like to understand what happens in the no-transpose test:
Your help would be greatly appreciated as the team in which I work faces the same problem : a significant CPU time increase when they switched from static to dynamic allocation in a (much bigger) Fortran program.
Upvotes: 3
Views: 348
Reputation: 209
Clearly, the CPU time discrepancy between static and dynamic allocation in the notranspose test is due to
do j = 1, n
do i = 1, n
do k = 1, n
cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
enddo
enddo
enddo
for the Fortran 90 program and to
// Work.
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
for (int k = 0; k < n; ++k)
{
#if defined(ALLOCATION_DYNAMIC)
cc3(i, j) += cc1(i, k) * cc2(k, j);
#else
cc3[i][j] += cc1[i][k] * cc2[k][j];
#endif
}
for the C++ one.
The compiler is able to make deeper optimizations (-fopt-info-optimize
)
when using static allocation, in this case it outputs (both F90/C++):
Vectorizing loop at src/test.cpp:163
src/test.cpp:163: note: LOOP VECTORIZED.
src/test.cpp:163: note: OUTER LOOP VECTORIZED.
It does nothing for dynamic allocation, I am quite surprised of this, I do not understand why the compiler can optimize this non-contiguous memory access (cc3[k][j]
) when statically allocated and not when dynamically allocated (cc3(k, j)
).
Upvotes: 1
Reputation: 2518
The most important part should be the knowledge, which is available to the compiler.
The static version has a fixed array size which can be used by the compiler to enforce better optimizations. E.g. the distance between the rows of your matrix is fixed (cc3(n,1)
is next to cc3(1,2)
in Fortran memory), while the dynamic array could have some padding (there could be an element cc3(n+1,1)
. Actually, taking a look at the output of -fopt-info-optimized
we can see, that the loop at l.95 gets only optimized in the static case.
To check this, I modified your program to use linear arrays to represent the matrices. Timing the program, I got no significant time difference between static and dynamic allocation, and the 2d version with optimal loop order performed with the same speed.
program Test
#ifdef ALLOCATION_DYNAMIC
integer :: n = N
double precision, dimension(:), allocatable :: cc1
double precision, dimension(:), allocatable :: cc2
double precision, dimension(:), allocatable :: cc3
#else
integer, parameter :: n = N
double precision, dimension(n**2) :: cc1
double precision, dimension(n**2) :: cc2
double precision, dimension(n**2) :: cc3
#endif
character(len = 5) :: optflag = OPTFLAG
character(len = 8) :: time = SEED
#ifdef ALLOCATION_DYNAMIC
character(len = 10) :: arg
#endif
double precision :: sum
double precision :: cpu_start, cpu_end, wall_start, wall_end
integer :: clock_reading, clock_rate, clock_max
integer :: i, j, k, s
double precision, dimension(2) :: harvest
integer, dimension(:), allocatable :: seed
write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
if (iargc().gt.0) then
call getarg(1, arg)
read(arg, '(I8)') n
end if
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n
#ifdef ALLOCATION_DYNAMIC
allocate(cc1(n**2))
allocate(cc2(n**2))
allocate(cc3(n**2))
#endif
! Init.
call random_seed(size = s)
allocate(seed(s))
seed = 0
read(time(1:2), '(I2)') seed(1)
read(time(4:5), '(I2)') seed(2)
read(time(7:8), '(I2)') seed(3)
call random_seed(put = seed)
deallocate(seed)
do i = 1, n**2
call random_number(harvest)
cc1(i) = harvest(1)
cc2(i) = harvest(2)
cc3(i) = dble(0)
enddo
write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
write(6, FMT = '(A)', ADVANCE = 'NO') "no"
call cpu_time(cpu_start)
call system_clock (clock_reading, clock_rate, clock_max)
wall_start = dble(clock_reading) / dble(clock_rate)
! Work.
do j = 1, n
do i = 1, n
do k = 1, n
cc3((j-1)*n+i) = cc3((j-1)*n+i) + cc1((i-1)*n+k) * cc2((j-1)*n+k)
enddo
enddo
enddo
sum = dble(0)
do j = 1, n**2
sum = sum + cc3(i)
enddo
sum = sum / (n * n)
call cpu_time(cpu_end)
call system_clock (clock_reading, clock_rate, clock_max)
wall_end = dble(clock_reading) / dble(clock_rate)
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
write(6, FMT = '(A,F23.16)') " sum ", sum
#ifdef ALLOCATION_DYNAMIC
deallocate(cc1)
deallocate(cc2)
deallocate(cc3)
#endif
end program Test
Upvotes: 1