Reputation: 748
There are a bunch of questions asking "why is my parallel loop slower than the sequential one?", to which the answer is "the work inside the loop is too small relative to the threading overhead", like this one.
I have a loop in which each iteration takes about half a minute, so I thought it was "heavy" enough that any overhead associated with threading would be insignificant. It turns out that the parallel execution is slower. My code is in C and I am using OpenMP to parallelize it. The structure of my code is as follows:
int main() {
    malloc and calculate group G1 of arrays;
    #pragma omp parallel
    {
        printf("available threads: %i\n", omp_get_num_threads());
        malloc group G2 of arrays;
        #pragma omp for
        for (int i = 0; i < N; i++) {
            calculate G2 arrays' elements through numerical integration;
            do some linear algebra with G1 and G2 to assemble the system Ax=b;
            solve Ax = b;
        }
    }
    return 0;
}
Some clarifications:
For N=6, the serial execution takes 3 minutes, while the parallel one (3 threads) takes 4 minutes. For N=30, the serial execution takes 15 minutes and the parallel one (3 threads) takes 17 minutes.
I am trying to formulate a hypothesis as to why this is happening. Maybe it has something to do with the CPU cache and the size of my arrays?
Here is some info about the computer I am using:
Linux 4.15.0-74-generic #84-Ubuntu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
Thread(s) per core: 1
Core(s) per socket: 4
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 58
Model name: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
Stepping: 9
CPU MHz: 3392.363
CPU max MHz: 3400,0000
CPU min MHz: 1600,0000
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 8192K
NUMA node0 CPU(s): 0-3
The code below demonstrates what I am doing. I tried to make it as simple and small as possible. It depends on Netlib's LAPACK and OpenBLAS (built with USE_OPENMP=1). My actual program has thousands of lines.
To compile:
gcc -O2 -m64 -fopenmp test.c -lopenblas -llapack -lpthread -lm
To run:
export OMP_NUM_THREADS=3
./a.out 30
// test.c
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>
// LAPACK routines
extern void
zgesv_ (int *n, int *nrhs, _Complex double *a, int *lda, int* ipiv,
        _Complex double *b, int *ldb, int* info);

extern void
zgemv_ (char *trans, int *m, int *n, _Complex double *alpha, _Complex double *a,
        int *lda, _Complex double *x, int *incx, _Complex double *beta,
        _Complex double *y, int *incy);

int
main (int argc, char **argv)
{
    srand(300);

    if (argc != 2) {
        printf("must have 1 argument: number of iterations!\n");
        exit(1);
    }
    const int nf = atoi(argv[1]);
    printf("loop will have %i iterations\n", nf);

    clock_t begin, end;
    double time_spent;
    begin = clock();

    // group G1: a symmetric ne-by-ne matrix of random complex numbers
    int ne = 2296;
    int ne2 = ne * ne;
    _Complex double* restrict pot = malloc(ne2 * sizeof(_Complex double));
    for (int i = 0; i < ne; i++) {
        for (int k = i; k < ne; k++) {
            pot[i * ne + k] = (double) rand() / RAND_MAX;
            pot[i * ne + k] *= I;
            pot[i * ne + k] += (double) rand() / RAND_MAX;
            pot[k * ne + i] = pot[i * ne + k];
        }
    }

    char trans = 'N';
    _Complex double one = 1.0;
    _Complex double zero = 0.0;

    #pragma omp parallel
    {
        int n = ne;
        int ipiv[n]; // pivot indices
        int info;
        int nrhs = 1;

        #pragma omp single
        {
            printf("available threads: %i\n", omp_get_num_threads());
        }

        // group G2: work arrays private to each thread
        _Complex double* restrict zl = malloc(ne2 * sizeof(_Complex double));
        _Complex double* restrict ie = malloc(ne2 * sizeof(_Complex double));
        _Complex double gpr;

        #pragma omp for
        for (int i = 0; i < nf; i++) {
            printf("i = %i from thread %d\n", i, omp_get_thread_num());

            // assemble the system matrix
            for (int m = 0; m < ne; m++) {
                for (int k = m; k < ne; k++) {
                    gpr = cexp(k - m);
                    zl[m * ne + k] = gpr * pot[m * ne + k];
                    zl[k * ne + m] = zl[m * ne + k];
                }
            }

            // right-hand side
            ie[0] = 1.0;
            for (int m = 1; m < ne; m++) {
                ie[m] = 0.0;
            }

            // solve Ax = b
            zgesv_(&n, &nrhs, zl, &n, ipiv, ie, &n, &info);

            // Check for the exact singularity
            if (info > 0) {
                printf("The diagonal element of the triangular factor of ZL,\n");
                printf("U(%i,%i) is zero, so that ZL is singular;\n", info, info);
                printf("the solution could not be computed.\n");
            }

            zgemv_(&trans, &ne, &ne, &one, zl, &ne, ie, &nrhs, &zero, ie, &nrhs);

            // post-processing (stand-in for the numerical integration)
            for (int p = 0; p < ne2; p++) {
                gpr = 0.0;
                for (int m = 0; m < ne; m++) {
                    gpr += ie[m] * cexp(-m * 5.4) / 4.1;
                }
            }
        }
        free(zl);
        free(ie);
    }
    free(pot);

    end = clock();
    time_spent = (double) (end - begin) / CLOCKS_PER_SEC;
    printf("Success. Elapsed time: %.2f minutes\n", time_spent / 60.0);
    return 0;
}
Upvotes: 4
Views: 455
Reputation: 748
I had a professor who said:
If the experimental results do not agree with the theoretical ones, a question you must ask is "How are you measuring?"
As @Gilles pointed out in the comments to the question: use omp_get_wtime() instead of clock(). That is because clock() returns the CPU time accumulated across all threads, while omp_get_wtime() returns the wall-clock time (see the OpenMP Forums).
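To illustrate the difference, here is a minimal, self-contained sketch (the file name and the busy-work loop are made up for this example, not taken from the original program). With several threads, the time reported via clock() stays about the same or even grows, while the time reported via omp_get_wtime() is the one that actually shrinks:
// timing_demo.c -- compile with: gcc -O2 -fopenmp timing_demo.c -lm
#include <math.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>

int main(void)
{
    clock_t c0 = clock();          // CPU time, summed over all threads
    double  w0 = omp_get_wtime();  // wall-clock time

    double sum = 0.0;              // arbitrary busy work to keep the cores occupied
    #pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < 100000000; i++)
        sum += sin(i * 1e-7);

    double cpu  = (double) (clock() - c0) / CLOCKS_PER_SEC;
    double wall = omp_get_wtime() - w0;

    printf("sum = %f\n", sum);
    printf("CPU time  (clock):         %.2f s\n", cpu);
    printf("wall time (omp_get_wtime): %.2f s\n", wall);
    return 0;
}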
Upvotes: 4