Reputation: 21
My task is to parallelize this function and make it faster than the sequential run time, but the #pragma omp parallel for statements I've attempted do not seem to have a substantial effect.
The sequential version of this code is essentially identical, save for the #pragma statements. I realize the code is very poorly written; that's part of the assignment, and the goal is to achieve an 8x speed-up. The Linux machine the code is to run on is an 8-core system with hyperthreading.
Run time is measured using the output of these lines of code:
clock_gettime(CLOCK_MONOTONIC, &start);
work_it_par(original, new);
clock_gettime(CLOCK_MONOTONIC, &finish);
Similar code times the sequential version of the same function, and the speed-up is calculated as sequential time / parallel time. However, my results are highly inconsistent and I cannot get a speed-up beyond about 1.5x.
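For reference, the elapsed times are then computed from the timespec pairs roughly like this (the variable names for the sequential timings are assumptions, not the exact assignment code):
double par_time = (finish.tv_sec - start.tv_sec)
                + (finish.tv_nsec - start.tv_nsec) / 1e9;
/* the sequential run fills seq_start/seq_finish the same way */
double seq_time = (seq_finish.tv_sec - seq_start.tv_sec)
                + (seq_finish.tv_nsec - seq_start.tv_nsec) / 1e9;
printf("speed-up: %f\n", seq_time / par_time);
The parallel version of the function: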
void work_it_par(long *old, long *new) {
  int i, j, k;
  int u, v, w;
  long compute_it;
  long aggregate=1.0;
  long weNeedTheFunc = we_need_the_func();
  long gimmieTheFunc = gimmie_the_func();
  int marker = DIM-1;

  #pragma omp parallel for private(i, j, k, compute_it)
  for (i=1; i<marker; i++) {
    for (j=1; j<marker; j++) {
      for (k=1; k<marker; k++) {
        compute_it = old[i*DIM*DIM+j*DIM+k] * weNeedTheFunc;
        aggregate += compute_it / gimmieTheFunc;
      }
    }
  }
  printf("AGGR:%ld\n",aggregate);

  //#pragma omp parallel for private(i, j, u, v)
  for (i=1; i<marker; i++) {
    #pragma omp parallel for private(k)
    for (j=1; j<marker; j++) {
      for (k=1; k<marker; k++) {
        new[i*DIM*DIM+j*DIM+k]=0;
        for (u=-1; u<=1; u++) {
          for (v=-1; v<=1; v++) {
            for (w=-1; w<=1; w++) {
              new[i*DIM*DIM+j*DIM+k]+=old[(i+u)*DIM*DIM+(j+v)*DIM+(k+w)];
            }
          }
        }
        new[i*DIM*DIM+j*DIM+k]/=27;
      }
    }
  }

  #pragma omp parallel for private(i, j)
  for (i=1; i<marker; i++) {
    //#pragma omp parallel for private(k)
    for (j=1; j<marker; j++) {
      for (k=1; k<marker; k++) {
        u=(new[i*DIM*DIM+j*DIM+k]/100);
        if (u<=0) u=0;
        if (u>=9) u=9;
        histogrammy[u]++;
      }
    }
  }
}
Upvotes: 2
Views: 2109
Reputation: 46
I achieved a 12x speed-up in wall-clock time with the code below. You can optimize even further by converting all three large loops into one single 3-tiered loop and placing the OpenMP pragmas inside it to control the flow (a rough sketch of that fused version appears after the code below).
void work_it_par(long *old, long *new) {
  int i, j, k;
  int i1, j1, k1;
  int u;
  long compute_it;
  long aggregate=1.0;
  int N = DIM-1;
  int gimme = gimmie_the_func();
  int need = we_need_the_func();

  #pragma omp parallel for private(i, j, k, compute_it) reduction(+:aggregate) //reduce this part
  for (i=1; i<N; i++)
  {
    for (j=1; j<N; j++)
    {
      for (k=1; k<N; k++)
      {
        compute_it = old[i*DIM*DIM+j*DIM+k] * need; ///removed the multiplications and divisions
        aggregate += compute_it / gimme;
      }
    }
  }
  //aggregate *= need;
  //aggregate /= gimme;
  printf("AGGR:%ld\n",aggregate);

  int D = DIM;
  int DSq = DIM*DIM;
  long increment; // per-element scratch sum: private, not a reduction variable
  #pragma omp parallel for private(i,j,k,i1,j1,k1,increment)
  for (i=1; i<N; i+=30)
  {
    for (j=1; j<N; j+=30)
    {
      for (k=1; k<N; k+=30)
      {
        for (i1=i; i1 < i+30 && i1 < N; i1++)
        {
          for (j1=j; j1 < j+30 && j1 < N; j1++)     // block over j, not i
          {
            for (k1=k; k1 < k+30 && k1 < N; k1++)   // block over k, not i
            {
              int index = i1*DIM*DIM+j1*DIM+k1;
              increment = 0;
              increment+=old[index-DSq-D-1]; //-1,-1
              increment+=old[index-DSq-D];
              increment+=old[index-DSq-D+1];
              increment+=old[index-DSq-1]; //-1,0
              increment+=old[index-DSq];
              increment+=old[index-DSq+1];
              increment+=old[index-DSq+D-1]; //-1,1
              increment+=old[index-DSq+D];
              increment+=old[index-DSq+D+1];
              increment+=old[index-D-1]; //0,-1
              increment+=old[index-D];
              increment+=old[index-D+1];
              increment+=old[index-1]; //0,0
              increment+=old[index];
              increment+=old[index+1];
              increment+=old[index+D-1]; //0,1
              increment+=old[index+D];
              increment+=old[index+D+1];
              increment+=old[index+DSq-D-1]; //1,-1
              increment+=old[index+DSq-D];
              increment+=old[index+DSq-D+1];
              increment+=old[index+DSq-1]; //1,0
              increment+=old[index+DSq];
              increment+=old[index+DSq+1];
              increment+=old[index+DSq+D-1]; //1,1
              increment+=old[index+DSq+D];
              increment+=old[index+DSq+D+1];
              new[index] = increment/27;
            }
          }
        }
      }
    }
  }

  int count0=0,count1=0,count2=0,count3=0,count4=0,count5=0,count6=0,count7=0,count8=0,count9=0;
  #pragma omp parallel for private(i,j,k,u) reduction(+:count0,count1,count2,count3,count4,count5,count6,count7,count8,count9)
  for (i=1; i<N; i++)
  {
    for (j=1; j<N; j++)
    {
      for (k=1; k<N; k++)
      {
        u=(new[i*DIM*DIM+j*DIM+k]/100);
        if (u<=0) u = 0;
        if (u>=9) u = 9;
        switch (u) {
          case 0: count0++; break;
          case 1: count1++; break;
          case 2: count2++; break;
          case 3: count3++; break;
          case 4: count4++; break;
          case 5: count5++; break;
          case 6: count6++; break;
          case 7: count7++; break;
          case 8: count8++; break;
          case 9: count9++; break;
        }
      }
    }
  }
  histogrammy[0] += count0;
  histogrammy[1] += count1;
  histogrammy[2] += count2;
  histogrammy[3] += count3;
  histogrammy[4] += count4;
  histogrammy[5] += count5;
  histogrammy[6] += count6;
  histogrammy[7] += count7;
  histogrammy[8] += count8;
  histogrammy[9] += count9;
}
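As a rough, untested sketch of the fused-loop idea mentioned above: the three passes can be collapsed into a single triply nested loop, since each element of new is fully computed before it is histogrammed in the same iteration. The aggregate printout then has to move after the loop, and the array reduction on the local count array needs OpenMP 4.5 or later. The function name and the count array are mine; DIM, histogrammy, we_need_the_func and gimmie_the_func are taken from the question. Whether this beats the blocked version depends on cache behavior and should be measured.
void work_it_par_fused(long *old, long *new) {
  int N = DIM - 1;
  long need = we_need_the_func();
  long gimme = gimmie_the_func();
  long aggregate = 1.0;
  long count[10] = {0};

  #pragma omp parallel for reduction(+:aggregate) reduction(+:count[:10])
  for (int i = 1; i < N; i++) {
    for (int j = 1; j < N; j++) {
      for (int k = 1; k < N; k++) {
        int index = i*DIM*DIM + j*DIM + k;
        aggregate += old[index] * need / gimme;      /* first pass */
        long sum = 0;                                /* second pass: 3x3x3 stencil */
        for (int u = -1; u <= 1; u++)
          for (int v = -1; v <= 1; v++)
            for (int w = -1; w <= 1; w++)
              sum += old[index + u*DIM*DIM + v*DIM + w];
        new[index] = sum / 27;
        int b = new[index] / 100;                    /* third pass: histogram bucket */
        if (b < 0) b = 0;
        if (b > 9) b = 9;
        count[b]++;
      }
    }
  }
  printf("AGGR:%ld\n", aggregate);                   /* printed after the fused loop */
  for (int b = 0; b < 10; b++) histogrammy[b] += count[b];
}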
Upvotes: 1
Reputation: 11537
In general, you must be careful and consider making a variable private whenever there is an assignment to it. Otherwise, there is a risk of races between threads that will lead to randomly incorrect behavior.
This happens in several places in your code.
compute_it = ... (already a private var)
aggregate += ... (a special case that requires a reduction)
u = ... (should be private)
histogrammy[u]++ (again a reduction problem)
Assignments to array elements can also be a problem, but it depends on the indexes. If the indexes are thread dependent and different in every thread, then it is generally correct, except for false-sharing situations. This is what happens in most of your assignments to arrays. For instance, for new[i*DIM*DIM+j*DIM+k]=..., every thread works with a different value of the parallelized loop index, so different parts of the array are touched and there is no specific parallel problem.
For the assignment to histogrammy[u], the situation is different, as u is data dependent and can be identical in different threads. It can be managed with a reduction in newer OpenMP versions, but otherwise a local accumulation must be done in every thread and, at the end, the global array must be updated in a properly protected region.
Here is a reworked version of your code (untested, as you did not provide a working example). I also added some comments and modifications unrelated to parallelization. Check the comments marked with a triple slash ///.
void work_it_par(long *old, long *new) {
  int i, j, k;
  int u, v, w;
  long compute_it;
  long aggregate=1.0; //// really?
                      //// I am really surprised that you use a long.
                      //// A double seems more appropriate
  long weNeedTheFunc = we_need_the_func();
  long gimmieTheFunc = gimmie_the_func();
  int marker = DIM-1;

  /// #pragma omp parallel for private(i, j, k, compute_it)
  #pragma omp parallel for private(i, j, k, compute_it) reduction(+:aggregate)
  /// introduced a reduction on aggregate
  for (i=1; i<marker; i++) {
    for (j=1; j<marker; j++) {
      for (k=1; k<marker; k++) {
        compute_it = old[i*DIM*DIM+j*DIM+k] * weNeedTheFunc;
        /// aggregate+= compute_it / gimmieTheFunc; /// race on shared var aggregate
        ///                                         /// solved by the reduction
        aggregate += compute_it ; /// Unrelated to parallel processing,
                                  /// but do not do a division in your loop
                                  /// divisions are *expensive* and
                                  /// denominator is always the same
      }
    }
  }
  aggregate /= gimmieTheFunc ; /// now we do the division, but just once
  printf("AGGR:%ld\n",aggregate);

  //#pragma omp parallel for private(i, j, u, v)
  for (i=1; i<marker; i++) {
    #pragma omp parallel for private(k, u, v, w) /// u, v, w are assigned in every thread,
                                                 /// so they must be private as well
    for (j=1; j<marker; j++) {
      for (k=1; k<marker; k++){
        new[i*DIM*DIM+j*DIM+k]=0;
        for (u=-1; u<=1; u++) {
          for (v=-1; v<=1; v++) {
            for (w=-1; w<=1; w++) {
              new[i*DIM*DIM+j*DIM+k]+=old[(i+u)*DIM*DIM+(j+v)*DIM+(k+w)];
            }
          }
        }
        new[i*DIM*DIM+j*DIM+k]/=27;
      }
    }
  }

  ///#pragma omp parallel for private(i, j)
  #pragma omp parallel private(i, j, k, u) /// parallel region (k and u must also be private)
  {
    int private_histogrammy[10]; /// used to accumulate in the threads
    for (int ii=0; ii<10; ii++) private_histogrammy[ii]=0;
    # pragma omp for /// a parallel for loop in the parallel region
    for (i=1; i<marker; i++) {
      for (j=1; j<marker; j++) {
        for (k=1; k<marker; k++) {
          /// u=(new[i*DIM*DIM+j*DIM+k]/100);
          u=(new[i*DIM*DIM+j*DIM+k]); /// to reduce number of divisions
          /// if (u<=0) u=0;
          /// if (u>=9) u=9;
          /// histogrammy[u]++;
          if (u<=0) private_histogrammy[0]++;
          else if (u>=900) private_histogrammy[9]++;
          else private_histogrammy[u/100]++;
        }
      }
    }
    /// all is done, update the global histogrammy
    # pragma omp critical
    /// update the shared array
    /// probably faster with a critical section that updates globally
    /// the (small) array than using atomic on array elements
    /// but alternatives may be tested
    for (int uu=0; uu<10; uu++) histogrammy[uu] += private_histogrammy[uu];
  } /// end of parallel region
}
Upvotes: 1
Reputation: 22670
First and foremost, your code is wrong in many places. I counted 7 race conditions at first glance.
I suggest using the following general rules:
Declare variables as locally as possible. This is easier to get right than trying to figure out which variables need to be private. It can also help to declare variables as const, to make it obvious that they can safely be shared.
If you sum up a variable in a parallel loop, use a reduction clause.
Applying these principles to the first loop looks like this:
#pragma omp parallel for reduction(+:aggregate)
for (int i=1; i<marker; i++) {
  for (int j=1; j<marker; j++) {
    for (int k=1; k<marker; k++) {
      long compute_it = old[i*DIM*DIM+j*DIM+k] * weNeedTheFunc;
      aggregate += compute_it / gimmieTheFunc;
    }
  }
}
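Along the same lines, the loop-invariant values from the question can be made const so it is obvious they are safely shared (a small sketch, not part of the original code):
const long weNeedTheFunc = we_need_the_func();
const long gimmieTheFunc = gimmie_the_func();
const int marker = DIM - 1;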
For the histogram, you can also use reduction(+:histogrammy[:10]) since OpenMP 4.5, or place #pragma omp atomic update before the increment operation. Which one is better depends on the size: array reduction has a per-thread memory cost, atomic update has a contention penalty.
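A minimal sketch of both options, reusing the loop and names from the question (untested):
/* Option 1: array reduction (OpenMP 4.5 or later) */
#pragma omp parallel for reduction(+:histogrammy[:10])
for (int i=1; i<marker; i++) {
  for (int j=1; j<marker; j++) {
    for (int k=1; k<marker; k++) {
      int u = new[i*DIM*DIM+j*DIM+k] / 100;
      if (u < 0) u = 0;
      if (u > 9) u = 9;
      histogrammy[u]++;
    }
  }
}

/* Option 2: atomic update on the shared array */
#pragma omp parallel for
for (int i=1; i<marker; i++) {
  for (int j=1; j<marker; j++) {
    for (int k=1; k<marker; k++) {
      int u = new[i*DIM*DIM+j*DIM+k] / 100;
      if (u < 0) u = 0;
      if (u > 9) u = 9;
      #pragma omp atomic update
      histogrammy[u]++;
    }
  }
}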
Typically, you parallelize the outermost loop where it is safe to do so. For nested loops, it can be beneficial to apply a collapse clause, which includes multiple loops in the worksharing. Whether that helps depends on the number of threads, the loop sizes, and the load balance. Typically it doesn't hurt. For example:
#pragma omp parallel for collapse(3)
for (int i=1; i < marker; i++) {
  for (int j=1; j < marker; j++) {
    for (int k=1; k < marker; k++) {
Once you are sure the code is correct and you want to look at performance, consider the following: use performance analysis tools that understand OpenMP / threads. And if you want to discuss actual performance on Stack Overflow, you must include details of how you measured it and the numbers you observed.
Upvotes: 3