Reputation: 2321
I am trying to optimize the nested for loop in the function generate_histogram() below with openMP. I have tried around a lot with different combinations of pragmas based on what I've read in this SE post.
The problem is that the nested for loop performs faster without openMP than with openMP!
If I try to parallelize my code with reduction instead of the atomic pragma, I end up with netchunk fails. Does anybody know a fancy tweak for this one? I am trying to bin data into a histogram. So the histogram is variable in size in the real code, unlike in the snippet below.
#include<stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define float_t float
#include <time.h>
#include <omp.h>
float_t generate_histogram(float_t **matrix, int *histogram, int mat_size, int hist_size)
{
int i,j,k,count;
float_t max = 0.;
float_t sum;
//set histogram to zero everywhere
for(i = 0; i < hist_size; i++)
histogram[i] = 0;
//matrix computations
#pragma omp parallel for private(i) shared(histogram,j,k,max) schedule(dynamic)
//#pragma omp parallel for schedule(runtime)
for (i = 1; i < (mat_size-1); i++)
{
#pragma omp parallel for private(j,k) shared(histogram,max) schedule(dynamic)
//pragma omp prallel for schedule(dynamic)
for(j = 1; j < (mat_size-1); j++)
{
//assign current matrix[i][j] to element in order to reduce memory access
sum = fabs(matrix[i][j]-matrix[i-1][j]) + fabs(matrix[i][j] - matrix[i+1][j])
+ fabs(matrix[i][j]-matrix[i][j-1]) + fabs(matrix[i][j] - matrix[i][j+1]);
//compute index of histogram bin
k = (int)(sum * (float)mat_size);
#pragma omp atomic
histogram[k] += 1;
//keep track of largest element
if(sum > max)
max = sum;
}//end inner for
}//end outer for
return max;
}
main()
{
int i,j,N,boxes;
N = 10000;
float_t **matrix;
int* histogram;
boxes = N / 2;
//allocate a matrix with some numbers
matrix = calloc(N, sizeof(float_t **));
for(i = 0; i < N; i++)
matrix[i] = calloc(N, sizeof(float_t *));
for(i = 0; i < N; i++)
for(j = 0; j < N; j++)
matrix[i][j] = 1./(float_t) N * (float_t) i;
histogram = malloc(boxes * sizeof(int));
generate_histogram(matrix, histogram, N, boxes);
}
Upvotes: 8
Views: 21265
Reputation:
This is an interesting problem. I fixed your code. @KunHuang had the right idea but you have several more problems with private and shared variables.
Your old function is called generate_histogram
in which I commented out the omp stuff. The new one which uses OpenMP is called generate_histogram_omp
.
The old code finishes in time 0.67 seconds on my system (ivy bridge dual core) and the new code finishes in 0.32 seconds.
Also, I tried fusing your loop but it made the performance much worse (probably a cache issue) so I only parallelize the first loop and I still get a 2x speed up on two cores with the current code anyway. I left the fused code commented out if you want to play with it.
Lastly, your initial values of the matrix don't really fill out the histogram much i.e. only a few bins are being filled.
I compiled with
g++ hist.cpp -o hist -fopenmp -O3
The code:
#include<stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define float_t float
#include <time.h>
#include <omp.h>
float_t generate_histogram(float_t **matrix, int *histogram, int mat_size, int hist_size)
{
int i,j,k,count;
float_t max = 0.;
float_t sum;
//set histogram to zero everywhere
for(i = 0; i < hist_size; i++)
histogram[i] = 0;
//matrix computations
//#pragma omp parallel for schedule(runtime)
for (i = 1; i < (mat_size-1); i++)
{
//pragma omp prallel for schedule(dynamic)
for(j = 1; j < (mat_size-1); j++)
{
//assign current matrix[i][j] to element in order to reduce memory access
sum = fabs(matrix[i][j]-matrix[i-1][j]) + fabs(matrix[i][j] - matrix[i+1][j])
+ fabs(matrix[i][j]-matrix[i][j-1]) + fabs(matrix[i][j] - matrix[i][j+1]);
//compute index of histogram bin
k = (int)(sum * (float)mat_size);
histogram[k] += 1;
//keep track of largest element
if(sum > max)
max = sum;
}//end inner for
}//end outer for
return max;
}
float_t generate_histogram_omp(float_t **matrix, int *histogram, int mat_size, int hist_size) {
float_t max = 0.;
//set histogram to zero everywhere
int i;
for(i = 0; i < hist_size; i++)
histogram[i] = 0;
//matrix computations
#pragma omp parallel
{
int *histogram_private = (int*)malloc(hist_size * sizeof(int));
int i;
for(i = 0; i < hist_size; i++)
histogram_private[i] = 0;
float_t max_private = 0.;
int n;
int j;
#pragma omp for
for (i = 1; i < (mat_size-1); i++) {
for(j = 1; j < (mat_size-1); j++) {
// for (n=0; n < (mat_size-2)*(mat_size-2); n++) {
// int i = n/(mat_size-2)+1;
// int j = n%(mat_size-2)+1;
float_t sum = fabs(matrix[i][j]-matrix[i-1][j]) + fabs(matrix[i][j] - matrix[i+1][j])
+ fabs(matrix[i][j]-matrix[i][j-1]) + fabs(matrix[i][j] - matrix[i][j+1]);
//compute index of histogram bin
int k = (int)(sum * (float)mat_size);
histogram_private[k] += 1;
//keep track of largest element
if(sum > max_private)
max_private = sum;
}
}
#pragma omp critical
{
for(i = 0; i < hist_size; i++)
histogram[i] += histogram_private[i];
if(max_private>max)
max = max_private;
}
free(histogram_private);
}
return max;
}
int compare_hists(int *hist1, int *hist2, int N) {
int i;
int diff = 0;
for(i =0; i < N; i++) {
int tmp = hist1[i] - hist2[i];
diff += tmp;
if(tmp!=0) {
printf("i %d, hist1 %d, hist2 %d\n", i, hist1[i], hist2[i]);
}
}
return diff;
}
main() {
int i,j,N,boxes;
N = 10000;
float_t **matrix;
int* histogram1;
int* histogram2;
boxes = N / 2;
//allocate a matrix with some numbers
matrix = (float_t**)calloc(N, sizeof(float_t **));
for(i = 0; i < N; i++)
matrix[i] = (float_t*)calloc(N, sizeof(float_t *));
for(i = 0; i < N; i++)
for(j = 0; j < N; j++)
matrix[i][j] = 1./(float_t) N * (float_t) i;
histogram1 = (int*)malloc(boxes * sizeof(int));
histogram2 = (int*)malloc(boxes * sizeof(int));
for(i = 0; i<boxes; i++) {
histogram1[i] = 0;
histogram2[i] = 0;
}
double dtime;
dtime = omp_get_wtime();
generate_histogram(matrix, histogram1, N, boxes);
dtime = omp_get_wtime() - dtime;
printf("time %f\n", dtime);
dtime = omp_get_wtime();
generate_histogram_omp(matrix, histogram2, N, boxes);
dtime = omp_get_wtime() - dtime;
printf("time %f\n", dtime);
int diff = compare_hists(histogram1, histogram2, boxes);
printf("diff %d\n", diff);
}
Upvotes: 10
Reputation: 787
It is not possible to reduce an array or an struct in OpenMP, which is mentioned here: https://computing.llnl.gov/tutorials/openMP/#REDUCTION.
I think you can declare multiple copies of histogram
, each of which is used in one thread. After then use another OpenMP loop to add them up.
Upvotes: 2