Reputation: 361
I have the following method called pgain which calls the method dist that I am trying to parallize:
/******************************************************************************/
/* For a given point x, find the cost of the following operation:
* -- open a facility at x if there isn't already one there,
* -- for points y such that the assignment distance of y exceeds dist(y, x),
* make y a member of x,
* -- for facilities y such that reassigning y and all its members to x
* would save cost, realize this closing and reassignment.
*
* If the cost of this operation is negative (i.e., if this entire operation
* saves cost), perform this operation and return the amount of cost saved;
* otherwise, do nothing.
*/
/* numcenters will be updated to reflect the new number of centers */
/* z is the facility cost, x is the number of this point in the array
points */
double pgain ( long x, Points *points, double z, long int *numcenters )
{
int i;
int number_of_centers_to_close = 0;
static double *work_mem;
static double gl_cost_of_opening_x;
static int gl_number_of_centers_to_close;
int stride = *numcenters + 2;
//make stride a multiple of CACHE_LINE
int cl = CACHE_LINE/sizeof ( double );
if ( stride % cl != 0 ) {
stride = cl * ( stride / cl + 1 );
}
int K = stride - 2 ; // K==*numcenters
//my own cost of opening x
double cost_of_opening_x = 0;
work_mem = ( double* ) malloc ( 2 * stride * sizeof ( double ) );
gl_cost_of_opening_x = 0;
gl_number_of_centers_to_close = 0;
/*
* For each center, we have a *lower* field that indicates
* how much we will save by closing the center.
*/
int count = 0;
for ( int i = 0; i < points->num; i++ ) {
if ( is_center[i] ) {
center_table[i] = count++;
}
}
work_mem[0] = 0;
//now we finish building the table. clear the working memory.
memset ( switch_membership, 0, points->num * sizeof ( bool ) );
memset ( work_mem, 0, stride*sizeof ( double ) );
memset ( work_mem+stride,0,stride*sizeof ( double ) );
//my *lower* fields
double* lower = &work_mem[0];
//global *lower* fields
double* gl_lower = &work_mem[stride];
#pragma omp parallel for
for ( i = 0; i < points->num; i++ ) {
float x_cost = dist ( points->p[i], points->p[x], points->dim ) * points->p[i].weight;
float current_cost = points->p[i].cost;
if ( x_cost < current_cost ) {
// point i would save cost just by switching to x
// (note that i cannot be a median,
// or else dist(p[i], p[x]) would be 0)
switch_membership[i] = 1;
cost_of_opening_x += x_cost - current_cost;
} else {
// cost of assigning i to x is at least current assignment cost of i
// consider the savings that i's **current** median would realize
// if we reassigned that median and all its members to x;
// note we've already accounted for the fact that the median
// would save z by closing; now we have to subtract from the savings
// the extra cost of reassigning that median and its members
int assign = points->p[i].assign;
lower[center_table[assign]] += current_cost - x_cost;
}
}
// at this time, we can calculate the cost of opening a center
// at x; if it is negative, we'll go through with opening it
for ( int i = 0; i < points->num; i++ ) {
if ( is_center[i] ) {
double low = z + work_mem[center_table[i]];
gl_lower[center_table[i]] = low;
if ( low > 0 ) {
// i is a median, and
// if we were to open x (which we still may not) we'd close i
// note, we'll ignore the following quantity unless we do open x
++number_of_centers_to_close;
cost_of_opening_x -= low;
}
}
}
//use the rest of working memory to store the following
work_mem[K] = number_of_centers_to_close;
work_mem[K+1] = cost_of_opening_x;
gl_number_of_centers_to_close = ( int ) work_mem[K];
gl_cost_of_opening_x = z + work_mem[K+1];
// Now, check whether opening x would save cost; if so, do it, and
// otherwise do nothing
if ( gl_cost_of_opening_x < 0 ) {
// we'd save money by opening x; we'll do it
for ( int i = 0; i < points->num; i++ ) {
bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 ;
if ( switch_membership[i] || close_center ) {
// Either i's median (which may be i itself) is closing,
// or i is closer to x than to its current median
points->p[i].cost = points->p[i].weight * dist ( points->p[i], points->p[x], points->dim );
points->p[i].assign = x;
}
}
for ( int i = 0; i < points->num; i++ ) {
if ( is_center[i] && gl_lower[center_table[i]] > 0 ) {
is_center[i] = false;
}
}
if ( x >= 0 && x < points->num ) {
is_center[x] = true;
}
*numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
} else {
gl_cost_of_opening_x = 0; // the value we'll return
}
free ( work_mem );
return -gl_cost_of_opening_x;
}
The function that I am trying to parallelize:
/* compute Euclidean distance squared between two points */
float dist ( Point p1, Point p2, int dim )
{
float result=0.0;
#pragma omp parallel for reduction(+:result)
for (int i=0; i<dim; i++ ){
result += ( p1.coord[i] - p2.coord[i] ) * ( p1.coord[i] - p2.coord[i] );
}
return ( result );
}
With Point being this:
/* this structure represents a point */
/* these will be passed around to avoid copying coordinates */
typedef struct {
float weight;
float *coord;
long assign; /* number of point where this one is assigned */
float cost; /* cost of that assignment, weight*distance */
} Point;
I have a large application of streamcluster(815 lines of code) that produces real time numbers and sorts them in a specific way. I have used scalasca tool on Linux so I can measure the methods that take up most of the time and I have found that method dist listed above is the most time-consuming. I am trying to use openMP tools but the time that the parallelized code runs is more than the time the serial code. If serial code runs in 1,5 sec the parallelized takes 20 but the results are the same. And I am wondering is it that I can't parallelize this part of code for some reason or that I don't do it correctly. The method I am trying to parallelize its in a call tree: main->pkmedian->pFL->pgain->dist (-> means that calls the following method)
Upvotes: 2
Views: 925
Reputation: 61239
The code you've chosen to parallelize:
float result=0.0;
#pragma omp parallel for reduction(+:result)
for (int i=0; i<dim; i++ ){
result += ( p1.coord[i] - p2.coord[i] ) * ( p1.coord[i] - p2.coord[i] );
}
is a poor candidate to benefit from parallelization. You should not use parallel for
here. You should probably not use parallelization on an inner loop. If you can parallelize some outer loop, you're much more like to see gains.
There is an overhead to coordinate the thread team to start the parallel region and another overhead for performing the reduction afterwards. Meanwhile, the parallel region's contents take essentially no time to run. Given that, you'd need dim
to be extremely large before you'd expect this to give a performance benefit.
To express that point more graphically, consider that the math you're doing will take nanoseconds and compare it against this chart showing the overhead of various OpenMP directives.
If you need this to run faster, your first stop should be to use appropriate compilation flags, followed by looking into SIMD operations: SSE and AVX are good keywords. Your compiler might even invoke them automatically.
I've built some test code (see below) and compiled it with various optimizations enabled, as listed below, and run it on arrays of 100,000 elements. Note that enabling -O3
results in a run-time that is on the order of the OpenMP directives. This implies that you'd want arrays of about 400,000 before you'd want to think about using OpenMP and probably more like 1,000,000, to be safe.
-O3
: Enables many optimizations. Run-time is ~200μs.-ffast-math
: You want this, unless you're doing some very tricky things. Run-time is about the same.-march=native
: Compile code to use the full capabilities of your CPU, rather than a generic instruction set that would work on many CPUs. Run-time is ~100μs.So there we go, strategic use of compiler options (-march=native
) can double the speed of the code in question without having to muck about in parallelism.
Here is a handy slide presentation with some tips explaining how to use OpenMP in a performant manner.
Test code:
#include <vector>
#include <cstdlib>
#include <chrono>
#include <iostream>
int main(){
std::vector<double> a;
std::vector<double> b;
for(int i=0;i<100000;i++){
a.push_back(rand()/(double)RAND_MAX);
b.push_back(rand()/(double)RAND_MAX);
}
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
float result = 0.0;
//#pragma omp parallel for reduction(+:result)
for (unsigned int i=0; i<a.size(); i++ )
result += ( a[i] - b[i] ) * ( a[i] - b[i] );
std::chrono::steady_clock::time_point end= std::chrono::steady_clock::now();
std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << " microseconds"<<std::endl;
}
Upvotes: 3