Reputation: 723
I have written some serial code which I would like to optimise as much as possible before I parallelise it using OpenMP. The program reads in a PPM file by iterating through the pixel data in 4x4 cells (variable c
), then it finds the average RGB value of each of those 4x4 cells, and then finally writes to a new file by outputing the average colour value, again of each of the 4x4 cells. This creates a sort of mosaic/pixelation effect.
Having performance profiled my code, the main bottlenecks are fscanf
and fprintf
. I am ignoring execution time to read/write to disk, so these two functions do not matter.
My effort to optimise so far:
Loop jamming: There are two nested FOR loops within the code which have the exact same loop conditions. However, the second set of nested FOR loops requires that the functions needed to calculate the average RGB value are kept outside of that specific set. The average RGB value calculations are then needed for use in the third set of nested FOR loops (which have the same loop conditions as the second set). Because of this, I have struggled to combine the second and third sets of nested FOR loops despite their similarity.
Loop invariant computations: I have tried to move certain operations outside of the loop where possible, but this has proven to be difficult.
To summarise: How can I optimise this program to reduce the execution time as much as possible?
My code:
typedef struct { //struct holding RGB type int
int r, g, b; //12 bytes
} pixelInt;
typedef struct { //struct holding RGB type unsigned char
unsigned char r, g, b; //3 bytes
} pixel;
int c = 4; // Variable of 4x4 grids
int width, height; //image variable declarations
//Raw 1 dimensional store of pixel data - will contain all the data for each pixel in the image
pixel *data = (pixel *)calloc(width * height, sizeof(pixelInt));
//Loop through entire input image
for (int yy = 0; yy < height; yy += c)
{
for (int xx = 0; xx < width; xx += c)
{
//the total colour of cell of size 'c'
pixelInt cell_tot = { 0, 0, 0 }; //zero initialize struct containing mosaic cell pixel totals.
unsigned int counter = 0; //the counter for how many pixels are in a 4x4 cell
int bx = xx + c; //used in loop conditions
int by = yy + c;
// Store each color from the cell into cell_tot struct
for (int y = yy; y < by; y++)
{
for (int x = xx; x < bx; x++)
{
unsigned int index_1d = x + y * width; //calculate 1d index from x-index (x), y-index(y) and width;
unsigned char r, g, b; //maximum vales are 255, i.e. unsigned char data type
fscanf(f, "%hhu %hhu %hhu", &r, &g, &b); //%hhu is unsigned char specifier
//store the pixel value into data array
data[index_1d].r = r;
data[index_1d].g = g;
data[index_1d].b = b;
counter++; //increment counter
//average pixel color of cell
cell_tot.r += r;
cell_tot.g += g;
cell_tot.b += b;
}
}
//average colour of cell found by dividing cell total by loop counter
pixel cell_average;
cell_average.r = cell_tot.r / counter;
cell_average.g = cell_tot.g / counter;
cell_average.b = cell_tot.b / counter;
//Loop through the new image in cells of size c
for (int y = yy; y < by; y++)
{
for (int x = xx; x < bx; x++)
{
unsigned int index_1d = x + y * width; //calculate 1d index from x-index (x), y-index(y) and width;
//Assign average cell value to the pixels in the cell
data[index_1d].r = cell_average.r;
data[index_1d].g = cell_average.g;
data[index_1d].b = cell_average.b;
//Output the average colour value for the image
fprintf(f_output, "%hhu %hhu %hhu \t", data[index_1d].r, data[index_1d].g, data[index_1d].b);
}
fprintf(f_output, "\n"); //Prints new line
}
}
}
Upvotes: 2
Views: 101
Reputation: 72519
On a 1024x1024 image on my machine your code executes in 0.325s
. The following code executes in 0.182s
:
unsigned w = width/c, h = height/c;
unsigned *accum = (unsigned*)malloc(3*sizeof(unsigned)*w);
char *line = (char*)malloc(12*w);
unsigned denom = c*c;
//Loop through entire input image
for (int yy = 0; yy < h; ++yy)
{
memset(accum, 0, 3*sizeof(unsigned)*w);
// read and accumulate c lines
for(int y = 0; y < c; ++y)
{
for (int xx = 0; xx < w; ++xx)
{
for (int x = 0; x < c; ++x)
{
unsigned char r, g, b;
fscanf(f, "%hhu %hhu %hhu", &r, &g, &b);
accum[3*xx+0] += r;
accum[3*xx+1] += g;
accum[3*xx+2] += b;
}
}
}
// format a line
for(int xx = 0; xx < w; ++xx)
{
char *cell = line + 12*c*xx;
snprintf(cell, 12, "%3u%4u%4u", accum[3*xx]/denom, accum[3*xx+1]/denom, accum[3*xx+2]/denom);
cell[11] = '\t';
for(int x = 1; x < c; ++x)
memcpy(cell + 12*x, cell, 12);
}
// write it out times c
line[12*w-1] = '\n';
for(int y = 0; y < c; ++y)
fwrite(line, 12*w, 1, f_output);
}
The trick here is to format the averaged values only once and then duplicate the formatted strings. Also by acting on one row at a time I have better chances of utilizing the memory caches.
To go beyond that you would need to re-implement the fscanf
to parse the integers faster.
Upvotes: 2