Reputation: 9
I am struggling with a lack of performance on the usage of a direct lookup table with equidistant inputs, for a 2D interpolation.
The goal is to find the 4 values (z00,z01,z11,z10) in the table z(x,y) corresponding to the two closest values of each of the inputs x and y, (x0 < x < x1) and (y0 < y < y1).
For example, the following lookup:
x
y | 1 | 2 | 3 |
---|---|---|---|
4 | 20 | 23 | 29 |
6 | 35 | 37 | 43 |
8 | 47 | 50 | 55 |
Is represented by the following array:
const f32 lookup {20,35,47,23,37,50,29,43,55}
Additionally together with the definition of the array a structure provides the following data to allow a more efficient lookup:
lowest_x = 1;
lowest_y = 4;
step_x = 1;
step_y = 2;
length_x = 3;
length_y = 3;
The most time consuming part of the algorithm is in finding the indices corresponding to the intersection of the values before and after the input x and y.
My current approach is to calculate them as follows:
Given that x0 and y0 are in indices multiple of:
index_x0 = u32 ((x-lowest_x)/step_x);
index_y0 = u32 ((y-lowest_y)/step_y);
Then x0,x1,y0 and y1 are:
x0 = lowest_x + index_x0 * step_x ;
x1 = x0 + step_x ;
y0 = lowest_y + index_y0 * step_y ;
y1 = y0 + step_y ;
And the 4 necessary values from the lookup z(x,y) are:
z00_index = index_x0*length_y0+index_y0;
z00= lookup[z00_index]
z01= lookup[z00_index+1]
z10= lookup[z00_index+length_y0];
z11= lookup[z00_index+length_y0+1];
The 2D interpolation is then performed as two interpolations of x along y0 and y1 and one interpolation of y:
zxy0 = (x1-x)/(x1-x0)*z00 + (x-x0)/(x1-x0)*z10;
zxy1 = (x1-x)/(x1-x0)*z01 + (x-x0)/(x1-x0)*z11;
z = (y1-y)/(y1-y0)*zxy0 + (y-y0)/(y-y0)*zxy1;
Any suggestions on how to improve this?
Upvotes: -1
Views: 1091
Reputation: 29126
I'm not on embedded, but there are some opportunities for microoptimization, especially reducing the number of multiplications and divisions and avoing to recalculate the same stuff again.
If you do the calculation of the index to a real number first:
double ix = (x - x.base) / x.step;
you will get a "real index", which you can convert to an integer:
unsigned ix0 = ix;
Now the difference between these give you interpolation indices:
double r1 = ix - ix0; // == (x - x0) / (x1 - x0)
double r0 = 1.0 - r1; // == (x1 - x) / (x1 - x0)
You now have all you need for x
: the lookup index and the interpolation coefficients. What you don't need is the actual x
values at the beginning and end of the interval. Do the same with y
to get iy0
, r0
and r1
and your interpolated value is:
double z = r0 * (s0 * z00 + s1 * z01) + r1 * (s0 * z10 + s1 * z11);
That's one division per coordinate, which you can turn into a multiplication by precalculating the factors 1.0 / x.step
and 1.0 / y.step
.
Here's a full example implementation of your original code and the suggested improvements. Tested on a PC, where I got a 30%-40% speedup, but not tested on embedded. Careful: The code doesn't do any bounds checking!
#include <stdlib.h>
#include <stdio.h>
typedef struct Lookup Lookup;
typedef struct Range Range;
struct Range {
double base;
double step;
unsigned length;
};
struct Lookup {
Range x, y;
double *data;
double xdenom;
double ydenom;
};
double lookup1(const Lookup *lo, double x, double y)
{
unsigned index_x0 = (x - lo->x.base) / lo->x.step;
unsigned index_y0 = (y - lo->y.base) / lo->y.step;
double x0 = lo->x.base + index_x0 * lo->x.step;
double x1 = x0 + lo->x.step;
double y0 = lo->y.base + index_y0 * lo->y.step;
double y1 = y0 + lo->y.step;
double *p = lo->data + index_x0 * lo->y.length + index_y0;
double z00 = p[0];
double z01 = p[1];
double z10 = p[lo->y.length];
double z11 = p[lo->y.length + 1];
double zxy0 = (x1 - x) / (x1 - x0)*z00 + (x - x0) / (x1 - x0)*z10;
double zxy1 = (x1 - x) / (x1 - x0)*z01 + (x - x0) / (x1 - x0)*z11;
double z = (y1 - y) / (y1 - y0)*zxy0 + (y - y0) / (y1 - y0)*zxy1;
return z;
}
double lookup2(const Lookup *lo, double x, double y)
{
double ix = (x - lo->x.base) * lo->xdenom;
double iy = (y - lo->y.base) * lo->ydenom;
unsigned ix0 = ix;
unsigned iy0 = iy;
double r1 = ix - ix0;
double r0 = 1.0 - r1;
double s1 = iy - iy0;
double s0 = 1.0 - s1;
double *p = lo->data + (ix0 * lo->y.length + iy0);
double z00 = p[0];
double z01 = p[1];
double z10 = p[lo->y.length];
double z11 = p[lo->y.length + 1];
double z = r0 * (s0 * z00 + s1 * z01)
+ r1 * (s0 * z10 + s1 * z11);
return z;
}
double urand(void)
{
return rand() / (1.0 + RAND_MAX);
}
enum {
L = 1 << 24
};
int main(void)
{
Lookup lo = {
{10, 0.1, 80},
{10, 0.1, 80},
};
unsigned i, j;
double *p;
unsigned l = L;
double sum = 0;
lo.xdenom = 1.0 / lo.x.step;
lo.ydenom = 1.0 / lo.y.step;
p = lo.data = malloc(lo.x.length * lo.y.length * sizeof(*lo.data));
for (i = 0; i < lo.x.length; i++) {
for (j = 0; j < lo.y.length; j++) {
*p++ = urand();
}
}
while (l--) {
double x = lo.x.base + (lo.x.length * lo.x.step) * urand();
double y = lo.y.base + (lo.y.length * lo.y.step) * urand();
double z = lookup2(&lo, x, y);
sum += z;
}
printf("%g\n", sum / L);
free(lo.data);
return 0;
}
Upvotes: 1