Reputation: 9398
I Want to optimize the following function using SIMD (SSE2 & such):
int64_t fun(int64_t N, int size, int* p)
{
int64_t sum = 0;
for(int i=1; i<size; i++)
sum += (N/i)*p[i];
return sum;
}
This seems like an eminently vectorizable task, except that the needed instructions just aren't there ...
We can assume that N is very large (10^12 to 10^18) and size~sqrt(N). We can also assume that p can only take values of -1, 0, and 1; so we don't need a real multiplication, the (N/i)*p[i] can be done with four instructions (pcmpgt, pxor, psub, pand), if we could just somehow compute N/i.
Upvotes: 2
Views: 1229
Reputation: 54634
The derivative of 1/x
is -1/x^2
, which means as x
gets bigger, N/x==N/(x + 1)
.
For a known value of N/x
(let's call that value r
), we can determine the next value of x
(let's call that value x'
such that N/x'<r
:
x'= N/(r - 1)
And since we are dealing with integers:
x'= ceiling(N/(r - 1))
So, the loop becomes something like this:
int64_t sum = 0;
int i=1;
int r= N;
while (i<size)
{
int s= (N + r - 1 - 1)/(r - 1);
while (i<s && i<size)
{
sum += (r)*p[i];
++i;
}
r= N/s;
}
return sum;
For sufficiently large N, you will have many many runs of identical values for N/i
. Granted, you will hit a divide by zero if you aren't careful.
Upvotes: 2
Reputation: 1458
This is as close as I could get to vectorizing that code. I don't really expect it to be faster. I was just trying my hand at writting SIMD code.
#include <stdint.h>
int64_t fun(int64_t N, int size, const int* p)
{
int64_t sum = 0;
int i;
for(i=1; i<size; i++) {
sum += (N/i)*p[i];
}
return sum;
}
typedef int64_t v2sl __attribute__ ((vector_size (2*sizeof(int64_t))));
int64_t fun_simd(int64_t N, int size, const int* p)
{
int64_t sum = 0;
int i;
v2sl v_2 = { 2, 2 };
v2sl v_N = { N, N };
v2sl v_i = { 1, 2 };
union { v2sl v; int64_t a[2]; } v_sum;
v_sum.a[0] = 0;
v_sum.a[1] = 0;
for(i=1; i<size-1; i+=2) {
v2sl v_p = { p[i], p[i+1] };
v_sum.v += (v_N / v_i) * v_p;
v_i += v_2;
}
sum = v_sum.a[0] + v_sum.a[1];
for(; i<size; i++) {
sum += (N/i)*p[i];
}
return sum;
}
typedef double v2df __attribute__ ((vector_size (2*sizeof(double))));
int64_t fun_simd_double(int64_t N, int size, const int* p)
{
int64_t sum = 0;
int i;
v2df v_2 = { 2, 2 };
v2df v_N = { N, N };
v2df v_i = { 1, 2 };
union { v2df v; double a[2]; } v_sum;
v_sum.a[0] = 0;
v_sum.a[1] = 0;
for(i=1; i<size-1; i+=2) {
v2df v_p = { p[i], p[i+1] };
v_sum.v += (v_N / v_i) * v_p;
v_i += v_2;
}
sum = v_sum.a[0] + v_sum.a[1];
for(; i<size; i++) {
sum += (N/i)*p[i];
}
return sum;
}
#include <stdio.h>
static const int test_array[] = {
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0
};
#define test_array_len (sizeof(test_array)/sizeof(int))
#define big_N (1024 * 1024 * 1024)
int main(int argc, char *argv[]) {
int64_t res1;
int64_t res2;
int64_t res3;
v2sl a = { 123, 456 };
v2sl b = { 100, 200 };
union { v2sl v; int64_t a[2]; } tmp;
a = a + b;
tmp.v = a;
printf("a = { %ld, %ld }\n", tmp.a[0], tmp.a[1]);
printf("test_array size = %zd\n", test_array_len);
res1 = fun(big_N, test_array_len, test_array);
printf("fun() = %ld\n", res1);
res2 = fun_simd(big_N, test_array_len, test_array);
printf("fun_simd() = %ld\n", res2);
res3 = fun_simd_double(big_N, test_array_len, test_array);
printf("fun_simd_double() = %ld\n", res3);
return 0;
}
Upvotes: 2
Reputation: 74522
The cost is concentrated in computing the divisions. There is no opcode in SSE2 for integral divisions, so you would have to implement a division algorithm yourself, bit by bit. I do not think it would be worth the effort: SSE2 allow you to perform two instances in parallel (you use 64-bit numbers, and SSE2 registers are 128-bit) but I find it likely that a handmade division algorithm would be at least twice as slow as the CPU idiv
opcode.
(By the way, do you compile in 32-bit or 64-bit mode ? The latter will be more comfortable with 64-bit integers.)
Reducing the overall number of divisions looks like a more promising way. One may note that for positive integers x and y, then floor(x/(2y)) = floor(floor(x/y)/2). In C terminology, once you have computed N/i
(truncated division) then you just have to shift it right by one bit to obtain N/(2*i)
. Used properly, this makes half of your divisions almost free (that "properly" also includes accessing the billions of p[i]
values in a way which does not wreak havoc with the caches, so it does not seem very easy).
Upvotes: 1
Reputation: 213190
I suggest you do this with floating point SIMD operations - either single or double precision depending on your accuracy requirements. Conversion from int to float or double is relatively fast using SSE.
Upvotes: 1