Reputation: 20037
I have a function that is called through a function pointer and that uses all 32 Arm NEON registers.
The calling convention requires the callee to spill and restore the bottom halves of v8-v15 (that is, d8-d15). However, I would like to get rid of that, for instance by delegating the burden to the higher-level function.
Example:
#include "arm_neon.h"
// Accumulates b into a in place: each of the four float32x4 vectors of a
// receives the lane-wise sum of itself and the matching vector of b.
inline void add(float32x4x4_t &a, float32x4x4_t b) {
    for (int i = 0; i < 4; ++i) {
        a.val[i] = vaddq_f32(a.val[i], b.val[i]);
    }
}
// Demo kernel that keeps four float32x4x4_t accumulators (16 q-sized vectors)
// live across a loop.
//
// input: pointer to at least 64 floats; four groups of 16 are read from it.
// n:     iteration count. The loop body executes before the counter is
//        tested, so callers are expected to pass n >= 1.
// Returns the accumulator a after the loop, with b, c and d added to it.
float32x4x4_t foo(float *input, int n) {
    // Each vld4q_f32 consumes 16 floats, so the groups start 16 floats apart.
    float32x4x4_t a = vld4q_f32(input);
    float32x4x4_t b = vld4q_f32(input + 16);
    float32x4x4_t c = vld4q_f32(input + 32);
    float32x4x4_t d = vld4q_f32(input + 48);

    for (;;) {
        add(a, b);
        add(b, c);
        add(c, d);
        add(d, d);  // d is added to itself, i.e. doubled each iteration
        if (--n == 0) {
            break;
        }
    }

    // Final reduction into a; the order is kept so rounding is unchanged.
    add(a, b);
    add(a, c);
    add(a, d);
    return a;
}
...
foo(float*, int): // @foo(float*, int)
stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
stp d9, d8, [sp, #16] // 16-byte Folded Spill
mov x9, x0
At least I can force the caller of the Neon intensive function to spill and restore all the callee saved registers, but is there a way to convince the non-inlined function that this has been already taken care of?
// Wrapper around foo() whose only extra job is to make *this* function save
// and restore d8-d15 in its own prologue/epilogue.
//
// input, n: forwarded unchanged to foo().
// Returns foo()'s result.
float32x4x4_t bar(float *input, int n) {
    // bar is declared to return a value, so foo's result must be propagated:
    // flowing off the end of a non-void function is undefined behavior in C++.
    float32x4x4_t result = foo(input, n);
    // Empty asm statement: it emits no instructions, but naming d8-d15 in the
    // clobber list makes the compiler treat them as modified here, so it
    // spills and restores them around this function's body.
    asm("" : : : "d8","d9","d10","d11","d12","d13","d14","d15");
    return result;
}
...
bar(float*, int): // @bar(float*, int)
stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
stp d13, d12, [sp, #16] // 16-byte Folded Spill
stp d11, d10, [sp, #32] // 16-byte Folded Spill
stp d9, d8, [sp, #48] // 16-byte Folded Spill
mov x8, x0
Upvotes: 1
Views: 66