Reputation: 313
I want to apply SSE instructions to a vector containing complex numbers. Without SSE instructions, I can do it with the following code. However, when I apply SSE instructions, I don't know how to get the calculated real and imaginary part back to the array. How can I solve this?
complex double * complexScaling(complex double *input, double c, int length)
{
for(int i=0; i<length; i++) {
input[i] = c*input[i];
}
return input;
}
complex double * complexScalingSSE(complex double *input, double c, int length)
{
__m128d multiplier,real,imag;
multiplier = _mm_set1_pd(c);
for(int i=0; i<length; i+=2) {
real = _mm_set_pd(creal(input[i]),creal(input[i+1]));
real = _mm_mul_pd(real, multiplier);
imag = _mm_set_pd(cimag(input[i]),cimag(input[i+1]));
imag = _mm_mul_pd(imag, multiplier);
}
return input;
}
Upvotes: 0
Views: 449
Reputation: 213120
You probably want something like this:
complex double * complexScalingSSE(complex double *input, double c, int length)
{
const __m128d vc = _mm_set1_pd(c);
for (int i = 0; i < length; i++)
{
__m128d v = _mm_loadu_pd((double *)&input[i]); // load one complex double
v = _mm_mul_pd(v, vc); // scale it
_mm_storeu_pd((double *)&input[i], v); // store it
}
return input;
}
You might want to try loop unrolling by a factor of 2 (but note that for large vectors this routine may well be I/O bound):
complex double * complexScalingSSE(complex double *input, double c, int length)
{
const __m128d vc = _mm_set1_pd(c);
int i;
for (i = 0; i <= length - 2; i += 2)
{
__m128d v1 = _mm_loadu_pd((double *)&input[i]); // load two complex doubles
__m128d v2 = _mm_loadu_pd((double *)&input[i + 1]);
v1 = _mm_mul_pd(v1, vc); // scale them
v2 = _mm_mul_pd(v2, vc);
_mm_storeu_pd((double *)&input[i], v1); // store them
_mm_storeu_pd((double *)&input[i + 1], v2);
}
if (i < length) // handle odd element at end
{
__m128d v = _mm_loadu_pd((double *)&input[i]); // load one complex double
v = _mm_mul_pd(v, vc); // scale it
_mm_storeu_pd((double *)&input[i], v); // store it
}
return input;
}
Note also that a decent compiler should auto-vectorize your original routine, in which case you won't see any benefit from hand-coded SSE.
Upvotes: 3