Reputation: 133
I'm wondering what about my intrinsics makes the SIMD version slower than the normal matrix multiplication, and what I should do to make multiplication of large matrices faster using SIMD. Here we have matrixA[8][8], matrixB[8][8] and the result matrixC[8][8]. Because a NEON vector holds at most 4 float32_t elements, I did 2 vmul and 2 vadd per row, which seems quite unoptimized. I work on an ARMv7-A Cortex-A8.
#include <arm_neon.h>

void matrix_mult_neon(void)
{
    int i;
    float32x4x2_t vectB1, vectB2, vectB3, vectB4, vectB5, vectB6, vectB7, vectB8;
    // load all eight rows of B up front
    vectB1 = vld2q_f32(matrixB[0]);
    vectB2 = vld2q_f32(matrixB[1]);
    vectB3 = vld2q_f32(matrixB[2]);
    vectB4 = vld2q_f32(matrixB[3]);
    vectB5 = vld2q_f32(matrixB[4]);
    vectB6 = vld2q_f32(matrixB[5]);
    vectB7 = vld2q_f32(matrixB[6]);
    vectB8 = vld2q_f32(matrixB[7]);
    float32x4x2_t vectT1, vectT2, vectT3, vectT4, vectT5, vectT6, vectT7, vectT8;
    for (i = 0; i < 8; i++)
    {
        // scale each row of B by the matching scalar from row i of A
        vectT1.val[0] = vmulq_n_f32(vectB1.val[0], matrixA[i][0]);
        vectT1.val[1] = vmulq_n_f32(vectB1.val[1], matrixA[i][0]);
        vectT2.val[0] = vmulq_n_f32(vectB2.val[0], matrixA[i][1]);
        vectT2.val[1] = vmulq_n_f32(vectB2.val[1], matrixA[i][1]);
        vectT3.val[0] = vmulq_n_f32(vectB3.val[0], matrixA[i][2]);
        vectT3.val[1] = vmulq_n_f32(vectB3.val[1], matrixA[i][2]);
        vectT4.val[0] = vmulq_n_f32(vectB4.val[0], matrixA[i][3]);
        vectT4.val[1] = vmulq_n_f32(vectB4.val[1], matrixA[i][3]);
        vectT5.val[0] = vmulq_n_f32(vectB5.val[0], matrixA[i][4]);
        vectT5.val[1] = vmulq_n_f32(vectB5.val[1], matrixA[i][4]);
        vectT6.val[0] = vmulq_n_f32(vectB6.val[0], matrixA[i][5]);
        vectT6.val[1] = vmulq_n_f32(vectB6.val[1], matrixA[i][5]);
        vectT7.val[0] = vmulq_n_f32(vectB7.val[0], matrixA[i][6]);
        vectT7.val[1] = vmulq_n_f32(vectB7.val[1], matrixA[i][6]);
        vectT8.val[0] = vmulq_n_f32(vectB8.val[0], matrixA[i][7]);
        vectT8.val[1] = vmulq_n_f32(vectB8.val[1], matrixA[i][7]);
        // accumulate the scaled rows into row i of C
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT2.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT3.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT4.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT5.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT6.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT7.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT8.val[0]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT2.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT3.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT4.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT5.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT6.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT7.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT8.val[1]);
        vst2q_f32(matrixC_neon[i], vectT1);
    }
}
My normal matrix multiplication function:
void matrix_mult(void)
{
    float tempProduct;
    int i, j, k;
    for (i = 0; i < 8; i++)
    {
        for (j = 0; j < 8; j++)
        {
            tempProduct = 0;
            for (k = 0; k < 8; k++)
            {
                tempProduct = tempProduct + matrixA[i][k] * matrixB[k][j];
            }
            matrixC[i][j] = tempProduct;
        }
    }
}
I use the gettimeofday() function from <sys/time.h> to measure the elapsed time (microsecond resolution).
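Roughly like this (just a sketch; the 100000 iteration count is something I picked so the interval is measurable, since a single 8x8 multiply finishes well under one microsecond):

#include <stdio.h>
#include <sys/time.h>

// Timing sketch: run the kernel many times and average,
// because gettimeofday() only resolves microseconds.
void benchmark(void)
{
    struct timeval t0, t1;
    int n;
    gettimeofday(&t0, NULL);
    for (n = 0; n < 100000; n++)
        matrix_mult();      // or matrix_mult_neon()
    gettimeofday(&t1, NULL);
    long us = (t1.tv_sec - t0.tv_sec) * 1000000L + (t1.tv_usec - t0.tv_usec);
    printf("%.3f us per call\n", us / 100000.0);
}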
Upvotes: 3
Views: 2423
Reputation: 6354
The Problem:
aarch32 has a NEON register bank of 256 bytes total, shared with VFP. And VFP on the Cortex-A8 in particular is unbelievably slow, in addition to the NEON<->VFP switching overhead. Unlike auto-vectorization, intrinsics do pretty much everything the way you tell them to, and you gave them the wrong instruction: vmulq_n_f32 takes a plain float scalar, so the compiler fetches each matrixA element through VFP and moves it over to NEON, once per multiply.
The Solution:
We transpose matrix B and do the dot-product math line by line.
I hope the code below works for you. If performance is crucial, consider writing it in assembly, since compilers aren't very trustworthy when it comes to NEON performance, even with intrinsics.
#include <arm_neon.h>
#include <stdint.h>

static __always_inline float32x2_t dotProduct(float32x4x2_t input1, float32x4x2_t input2)
{
    float32x2_t d0, d1;
    float32x4_t q0;
    // element-wise products of both four-lane halves
    input1.val[0] = vmulq_f32(input1.val[0], input2.val[0]);
    input1.val[1] = vmulq_f32(input1.val[1], input2.val[1]);
    q0 = vaddq_f32(input1.val[0], input1.val[1]);
    // horizontal add: both lanes of d0 end up holding the full eight-lane sum
    d0 = vget_low_f32(q0);
    d1 = vget_high_f32(q0);
    d0 = vpadd_f32(d0, d1);
    d0 = vpadd_f32(d0, d0);
    return d0;
}
void matMulF_neon(float *pDst, float *pMatA, float *pMatB)
{
    float32x4x4_t line01, line23, line45, line67;
    float32x4x2_t b[8], *pA, *pB, temp, cols;
    float32x2x4_t result;
    uint32_t i;
    // vld4 for easier transpose; each vld4q_f32 reads 16 floats (two rows)
    line01 = vld4q_f32(pMatB); pMatB += 16;
    line23 = vld4q_f32(pMatB); pMatB += 16;
    line45 = vld4q_f32(pMatB); pMatB += 16;
    line67 = vld4q_f32(pMatB);
    // transpose MatB: vuzpq_f32 returns its results (it does not modify its
    // arguments in place); unzipping the row-pair registers yields the top and
    // bottom halves of columns i and i+4
    for (i = 0; i < 4; i++)
    {
        cols = vuzpq_f32(line01.val[i], line23.val[i]);   // columns i, i+4: rows 0-3
        b[i].val[0] = cols.val[0];
        b[i + 4].val[0] = cols.val[1];
        cols = vuzpq_f32(line45.val[i], line67.val[i]);   // columns i, i+4: rows 4-7
        b[i].val[1] = cols.val[0];
        b[i + 4].val[1] = cols.val[1];
    }
    // rows of A load contiguously: val[0] = A[i][0..3], val[1] = A[i][4..7]
    pA = (float32x4x2_t *) pMatA;
    i = 8;
    do
    {
        // one row of A against all eight columns of B:
        // just the right amount of data for the aarch32 NEON register bank size
        pB = b;
        temp = *pA++;
        result.val[0] = dotProduct(*pB++, temp);
        result.val[1] = dotProduct(*pB++, temp);
        result.val[2] = dotProduct(*pB++, temp);
        result.val[3] = dotProduct(*pB++, temp);
        vst4_lane_f32(pDst, result, 0);   // writes lane 0 of all four vectors (4 floats)
        pDst += 4;
        result.val[0] = dotProduct(*pB++, temp);
        result.val[1] = dotProduct(*pB++, temp);
        result.val[2] = dotProduct(*pB++, temp);
        result.val[3] = dotProduct(*pB, temp);
        vst4_lane_f32(pDst, result, 0);
        pDst += 4;
    } while (--i);
}
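To sanity-check it against your scalar version, something like this should do (a sketch; matrixD is a scratch output I made up, and matrixA/matrixB/matrixC are your globals):

#include <math.h>
#include <stdio.h>

float matrixD[8][8];    // scratch output for the NEON version (my addition)

void checkResults(void)
{
    int i, j;
    matrix_mult();      // scalar reference fills matrixC
    matMulF_neon(&matrixD[0][0], &matrixA[0][0], &matrixB[0][0]);
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            if (fabsf(matrixC[i][j] - matrixD[i][j]) > 1e-4f)
                printf("mismatch at [%d][%d]: %f vs %f\n",
                       i, j, matrixC[i][j], matrixD[i][j]);
}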
/////////////////////////// EDIT
I checked the disassembly and the generated code is FUBAR. (Linaro GCC 7.1.1)
I'd go the assembly route. Writing NEON code with intrinsics is a pure waste of time, IMO.
Upvotes: 5