Reputation: 1014
Suppose I have code that executes around 30 ARM instructions followed by 20 NEON instructions. Will the NEON co-processor stall until the 30 ARM instructions have completed, because of the limited instruction queue? If so, is it better to interleave the ARM and NEON code? Note: the ARM code and the NEON code are independent of one another.
.
.
; Excerpt of a longer straight-line sequence (earlier instructions omitted).
; Part 1 (scalar ARM): computes word indices from r8 and r7, loads words from
; the two tables based at r5 and r6, and stores them into stack slots.
; Values loaded via r6 land 48 bytes above the matching r5 value.
; Part 2 (NEON): shifts q0-q7 right by 2, then forms sums and differences.
; No NEON instruction here reads a register written by the ARM part.
; Register aliases: sl = r10, fp = r11, ip = r12.
; Store the pair loaded before this excerpt begins.
str sl, [sp, #36]
str fp, [sp, #84]
; ---- pass A: r8 is advanced, then used as the index multiplier ----
add r8, r8, #1 ; 0x1
; index = (r8 << r7) - 1
lsl r9, r8, r7
sub r9, r9, #1 ; 0x1
; Load word [base + index*4] from each table.
ldr sl, [r5, r9, lsl #2]
ldr fp, [r6, r9, lsl #2]
str sl, [sp, #8]
str fp, [sp, #56]
; index = ((2*r8) << r7) - 1
lsl ip, r8, #1
lsl ip, ip, r7
sub ip, ip, #1 ; 0x1
ldr sl, [r5, ip, lsl #2]
ldr fp, [r6, ip, lsl #2]
str sl, [sp, #24]
str fp, [sp, #72]
; index = ((3*r8) << r7) - 1, wrapped to 0..511; r9 becomes the mask 511.
mov r9, #512 ; 0x200
lsl ip, r8, #1
add ip, ip, r8
lsl ip, ip, r7
sub ip, ip, #1 ; 0x1
; Flags are set here from the unwrapped index and consumed by the rsbge pair
; below; the intervening sub/and/ldr have no S suffix and leave them intact.
cmp ip, #512 ; 0x200
sub r9, r9, #1 ; 0x1
and ip, ip, r9
ldr sl, [r5, ip, lsl #2]
ldr fp, [r6, ip, lsl #2]
; Negate both loaded values when the unwrapped index was >= 512 (signed).
rsbge sl, sl, #0 ; 0x0
rsbge fp, fp, #0 ; 0x0
str sl, [sp, #40]
str fp, [sp, #88]
; ---- pass B: same three lookups for the next r8, stored 4 bytes higher ----
add r8, r8, #1 ; 0x1
; index = (r8 << r7) - 1
lsl r9, r8, r7
sub r9, r9, #1 ; 0x1
ldr sl, [r5, r9, lsl #2]
ldr fp, [r6, r9, lsl #2]
str sl, [sp, #12]
str fp, [sp, #60]
; index = ((2*r8) << r7) - 1
lsl ip, r8, #1
lsl ip, ip, r7
sub ip, ip, #1 ; 0x1
ldr sl, [r5, ip, lsl #2]
ldr fp, [r6, ip, lsl #2]
str sl, [sp, #28]
str fp, [sp, #76]
; index = ((3*r8) << r7) - 1, wrapped to 0..511; r9 becomes the mask 511.
mov r9, #512 ; 0x200
lsl ip, r8, #1
add ip, ip, r8
lsl ip, ip, r7
sub ip, ip, #1 ; 0x1
; Flags from this compare are consumed by the rsbge pair below.
cmp ip, #512 ; 0x200
sub r9, r9, #1 ; 0x1
and ip, ip, r9
ldr sl, [r5, ip, lsl #2]
ldr fp, [r6, ip, lsl #2]
; Negate both loaded values when the unwrapped index was >= 512 (signed).
rsbge sl, sl, #0 ; 0x0
rsbge fp, fp, #0 ; 0x0
str sl, [sp, #44]
str fp, [sp, #92]
add r8, r8, #1 ; 0x1
; ---- NEON part: operates only on q registers ----
; Arithmetic (signed) right shift by 2 of every 32-bit lane in q0-q7.
vshr.s32 q0, q0, #2
vshr.s32 q1, q1, #2
vshr.s32 q2, q2, #2
vshr.s32 q3, q3, #2
vshr.s32 q4, q4, #2
vshr.s32 q5, q5, #2
vshr.s32 q6, q6, #2
vshr.s32 q7, q7, #2
; Lane-wise 32-bit sums/differences of the even-numbered inputs (q0, q2, q4, q6).
; NOTE(review): the add/sub pattern resembles a butterfly stage - confirm
; against the full routine.
vadd.i32 q8, q0, q4
vadd.i32 q9, q2, q6
vsub.i32 q10, q0, q4
vsub.i32 q11, q2, q6
vadd.i32 q12, q8, q9
vsub.i32 q13, q8, q9
; Same for the odd-numbered inputs (q1, q3, q5, q7). q8/q9 are reused as
; temporaries, and q0/q1 are overwritten with the differences.
vadd.i32 q8, q1, q5
vsub.i32 q0, q1, q5
vadd.i32 q9, q3, q7
vsub.i32 q1, q3, q7
vsub.i32 q2, q8, q9
; Combines the even-part difference (q10) with the odd-part difference (q1).
vsub.i32 q4, q10, q1
Upvotes: 2
Views: 695
Reputation: 6354
You should interleave them.
You have more ARM instructions than NEON instructions, so the ARM (integer) side dominates the execution time of your code.
Therefore, if you interleave them, the NEON instructions are executed essentially for free, as long as they stay within the 6-entry boundary.
Please note that this kind of free lunch isn't available on weaker cores such as the Cortex-A7 (CA7).
Upvotes: 3