vikasmk
vikasmk

Reputation: 1014

Instruction scheduling in ARM NEON

Suppose I have code that executes around 30 ARM instructions followed by 20 NEON instructions. Will the NEON co-processor stall until the 30 ARM instructions have completed, because of its limited instruction queue? If so, is it better to interleave the ARM and NEON code? Note that the ARM code and the NEON code are independent of one another.

.
.
; Fragment of a longer listing; the instructions before this point are elided in the post.
; Register aliases: sl = r10, fp = r11, ip = r12.
; Part 1 (ARM integer): for successive values of r8, compute three indices from r8 and r7,
; load one word per index from the table at r5 and from the table at r6, and store each
; loaded pair to fixed stack slots.
; Part 2 (NEON): shifts and adds/subtracts on q0-q13; it reads none of the registers
; written by part 1.
; NOTE(review): looks like table lookups feeding a butterfly stage (e.g. an FFT) - the post does not say; confirm.
; Store the sl/fp values that were loaded before the visible part of the listing.
str sl, [sp, #36]
str fp, [sp, #84]
; ---- r8 advanced by one: first visible full group of three lookups ----
add r8, r8, #1  ; 0x1
; Lookup A: index = (r8 << r7) - 1; "lsl #2" in the address scales the index to 4-byte entries.
lsl r9, r8, r7
sub r9, r9, #1  ; 0x1
ldr sl, [r5, r9, lsl #2]
ldr fp, [r6, r9, lsl #2]
str sl, [sp, #8]
str fp, [sp, #56]
; Lookup B: index = ((2 * r8) << r7) - 1.
lsl ip, r8, #1
lsl ip, ip, r7
sub ip, ip, #1  ; 0x1
ldr sl, [r5, ip, lsl #2]
ldr fp, [r6, ip, lsl #2]
str sl, [sp, #24]
str fp, [sp, #72]
; Lookup C: index = ((3 * r8) << r7) - 1, wrapped into 0..511; both loaded values are
; negated when the unwrapped index is >= 512 (signed compare).
mov r9, #512    ; 0x200
lsl ip, r8, #1
add ip, ip, r8
lsl ip, ip, r7
sub ip, ip, #1  ; 0x1
; Flags set here are consumed by the rsbge pair below; the sub/and/ldr in between do not set flags.
cmp ip, #512    ; 0x200
; r9 = 511, used as the wrap mask.
sub r9, r9, #1  ; 0x1
and ip, ip, r9
ldr sl, [r5, ip, lsl #2]
ldr fp, [r6, ip, lsl #2]
rsbge   sl, sl, #0  ; 0x0
rsbge   fp, fp, #0  ; 0x0
str sl, [sp, #40]
str fp, [sp, #88]
; ---- r8 advanced by one: same three lookups, stored to the next stack slots ----
add r8, r8, #1  ; 0x1
; Lookup A: index = (r8 << r7) - 1.
lsl r9, r8, r7
sub r9, r9, #1  ; 0x1
ldr sl, [r5, r9, lsl #2]
ldr fp, [r6, r9, lsl #2]
str sl, [sp, #12]
str fp, [sp, #60]
; Lookup B: index = ((2 * r8) << r7) - 1.
lsl ip, r8, #1
lsl ip, ip, r7
sub ip, ip, #1  ; 0x1
ldr sl, [r5, ip, lsl #2]
ldr fp, [r6, ip, lsl #2]
str sl, [sp, #28]
str fp, [sp, #76]
; Lookup C: index = ((3 * r8) << r7) - 1, wrapped into 0..511, values negated when unwrapped index >= 512.
mov r9, #512    ; 0x200
lsl ip, r8, #1
add ip, ip, r8
lsl ip, ip, r7
sub ip, ip, #1  ; 0x1
; Flags set here are consumed by the rsbge pair below.
cmp ip, #512    ; 0x200
sub r9, r9, #1  ; 0x1
and ip, ip, r9
ldr sl, [r5, ip, lsl #2]
ldr fp, [r6, ip, lsl #2]
rsbge   sl, sl, #0  ; 0x0
rsbge   fp, fp, #0  ; 0x0
str sl, [sp, #44]
str fp, [sp, #92]
add r8, r8, #1  ; 0x1
; ---- NEON part: touches only q registers, none of the core registers used above ----
; Arithmetic shift right by 2 on every signed 32-bit lane of q0-q7.
vshr.s32    q0, q0, #2
vshr.s32    q1, q1, #2
vshr.s32    q2, q2, #2
vshr.s32    q3, q3, #2
vshr.s32    q4, q4, #2
vshr.s32    q5, q5, #2
vshr.s32    q6, q6, #2
vshr.s32    q7, q7, #2
; Lane-wise sums and differences of the even-numbered inputs (q0, q2, q4, q6).
vadd.i32    q8, q0, q4
vadd.i32    q9, q2, q6
vsub.i32    q10, q0, q4
; q11 is not read again within the visible listing.
vsub.i32    q11, q2, q6
vadd.i32    q12, q8, q9
vsub.i32    q13, q8, q9
; Same pattern for the odd-numbered inputs (q1, q3, q5, q7); q8, q9, q0 and q1 are reused as temporaries.
vadd.i32    q8, q1, q5
vsub.i32    q0, q1, q5
vadd.i32    q9, q3, q7
vsub.i32    q1, q3, q7
vsub.i32    q2, q8, q9
vsub.i32    q4, q10, q1

Upvotes: 2

Views: 695

Answers (1)

You should interleave them.

You have more ARM instructions than NEON instructions, so the ARM (integer) side dominates the execution time of your code.

Therefore, if you interleave them, the NEON instructions get executed for FREE, as long as they stay within the 6-entry boundary of the instruction queue.

Please note that this kind of free lunch isn't available on weaker cores such as the Cortex-A7 (CA7).

Upvotes: 3

Related Questions