Reputation: 5280
For some reason, I need to replace the stp instructions in glibc's memcpy with pairs of str instructions. Here is what I did:
I modified sysdeps/aarch64/memcpy.S as follows:
@@ -102,11 +102,19 @@ ENTRY (MEMCPY)
tbz tmp1, 5, 1f
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
+ //stp B_l, B_h, [dstin, 16]
+ str B_l, [dstin, 16]
+ str B_h, [dstin, 24]
+ //stp C_l, C_h, [dstend, -32]
+ str C_l, [dstend, -32]
+ str C_h, [dstend, -24]
1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
+ //stp A_l, A_h, [dstin]
+ str A_l, [dstin]
+ str A_h, [dstin, 8]
+ //stp D_l, D_h, [dstend, -16]
+ str D_l, [dstend, -16]
+ str D_h, [dstend, -8]
ret
.p2align 4
@@ -150,12 +158,24 @@ L(copy96):
ldp D_l, D_h, [src, 48]
ldp E_l, E_h, [srcend, -32]
ldp F_l, F_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
+ //stp A_l, A_h, [dstin]
+ str A_l, [dstin]
+ str A_h, [dstin, 8]
+ //stp B_l, B_h, [dstin, 16]
+ str B_l, [dstin, 16]
+ str B_h, [dstin, 24]
+ //stp C_l, C_h, [dstin, 32]
+ str C_l, [dstin, 32]
+ str C_h, [dstin, 40]
+ //stp D_l, D_h, [dstin, 48]
+ str D_l, [dstin, 48]
+ str D_h, [dstin, 56]
+ //stp E_l, E_h, [dstend, -32]
+ str E_l, [dstend, -32]
+ str E_h, [dstend, -24]
+ //stp F_l, F_h, [dstend, -16]
+ str F_l, [dstend, -16]
+ str F_h, [dstend, -8]
ret
/* Align DST to 16 byte alignment so that we don't cross cache line
@@ -171,20 +191,31 @@ L(copy_long):
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
+ //stp D_l, D_h, [dstin]
+ str D_l, [dstin]
+ str D_h, [dstin, 8]
ldp B_l, B_h, [src, 32]
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(last64)
L(loop64):
- stp A_l, A_h, [dst, 16]
+ //stp A_l, A_h, [dst, 16]
+ str A_l, [dst, 16]
+ str A_h, [dst, 24]
ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
+ //stp B_l, B_h, [dst, 32]
+ str B_l, [dst, 32]
+ str B_h, [dst, 40]
ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
+ //stp C_l, C_h, [dst, 48]
+ str C_l, [dst, 48]
+ str C_h, [dst, 56]
ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
+ //stp D_l, D_h, [dst, 64]!
+ str D_l, [dst, 64]
+ str D_h, [dst, 72]
+ add dst, dst, 64
ldp D_l, D_h, [src, 64]!
subs count, count, 64
b.hi L(loop64)
@@ -194,17 +225,33 @@ L(loop64):
there is just 1 byte left. */
L(last64):
ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
+ //stp A_l, A_h, [dst, 16]
+ str A_l, [dst, 16]
+ str A_h, [dst, 24]
ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
+ //stp B_l, B_h, [dst, 32]
+ str B_l, [dst, 32]
+ str B_h, [dst, 40]
ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
+ //stp C_l, C_h, [dst, 48]
+ str C_l, [dst, 48]
+ str C_h, [dst, 56]
ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
+ //stp D_l, D_h, [dst, 64]
+ str D_l, [dst, 64]
+ str D_h, [dst, 72]
+ //stp E_l, E_h, [dstend, -64]
+ str E_l, [dstend, -64]
+ str E_h, [dstend, -56]
+ //stp A_l, A_h, [dstend, -48]
+ str A_l, [dstend, -48]
+ str A_h, [dstend, -40]
+ //stp B_l, B_h, [dstend, -32]
+ str B_l, [dstend, -32]
+ str B_h, [dstend, -24]
+ //stp C_l, C_h, [dstend, -16]
+ str C_l, [dstend, -16]
+ str C_h, [dstend, -8]
ret
.p2align 4
@@ -224,7 +271,9 @@ L(move_long):
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
+ //stp D_l, D_h, [dstend, -16]
+ str D_l, [dstend, -16]
+ str D_h, [dstend, -8]
ldp B_l, B_h, [srcend, -32]
ldp C_l, C_h, [srcend, -48]
ldp D_l, D_h, [srcend, -64]!
@@ -234,13 +283,22 @@ L(move_long):
nop
1:
- stp A_l, A_h, [dstend, -16]
+ //stp A_l, A_h, [dstend, -16]
+ str A_l, [dstend, -16]
+ str A_h, [dstend, -8]
ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
+ //stp B_l, B_h, [dstend, -32]
+ str B_l, [dstend, -32]
+ str B_h, [dstend, -24]
ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
+ //stp C_l, C_h, [dstend, -48]
+ str C_l, [dstend, -48]
+ str C_h, [dstend, -40]
ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
+ //stp D_l, D_h, [dstend, -64]!
+ str D_l, [dstend, -64]
+ str D_h, [dstend, -56]
+ sub dstend, dstend, 64
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi 1b
@@ -250,17 +308,33 @@ L(move_long):
there is just 1 byte left. */
2:
ldp G_l, G_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
+ //stp A_l, A_h, [dstend, -16]
+ str A_l, [dstend, -16]
+ str A_h, [dstend, -8]
ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
+ //stp B_l, B_h, [dstend, -32]
+ str B_l, [dstend, -32]
+ str B_h, [dstend, -24]
ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
+ //stp C_l, C_h, [dstend, -48]
+ str C_l, [dstend, -48]
+ str C_h, [dstend, -40]
ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp G_l, G_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
+ //stp D_l, D_h, [dstend, -64]
+ str D_l, [dstend, -64]
+ str D_h, [dstend, -56]
+ //stp G_l, G_h, [dstin, 48]
+ str G_l, [dstin, 48]
+ str G_h, [dstin, 56]
+ //stp A_l, A_h, [dstin, 32]
+ str A_l, [dstin, 32]
+ str A_h, [dstin, 40]
+ //stp B_l, B_h, [dstin, 16]
+ str B_l, [dstin, 16]
+ str B_h, [dstin, 24]
+ //stp C_l, C_h, [dstin]
+ str C_l, [dstin]
+ str C_h, [dstin, 8]
3: ret
END (MEMCPY)
(If you want to view the diff side by side, please visit https://www.diffchecker.com/qAgmBLFu)
glibc compiled successfully and the system rebooted without problems. But when I run glmark2, it crashes with a bus error:
Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1".
Core was generated by `./build/src/glmark2'.
Program terminated with signal SIGBUS, Bus error.
#0 __memcpy_generic () at ../sysdeps/aarch64/multiarch/../memcpy.S:195
warning: Source file is more recent than executable.
195 str D_l, [dstin]
[Current thread is 1 (Thread 0x7f80e77a70 (LWP 9281))]
(gdb) info registers
x0 0x7f782368ac 547476433068
x1 0x11744640 292832832
x2 0x3de0 15840
x3 0x7f782368a0 547476433056
x4 0x11748420 292848672
x5 0x7f7823a680 547476448896
x6 0x3da941653f800000 4443154410490560512
x7 0x3f8000003f066666 4575657222465807974
x8 0xbf666666 3211159142
x9 0xbf8000003e8ccccd -4647714814396937011
x10 0x3e99999abd1f1347 4510805391665206087
x11 0xbf800000 3212836864
x12 0x3f8000003f0ccccc 4575657222466227404
x13 0x3f0666663da94165 4541429863556923749
x14 0xc 12
x15 0x525521dd864a 90525593863754
x16 0x4d42e8 5063400
x17 0x7f808cc2c0 547617555136
x18 0xbf 191
x19 0x116911f0 292098544
x20 0xc 12
x21 0x7f54001930 546870139184
x22 0x7fe9aad628 549381133864
--Type <RET> for more, q to quit, c to continue without paging--
x23 0x7f7820c000 547476258816
x24 0x4d4000 5062656
x25 0x11719da0 292658592
x26 0x0 0
x27 0x0 0
x28 0x0 0
x29 0x7fe9aad540 549381133632
x30 0x476f40 4681536
sp 0x7fe9aad540 0x7fe9aad540
pc 0x7f808cc3f8 0x7f808cc3f8 <__memcpy_generic+296>
cpsr 0x20001000 [ EL=0 C ]
fpsr 0x11 17
fpcr 0x0 0
(gdb)
(gdb) bt
#0 __memcpy_generic () at ../sysdeps/aarch64/multiarch/../memcpy.S:195
#1 0x0000000000476f40 in std::__copy_move<false, true, std::random_access_iterator_tag>::__copy_m<float> (__result=<optimized out>, __last=<optimized out>,
__first=<optimized out>) at /usr/include/c++/8/bits/stl_iterator.h:783
#2 std::__copy_move_a<false, float*, float*> (__result=<optimized out>,
__last=<optimized out>, __first=<optimized out>)
at /usr/include/c++/8/bits/stl_algobase.h:386
#3 std::__copy_move_a2<false, float*, float*> (__result=<optimized out>,
__last=<optimized out>, __first=<optimized out>)
at /usr/include/c++/8/bits/stl_algobase.h:422
#4 std::copy<float*, float*> (__result=<optimized out>,
__last=<optimized out>, __first=<optimized out>)
at /usr/include/c++/8/bits/stl_algobase.h:455
#5 Mesh::update_single_vbo (this=0x7f54001930,
ranges=std::vector of length 10, capacity 16 = {...}, n=<optimized out>,
nfloats=<optimized out>) at ../src/mesh.cpp:469
#6 0x0000000000478308 in Mesh::update_vbo (this=0x7f54001930,
ranges=std::vector of length 10, capacity 16 = {...})
at /usr/include/c++/8/bits/stl_vector.h:930
#7 0x000000000041e094 in WaveMesh::update (elapsed=0.060067000000003645,
this=0x7f54001930) at ../src/scene-buffer.cpp:163
#8 SceneBuffer::update (this=<optimized out>) at ../src/scene-buffer.cpp:434
#9 0x0000000000416494 in MainLoop::draw (this=0x11717500)
So the bus error is triggered by the first str here:
L(copy_long):
and tmp1, dstin, 15
bic dst, dstin, 15
ldp D_l, D_h, [src]
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
//stp D_l, D_h, [dstin]
str D_l, [dstin] /* Oops, bus error! */
str D_h, [dstin, 8]
Question:
Here is the disassembly of memcpy:
(gdb) disassemble __memcpy_generic
Dump of assembler code for function __memcpy_generic:
0x0000007f9272d2d0 <+0>: prfm pldl1keep, [x1]
0x0000007f9272d2d4 <+4>: add x4, x1, x2
0x0000007f9272d2d8 <+8>: add x5, x0, x2
0x0000007f9272d2dc <+12>: cmp x2, #0x10
0x0000007f9272d2e0 <+16>: b.ls 0x7f9272d330 <__memcpy_generic+96> // b.plast
0x0000007f9272d2e4 <+20>: cmp x2, #0x60
0x0000007f9272d2e8 <+24>: b.hi 0x7f9272d3e0 <__memcpy_generic+272> // b.pmore
0x0000007f9272d2ec <+28>: sub x14, x2, #0x1
0x0000007f9272d2f0 <+32>: ldp x6, x7, [x1]
0x0000007f9272d2f4 <+36>: tbnz w14, #6, 0x7f9272d390 <__memcpy_generic+192>
0x0000007f9272d2f8 <+40>: ldp x12, x13, [x4, #-16]
0x0000007f9272d2fc <+44>: tbz w14, #5, 0x7f9272d318 <__memcpy_generic+72>
0x0000007f9272d300 <+48>: ldp x8, x9, [x1, #16]
0x0000007f9272d304 <+52>: ldp x10, x11, [x4, #-32]
0x0000007f9272d308 <+56>: str x8, [x0, #16]
0x0000007f9272d30c <+60>: str x9, [x0, #24]
0x0000007f9272d310 <+64>: stur x10, [x5, #-32]
0x0000007f9272d314 <+68>: stur x11, [x5, #-24]
0x0000007f9272d318 <+72>: str x6, [x0]
0x0000007f9272d31c <+76>: str x7, [x0, #8]
0x0000007f9272d320 <+80>: stur x12, [x5, #-16]
0x0000007f9272d324 <+84>: stur x13, [x5, #-8]
0x0000007f9272d328 <+88>: ret
0x0000007f9272d32c <+92>: nop
0x0000007f9272d330 <+96>: cmp x2, #0x8
0x0000007f9272d334 <+100>: b.cc 0x7f9272d350 <__memcpy_generic+128> // b.lo, b.ul, b.last
0x0000007f9272d338 <+104>: ldr x6, [x1]
0x0000007f9272d33c <+108>: ldur x7, [x4, #-8]
0x0000007f9272d340 <+112>: str x6, [x0]
0x0000007f9272d344 <+116>: stur x7, [x5, #-8]
0x0000007f9272d348 <+120>: ret
0x0000007f9272d34c <+124>: nop
0x0000007f9272d350 <+128>: tbz w2, #2, 0x7f9272d368 <__memcpy_generic+152>
0x0000007f9272d354 <+132>: ldr w6, [x1]
0x0000007f9272d358 <+136>: ldur w7, [x4, #-4]
0x0000007f9272d35c <+140>: str w6, [x0]
0x0000007f9272d360 <+144>: stur w7, [x5, #-4]
0x0000007f9272d364 <+148>: ret
0x0000007f9272d368 <+152>: cbz x2, 0x7f9272d388 <__memcpy_generic+184>
0x0000007f9272d36c <+156>: lsr x14, x2, #1
--Type <RET> for more, q to quit, c to continue without paging--
0x0000007f9272d370 <+160>: ldrb w6, [x1]
0x0000007f9272d374 <+164>: ldurb w7, [x4, #-1]
0x0000007f9272d378 <+168>: ldrb w8, [x1, x14]
0x0000007f9272d37c <+172>: strb w6, [x0]
0x0000007f9272d380 <+176>: strb w8, [x0, x14]
0x0000007f9272d384 <+180>: sturb w7, [x5, #-1]
0x0000007f9272d388 <+184>: ret
0x0000007f9272d38c <+188>: nop
0x0000007f9272d390 <+192>: ldp x8, x9, [x1, #16]
0x0000007f9272d394 <+196>: ldp x10, x11, [x1, #32]
0x0000007f9272d398 <+200>: ldp x12, x13, [x1, #48]
0x0000007f9272d39c <+204>: ldp x1, x2, [x4, #-32]
0x0000007f9272d3a0 <+208>: ldp x4, x3, [x4, #-16]
0x0000007f9272d3a4 <+212>: str x6, [x0]
0x0000007f9272d3a8 <+216>: str x7, [x0, #8]
0x0000007f9272d3ac <+220>: str x8, [x0, #16]
0x0000007f9272d3b0 <+224>: str x9, [x0, #24]
0x0000007f9272d3b4 <+228>: str x10, [x0, #32]
0x0000007f9272d3b8 <+232>: str x11, [x0, #40]
0x0000007f9272d3bc <+236>: str x12, [x0, #48]
0x0000007f9272d3c0 <+240>: str x13, [x0, #56]
0x0000007f9272d3c4 <+244>: stur x1, [x5, #-32]
0x0000007f9272d3c8 <+248>: stur x2, [x5, #-24]
0x0000007f9272d3cc <+252>: stur x4, [x5, #-16]
0x0000007f9272d3d0 <+256>: stur x3, [x5, #-8]
0x0000007f9272d3d4 <+260>: ret
0x0000007f9272d3d8 <+264>: nop
0x0000007f9272d3dc <+268>: nop
0x0000007f9272d3e0 <+272>: and x14, x0, #0xf
0x0000007f9272d3e4 <+276>: and x3, x0, #0xfffffffffffffff0
0x0000007f9272d3e8 <+280>: ldp x12, x13, [x1]
0x0000007f9272d3ec <+284>: sub x1, x1, x14
0x0000007f9272d3f0 <+288>: add x2, x2, x14
0x0000007f9272d3f4 <+292>: ldp x6, x7, [x1, #16]
=> 0x0000007f9272d3f8 <+296>: str x12, [x0]
0x0000007f9272d3fc <+300>: str x13, [x0, #8]
0x0000007f9272d400 <+304>: ldp x8, x9, [x1, #32]
0x0000007f9272d404 <+308>: ldp x10, x11, [x1, #48]
0x0000007f9272d408 <+312>: ldp x12, x13, [x1, #64]!
0x0000007f9272d40c <+316>: subs x2, x2, #0x90
0x0000007f9272d410 <+320>: b.ls 0x7f9272d450 <__memcpy_generic+384> // b.plast
--Type <RET> for more, q to quit, c to continue without paging--
0x0000007f9272d414 <+324>: str x6, [x3, #16]
0x0000007f9272d418 <+328>: str x7, [x3, #24]
0x0000007f9272d41c <+332>: ldp x6, x7, [x1, #16]
0x0000007f9272d420 <+336>: str x8, [x3, #32]
0x0000007f9272d424 <+340>: str x9, [x3, #40]
0x0000007f9272d428 <+344>: ldp x8, x9, [x1, #32]
0x0000007f9272d42c <+348>: str x10, [x3, #48]
0x0000007f9272d430 <+352>: str x11, [x3, #56]
0x0000007f9272d434 <+356>: ldp x10, x11, [x1, #48]
0x0000007f9272d438 <+360>: str x12, [x3, #64]
0x0000007f9272d43c <+364>: str x13, [x3, #72]
0x0000007f9272d440 <+368>: add x3, x3, #0x40
0x0000007f9272d444 <+372>: ldp x12, x13, [x1, #64]!
0x0000007f9272d448 <+376>: subs x2, x2, #0x40
0x0000007f9272d44c <+380>: b.hi 0x7f9272d414 <__memcpy_generic+324> // b.pmore
0x0000007f9272d450 <+384>: ldp x1, x2, [x4, #-64]
0x0000007f9272d454 <+388>: str x6, [x3, #16]
0x0000007f9272d458 <+392>: str x7, [x3, #24]
0x0000007f9272d45c <+396>: ldp x6, x7, [x4, #-48]
0x0000007f9272d460 <+400>: str x8, [x3, #32]
0x0000007f9272d464 <+404>: str x9, [x3, #40]
0x0000007f9272d468 <+408>: ldp x8, x9, [x4, #-32]
0x0000007f9272d46c <+412>: str x10, [x3, #48]
0x0000007f9272d470 <+416>: str x11, [x3, #56]
0x0000007f9272d474 <+420>: ldp x10, x11, [x4, #-16]
0x0000007f9272d478 <+424>: str x12, [x3, #64]
0x0000007f9272d47c <+428>: str x13, [x3, #72]
0x0000007f9272d480 <+432>: stur x1, [x5, #-64]
0x0000007f9272d484 <+436>: stur x2, [x5, #-56]
0x0000007f9272d488 <+440>: stur x6, [x5, #-48]
0x0000007f9272d48c <+444>: stur x7, [x5, #-40]
0x0000007f9272d490 <+448>: stur x8, [x5, #-32]
0x0000007f9272d494 <+452>: stur x9, [x5, #-24]
0x0000007f9272d498 <+456>: stur x10, [x5, #-16]
0x0000007f9272d49c <+460>: stur x11, [x5, #-8]
0x0000007f9272d4a0 <+464>: ret
0x0000007f9272d4a4 <+468>: nop
0x0000007f9272d4a8 <+472>: nop
0x0000007f9272d4ac <+476>: nop
0x0000007f9272d4b0 <+480>: cbz x14, 0x7f9272d580 <__memcpy_generic+688>
0x0000007f9272d4b4 <+484>: add x4, x1, x2
--Type <RET> for more, q to quit, c to continue without paging--
0x0000007f9272d4b8 <+488>: add x5, x0, x2
0x0000007f9272d4bc <+492>: and x14, x5, #0xf
0x0000007f9272d4c0 <+496>: ldp x12, x13, [x4, #-16]
0x0000007f9272d4c4 <+500>: sub x4, x4, x14
0x0000007f9272d4c8 <+504>: sub x2, x2, x14
0x0000007f9272d4cc <+508>: ldp x6, x7, [x4, #-16]
0x0000007f9272d4d0 <+512>: stur x12, [x5, #-16]
0x0000007f9272d4d4 <+516>: stur x13, [x5, #-8]
0x0000007f9272d4d8 <+520>: ldp x8, x9, [x4, #-32]
0x0000007f9272d4dc <+524>: ldp x10, x11, [x4, #-48]
0x0000007f9272d4e0 <+528>: ldp x12, x13, [x4, #-64]!
0x0000007f9272d4e4 <+532>: sub x5, x5, x14
0x0000007f9272d4e8 <+536>: subs x2, x2, #0x80
0x0000007f9272d4ec <+540>: b.ls 0x7f9272d530 <__memcpy_generic+608> // b.plast
0x0000007f9272d4f0 <+544>: nop
0x0000007f9272d4f4 <+548>: stur x6, [x5, #-16]
0x0000007f9272d4f8 <+552>: stur x7, [x5, #-8]
0x0000007f9272d4fc <+556>: ldp x6, x7, [x4, #-16]
0x0000007f9272d500 <+560>: stur x8, [x5, #-32]
0x0000007f9272d504 <+564>: stur x9, [x5, #-24]
0x0000007f9272d508 <+568>: ldp x8, x9, [x4, #-32]
0x0000007f9272d50c <+572>: stur x10, [x5, #-48]
0x0000007f9272d510 <+576>: stur x11, [x5, #-40]
0x0000007f9272d514 <+580>: ldp x10, x11, [x4, #-48]
0x0000007f9272d518 <+584>: stur x12, [x5, #-64]
0x0000007f9272d51c <+588>: stur x13, [x5, #-56]
0x0000007f9272d520 <+592>: sub x5, x5, #0x40
0x0000007f9272d524 <+596>: ldp x12, x13, [x4, #-64]!
0x0000007f9272d528 <+600>: subs x2, x2, #0x40
0x0000007f9272d52c <+604>: b.hi 0x7f9272d4f4 <__memcpy_generic+548> // b.pmore
0x0000007f9272d530 <+608>: ldp x2, x3, [x1, #48]
0x0000007f9272d534 <+612>: stur x6, [x5, #-16]
0x0000007f9272d538 <+616>: stur x7, [x5, #-8]
0x0000007f9272d53c <+620>: ldp x6, x7, [x1, #32]
0x0000007f9272d540 <+624>: stur x8, [x5, #-32]
0x0000007f9272d544 <+628>: stur x9, [x5, #-24]
0x0000007f9272d548 <+632>: ldp x8, x9, [x1, #16]
0x0000007f9272d54c <+636>: stur x10, [x5, #-48]
0x0000007f9272d550 <+640>: stur x11, [x5, #-40]
0x0000007f9272d554 <+644>: ldp x10, x11, [x1]
0x0000007f9272d558 <+648>: stur x12, [x5, #-64]
--Type <RET> for more, q to quit, c to continue without paging--
0x0000007f9272d55c <+652>: stur x13, [x5, #-56]
0x0000007f9272d560 <+656>: str x2, [x0, #48]
0x0000007f9272d564 <+660>: str x3, [x0, #56]
0x0000007f9272d568 <+664>: str x6, [x0, #32]
0x0000007f9272d56c <+668>: str x7, [x0, #40]
0x0000007f9272d570 <+672>: str x8, [x0, #16]
0x0000007f9272d574 <+676>: str x9, [x0, #24]
0x0000007f9272d578 <+680>: str x10, [x0]
0x0000007f9272d57c <+684>: str x11, [x0, #8]
0x0000007f9272d580 <+688>: ret
End of assembler dump.
The root cause seems to be that dst is not aligned to 8 bytes (the faulting str stores a 64-bit register to the address in x0), which triggers the bus error. In glmark2, dst is float *dest(dest_start + nfloats * iter->first);
where nfloats is 3, iter->first is 121 and dest_start is 0xf7ff070f000, so dest is at byte address dest_start + nfloats * iter->first * sizeof(float) = dest_start + 1452.
Since 1452 is 4 modulo 8, dest is only 4-byte aligned; it will never be aligned to 8.
Upvotes: 0
Views: 719