Assembly call to printf - different behavior on WSL and standalone Ubuntu 18.04 64bit

I am currently in the process of building a compiler as part of a bachelor project. A hard requirement is that it should be able to run on Linux as a 64 bit binary.

Amongst other functionality, the language we are implementing offers the ability to write to stdout. As part of building a compiler, we generate code into assembly, and we are allowed to call printf from assembly to print to stdout.

However, we are currently seeing unexpected behavior with printf. The compiled program behaves differently depending on whether we run it on WSL or on an actual, physical Linux installation: in many cases it crashes with a segmentation fault on WSL, but prints the correct output and exits without error on a physical installation.

The reason for asking this question is that, after days of debugging, we cannot find any issue with how we generate our code. We are thus left wondering whether WSL and a native Linux installation differ in how they handle calls to C library functions, or whether we have to go back to our code generation and have another go.

The reason we believe it is printf causing the issues is that valgrind indicates that the program terminates when calling printf, and it also seems to behave weirdly or cause issues on WSL in certain situations.

We use AT&T syntax, not that it matters particularly.

EDIT:

Example (written in "kitty", the language we are implementing):

var x: int;
var b: record of {c: int};
func factorial(n: int): int
    var a:int;
    var c:int;
    var d:int;
    var e:int;
    var f:int;
    var g:int;
    var h:int;
    var i:int;
    var j:int;
    var k:int;
    var l:int;
    var m:int;

   b.c = b.c + 1;
   m = n;
   if (m == 0) || (m == 1) then{
      return 1;
      }
   else{
      return m * factorial(m-1);
      }
end factorial

allocate b;
b.c = 5;
write factorial(6);
write b.c;

Records are similar to C structs. The reason for all the variable declarations is to force the code generator to use the stack.

Assembly output:

# Writable data section. Neither label below is followed by a storage
# directive (.quad, .space, ...), so this listing reserves no bytes for them
# and both labels resolve to the same address.
# NOTE(review): main stores 8 bytes through each of these labels - confirm
# that storage (e.g. one .quad per label) was meant to be emitted here.
.section .data 
    heap: 
    next: 

# Read-only data: the format string passed to printf in main.
.section .rodata 
    form:    .asciz "%d\n"

# Make the symbol main visible to the linker.
# NOTE(review): no .text directive appears between this point and the
# instructions that follow in this listing, so they are assembled into
# .rodata - confirm whether a ".section .text" line is missing.
.globl main 

# Function definition START
#
# L1: code generated for factorial(n) in the kitty example.
#
# Calling protocol, as performed at both call sites in this listing:
#   push the argument; push %rdi, %r8, %r9, ..., %r15; push one frame
#   pointer (presumably the static link - TODO confirm); add 1 to %rdx
#   (the frame level); call L1.
# Frame layout after the prologue below:
#    96(%rbp)            argument n
#    88(%rbp)            caller's %rdi
#    80(%rbp)            caller's %r8
#    72(%rbp)..24(%rbp)  caller's %r9..%r15
#    16(%rbp)            frame pointer pushed by the caller
#     8(%rbp)            return address
#     0(%rbp)            caller's %rbp
#    -8(%rbp)            saved %rbx
#   -16(%rbp)..-120(%rbp) temporaries (the 112 bytes reserved by subq)
# The result is left in %rax; %rdx is decremented by 1 just before ret.

L1:
push %rbp
movq %rsp, %rbp
push %rbx
subq $112, %rsp

# Pick the frame used to reach the record pointer:
# %rsi = this frame's %rbp when %rdx == 1, otherwise the pointer at 16(%rbp).
L2:
cmp $1, %rdx
je L4
movq 16(%rbp), %rsi
jmp L3

L4:
movq %rbp, %rsi

# %rcx = (value at 80(%rsi)) + 0*8. 80(%rsi) is the slot in which that
# frame's caller pushed %r8. %rdx is used as scratch and restored.
L3:
movq $0, %rbx
push %rdx
movq 80(%rsi), %rdx
leaq (%rdx, %rbx, 8), %rcx
pop %rdx

# L5-L7 and the start of L6 repeat the L2-L3 sequence instruction for
# instruction, recomputing the same %rsi and %rcx.
L5:
cmp $1, %rdx
je L7
movq 16(%rbp), %rsi
jmp L6

L7:
movq %rbp, %rsi

L6:
movq $0, %rbx
push %rdx
movq 80(%rsi), %rdx
leaq (%rdx, %rbx, 8), %rcx
pop %rdx
# Increment the quadword at (%rcx) by 1, going through temporaries:
# -40(%rbp) = 1, -48(%rbp) = old value + 1, then store back to (%rcx).
movq $1, -40(%rbp)
movq (%rcx), %rbx
movq %rbx, -48(%rbp)
movq -40(%rbp), %rbx
addq %rbx, -48(%rbp)
movq -48(%rbp), %rbx
movq %rbx, (%rcx)
# -72(%rbp) = 1 if n == 1, else 0.
movq $1, -64(%rbp)
movq 96(%rbp), %rbx
cmp %rbx, -64(%rbp)
je L10
movq $0, -72(%rbp)
jmp L11

L10:
movq $1, -72(%rbp)

# -88(%rbp) = 1 if n == 0, else 0.
L11:
movq $0, -80(%rbp)
movq 96(%rbp), %rbx
cmp %rbx, -80(%rbp)
je L12
movq $0, -88(%rbp)
jmp L13

L12:
movq $1, -88(%rbp)

# -56(%rbp) = 1 if either of the two comparison results is non-zero, else 0.
L13:
cmp $0, -88(%rbp)
jne L8
cmp $0, -72(%rbp)
jne L8
movq $0, -56(%rbp)
jmp L9

L8:
movq $1, -56(%rbp)

# Condition true: return 1. Condition false: fall to the recursive case at L14.
L9:
cmp $0, -56(%rbp)
je L14
movq $1, -96(%rbp)
movq -96(%rbp), %rax
jmp L15

# Recursive case: -112(%rbp) = n - 1, pushed as the argument of the call.
L14:
movq $1, -104(%rbp)
movq 96(%rbp), %rbx
movq %rbx, -112(%rbp)
movq -104(%rbp), %rbx
subq %rbx, -112(%rbp)
push -112(%rbp)
# Save %rdi and %r8..%r15 across the call.
push %rdi
push %r8
push %r9
push %r10
push %r11
push %r12
push %r13
push %r14
push %r15
# Frame pointer handed to the callee: forward the one received at 16(%rbp)
# when %rdx > 1, otherwise pass this frame's %rbp.
cmp $1, %rdx
jle L16
push 16(%rbp)
jmp L17

L16:
push %rbp

L17:
addq $1, %rdx
call L1
# Drop the frame-pointer slot, restore the saved registers, drop the argument.
addq $8, %rsp
pop %r15
pop %r14
pop %r13
pop %r12
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
addq $8, %rsp
# Return value = n * (result of the recursive call), via -120(%rbp).
movq 96(%rbp), %rbx
imulq %rax, %rbx
movq %rbx, -120(%rbp)
movq -120(%rbp), %rax

# Epilogue: release the temporaries, restore %rbx and %rbp, and undo the
# caller's increment of %rdx.
L15:
addq $112, %rsp
pop %rbx
movq %rbp, %rsp
pop %rbp
subq $1, %rdx
ret

# Function definition END
# Entry point: allocates the heap area, stores 5 through %r8, calls L1 with
# the argument 6 and prints the result, then prints the quadword at (%r8).
main: 

# Program prologue.
# NOTE(review): assuming main is entered as the x86-64 System V ABI
# specifies (%rsp + 8 is a multiple of 16), %rsp is 8 modulo 16 after these
# two pushes. Every later stack adjustment in main before "call malloc" and
# the two "call printf" instructions is balanced or a multiple of 16 bytes,
# so %rsp is not 16-byte aligned at any of those calls.
push %rbp
movq %rsp, %rbp
push %rbx

# Allocating heap.
movq $16384, %rdi
call malloc

# Assigning "heap" and "next" to start of heap.
movq %rax, heap
movq %rax, next

# Stack frame level. Starts at 0.
movq $0, %rdx
# %r8 = address of the label next (immediate operand), not the value stored
# at next.
# NOTE(review): the store of 5 below therefore writes to the location of
# next itself - confirm that "movq next, %r8" was not intended.
movq $next, %r8
# Advance the value stored at next by 1 * 8 bytes.
movq $1, %rcx
imulq $8, %rcx
addq %rcx, next
# %rcx = %r8 + 0*8; store 5 there.
movq $0, %rbx
leaq (%r8, %rbx, 8), %rcx
movq $5, %r9
movq %r9, (%rcx)
# Call L1 with the argument 6: push the argument, save %rdi and %r8..%r15,
# push a frame pointer, bump the frame level in %rdx, call.
movq $6, %r10
push %r10
push %rdi
push %r8
push %r9
push %r10
push %r11
push %r12
push %r13
push %r14
push %r15
# %rdx is 0 here, so the jle is taken and main's own %rbp is pushed.
cmp $1, %rdx
jle L18
push 16(%rbp)
jmp L19

L18:
push %rbp

L19:
addq $1, %rdx
call L1
# Drop the frame-pointer slot, restore the saved registers, drop the argument.
addq $8, %rsp
pop %r15
pop %r14
pop %r13
pop %r12
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
addq $8, %rsp
# First printf: save twelve registers (96 bytes), then
# printf(form, <value returned by L1 in %rax>).
push %rdi
push %r8
push %r9
push %r10
push %r11
push %r12
push %r13
push %r14
push %r15
push %rsi
push %rdx
push %rcx
leaq form(%rip), %rdi
movq %rax, %rsi
# Zero %rax: in the x86-64 System V convention %al carries the number of
# vector registers used by a variadic call.
xor %rax, %rax
call printf
pop %rcx
pop %rdx
pop %rsi
pop %r15
pop %r14
pop %r13
pop %r12
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
# Second printf: %rcx = %r8 + 0*8, then printf(form, <quadword at (%rcx)>),
# with the same save/restore sequence as the first call.
movq $0, %rbx
leaq (%r8, %rbx, 8), %rcx
push %rdi
push %r8
push %r9
push %r10
push %r11
push %r12
push %r13
push %r14
push %r15
push %rsi
push %rdx
push %rcx
leaq form(%rip), %rdi
movq (%rcx), %rsi
xor %rax, %rax
call printf
pop %rcx
pop %rdx
pop %rsi
pop %r15
pop %r14
pop %r13
pop %r12
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi

# Program epilogue.
# Restore %rbx and %rbp, and return 0 from main.
pop %rbx
movq %rbp, %rsp
pop %rbp
movq $0, %rax
ret

The expected result from this code should be that it prints 720 (from factorial(6)) and 11 (b.c is assigned to 5 and then incremented 6 times).

On WSL this gives a segmentation fault, while on a physical distro it prints the correct values and terminates gracefully.

We use gcc -m64 -no-pie "filename" to compile this assembly output.

When running GDB on WSL with the executable, the debugger returns the following:

Starting program: 

Program received signal SIGSEGV, Segmentation fault.
__GI___tcgetattr (fd=1, termios_p=termios_p@entry=0x7ffffffed528) at ../sysdeps/unix/sysv/linux/tcgetattr.c:42
42      ../sysdeps/unix/sysv/linux/tcgetattr.c: No such file or directory.

Using x/i $rip within GDB returns the following:

0x7fffff115cc1 <__GI___tcgetattr+65>:        movdqa (%rsp),%xmm0

And info registers returns:

rax            0x0      0
rbx            0xffffffffffffff80       -128
rcx            0x0      0
rdx            0x0      0
rsi            0xff     255
rdi            0x1      1
rbp            0x7fffff3ec760   0x7fffff3ec760 <_IO_2_1_stdout_>
rsp            0x7ffffffed4d8   0x7ffffffed4d8
r8             0x7ffffffed518   140737488278808
r9             0x0      0
r10            0x0      0

Upvotes: 4

Views: 435

Answers (1)

fransie
fransie

Reputation: 147

I had the same problem and the answer described by Michael Petch and Jester in the discussion was really helpful so I'll summarise it here for reference.

Intel Instruction Set Reference Vol. 2 about the instruction movdqa: "When the source or destination operand is a memory operand, the operand must be aligned on a 16-byte boundary or a general-protection exception (#GP) will be generated."

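The memory operand in movdqa (%rsp),%xmm0 is the top of the stack, and rsp holds the address 0x7ffffffed4d8. This address is not divisible by 16 (it ends in 8). Hence, a general-protection exception is raised because the stack is not aligned on a 16-byte boundary, and the kernel reports that exception to the process as SIGSEGV. The underlying rule comes from the x86-64 System V ABI: rsp must be 16-byte aligned immediately before every call instruction, and glibc is compiled on the assumption that this holds. The generated code therefore has to keep the stack 16-byte aligned at the point where it calls printf (for example, by pushing an even number of 8-byte values after the prologue, or by adjusting rsp before the call and restoring it afterwards).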

Upvotes: 2

Related Questions