Nathan Ringo
Nathan Ringo

Reputation: 1003

Page faults with reads of higher half addresses in 64-bit kernel

I'm writing a 64-bit higher half kernel with Rust and NASM assembler. I'm using a Multiboot2 (GRUB2) compatible bootloader to initially get my kernel loaded. I'm experiencing a Page Fault Error (0x0e exception) when my kernel is run in QEMU and I don't understand why. The issue I am getting is appearing in my assembly code before it reaches the code written in Rust.

I'm setting up the paging so that memory looks like:

0000000000000000: 0000000000000000 --PDA---W
0000000000200000: 0000000000200000 --P-----W
ffffff0000000000: 0000000000000000 --P-----W
ffffff7f80000000: 0000000000000000 X-P------

(this is both my intention, and the result of info mem from QEMU)

The tables look like:

p4: # pml4
    0o000 <- p3_low | PRESENT | WRITABLE
    0o776 <- p3_hgh | PRESENT | WRITABLE
p3_low: # pdpte
    0o000 <- p2_low | PRESENT | WRITABLE
p3_hgh: # pdpte
    0o000 <- p2_krn | PRESENT | WRITABLE
    0o776 <- p2_mbi | PRESENT | WRITABLE
p2_low: # pde
    0o000 <- 0o000000_000_000_000_000_0000 | PRESENT | WRITABLE | PAGESIZE
    0o001 <- 0o000000_000_000_001_000_0000 | PRESENT | WRITABLE | PAGESIZE
p2_krn: # pde
    0o000 <- 0o000000_000_000_000_000_0000 | PRESENT | WRITABLE | PAGESIZE
p2_mbi: # pde
    0o000 <- 0o000000_000_000_000_000_0000 | PRESENT | PAGESIZE | NOEXEC

Everything else is zeroed out.


The relevant code from my project is in these files:

macros64.asm:

; pte_write table, index, address, flags
;
; Stores (flags | address) into the 8-byte entry number `index` of `table`.
;   %1 = label of the table to write into
;   %2 = entry index (constant or register); scaled by 8 in the store
;   %3 = address to put in the entry (label, constant or register)
;   %4 = flag bits to OR in
; Clobbers rax and the flags register.
%macro pte_write 4
    ; The flags are loaded with mov, which accepts a full 64-bit immediate;
    ; callers pass flag sets that include bit 63 (p_noexec).
    mov rax, %4
    or rax, %3
    mov qword [%1+8*%2], rax
%endmacro

paging64.asm:

; Symbols defined outside this file. enable_paging uses them as the lower and
; upper bounds of its kernel mapping loop.
extern kernel_start
extern kernel_end

; Flag bits that are OR'ed into page-table entries via pte_write.
p_present  equ (1<<0)
p_writable equ (1<<1)
p_user     equ (1<<2)
p_pagesize equ (1<<7)
; Bit 63 does not fit a sign-extended 32-bit immediate, so it has to be
; loaded with a 64-bit mov (pte_write does this).
p_noexec   equ (1<<63)

[section .text]

;; Builds the boot page tables in .bss and loads p4 into CR3.
;;
;; In:    rdi = address of the multiboot2 info structure.
;; Out:   CR3 = p4; nothing is returned in a register.
;; Clobb: rax, rcx, rsi, r9, r10, xmm1, flags.
;;
;; Register roles:
;;   r9  = start of the multiboot2 info structure, rounded down to 4 KiB
;;   r10 = first 4 KiB boundary strictly above (start + dword at [start])
;;   rcx = clear-loop pointer, later the entry index inside p2_krn / p2_mbi
;;   rsi = address written into the current 2 MiB entry
enable_paging:
    ; Calculate start and end address of the multiboot2 info structure.
    ; The dword at the start of the structure is added to its address to get
    ; the end (presumably the multiboot2 total_size field -- confirm).
    mov r9, rdi
    mov r10, r9
    add r10d, dword [r9]
    and r9, 0xfffffffffffff000
    shr r10, 12
    inc r10
    shl r10, 12
    ; Clear out all the page tables, 16 bytes at a time. Both `blank` and
    ; page_tables_start are declared with at least 16-byte alignment, which
    ; the aligned movaps accesses require.
    movaps xmm1, [blank]
    mov rcx, page_tables_start
.clear_page_tables_loop:
    movaps [rcx], xmm1
    add rcx, 16
    cmp rcx, page_tables_end
    ; Addresses are unsigned quantities, so use the unsigned condition code.
    jb .clear_page_tables_loop
    ; TODO Uncomment the recursive page mappings once things actually work -- for now, they just make "info tlb" in QEMU annoying to read.
    ; Fill out the P4 table.
    pte_write p4, 0o000, p3_low, p_present | p_writable
    pte_write p4, 0o776, p3_hgh, p_present | p_writable
;   pte_write p4, 0o777, p4,     p_present | p_writable | p_noexec
    ; Fill out the P3 tables.
    pte_write p3_low, 0o000, p2_low, p_present | p_writable
;   pte_write p3_low, 0o777, p3_low, p_present | p_writable | p_noexec
    pte_write p3_hgh, 0o000, p2_krn, p_present | p_writable
    pte_write p3_hgh, 0o776, p2_mbi, p_present | p_writable
;   pte_write p3_hgh, 0o777, p3_hgh, p_present | p_writable | p_noexec
    ; Identity map the lowest 2MiB.
    ; Two entries are written here: 2 MiB pages at address 0 and at 2 MiB.
    pte_write p2_low, 0o000, 0o000000_000_000_000_000_0000, p_present | p_writable | p_pagesize
    pte_write p2_low, 0o001, 0o000000_000_000_001_000_0000, p_present | p_writable | p_pagesize
;   pte_write p2_low, 0o777, p2_low, p_present | p_writable | p_noexec
    ; Map the kernel: one 2 MiB entry per iteration until rsi reaches
    ; kernel_end.
    ; NOTE(review): rsi is written into the entry verbatim. An entry with
    ; p_pagesize set needs the address bits below 2 MiB to be zero, so this
    ; relies on kernel_start being 2 MiB aligned -- confirm.
    xor rcx, rcx
    mov rsi, kernel_start
.kernel_loop:
    pte_write p2_krn, rcx, rsi, p_present | p_writable | p_pagesize
    inc rcx
    add rsi, 0o000000_000_000_001_000_0000
    cmp rsi, kernel_end
    jb .kernel_loop
    ; Map the multiboot2 information structure, read-only and no-execute.
    ; NOTE(review): r9 was only rounded down to 4 KiB above, so bits 12-20 of
    ; rsi may be non-zero here; the same 2 MiB alignment concern applies.
    xor rcx, rcx
    mov rsi, r9
.mbi_loop:
    pte_write p2_mbi, rcx, rsi, p_present | p_pagesize | p_noexec
    inc rcx
    add rsi, 0o000000_000_000_001_000_0000
    cmp rsi, r10
    jb .mbi_loop
    ; Load the new page table. We don't need to flush the TLB because we moved into CR3.
    mov rax, p4
    mov cr3, rax
    ; Return.
    ret

[section .data]
; 16 zero bytes on a 16-byte boundary; enable_paging loads them with movaps
; and uses them to zero the page tables.
align 0x10
blank: times 0x10 db 0x00

[section .bss]

; Six 4096-byte tables starting on a 4096-byte boundary. enable_paging zeroes
; everything between page_tables_start and page_tables_end before filling
; the tables in.
alignb 4096
page_tables_start:

p4: resb 4096       ; top-level table; loaded into CR3
p3_low: resb 4096   ; referenced from p4 entry 0o000
p3_hgh: resb 4096   ; referenced from p4 entry 0o776
p2_low: resb 4096   ; 2 MiB entries for the low identity mapping
p2_krn: resb 4096   ; 2 MiB entries for the kernel mapping
p2_mbi: resb 4096   ; 2 MiB entries for the multiboot2 info mapping

page_tables_end:

start64.asm:

; Assemble everything below, including the %included files, as 64-bit code.
bits 64

; kmain is defined outside this file; start64 is exported from it.
extern kmain
global start64

; macros64.asm is included first because paging64.asm uses its pte_write
; macro.
%include "macros64.asm"
%include "paging64.asm"

[section .text]

;; The entry point for 64-bit code. We expect the address of the multiboot2
;; info structure in rdi.
;;
;; Disables interrupts, loads selector 0x10 into ss/ds/es/fs/gs, switches to
;; the page tables built by enable_paging, converts the multiboot2 pointer to
;; its higher-half address and passes it to kmain in rdi.
;; Does not return; falls into halt if kmain ever returns.
start64:
    ; Save the address of the multiboot2 info structure.
    ; (enable_paging does not preserve caller state; the value is popped back
    ; into rdi afterwards.)
    push rdi
    ; Clear interrupts. If we get an interrupt before we have an IDT, we'll
    ; triple fault. We can re-enable it from Rust, later.
    cli
    ; Nuke the segment registers.
    ; All data segment registers get selector 0x10.
    mov rax, 0x10
    mov ss, ax
    mov ds, ax
    mov es, ax
    mov fs, ax
    mov gs, ax
    ; Set up paging.
    call enable_paging
    ; The first argument to kmain is the multiboot2 info structure. We need to
    ; adjust the address to the new higher-half location.
    ; NOTE(review): this adds the full original address to the base of the
    ; p2_mbi mapping. enable_paging starts that mapping at the page containing
    ; the structure, not at address 0, so the sum looks correct only when that
    ; page is at address 0 -- confirm the intended offset.
    pop rdi
    mov rax, 0xffffff7f80000000
    add rdi, rax
    ; DEBUG
    ; Write a marker to the screen, read through the new higher-half pointer,
    ; then write a second marker; the second marker only appears if the read
    ; did not fault.
    ; NOTE(review): the hlt below executes with interrupts disabled, so the
    ; kmain call is presumably never reached while this debug block is present.
    mov dword [0xb8004], 0xf021f021
    mov rbx, [rdi]
    mov dword [0xb8000], 0xf021f021
    hlt
    ; Call kmain. It's more than 4GiB away, so we have to do an indirect call.
    mov rax, kmain
    call rax
    ; kmain should never return; call halt if it does.
    jmp halt

;; Final stop: prints a marker and halts forever. Never returns.
halt:
    ; Write "kexit?!?" to the upper left corner (0xb8000 is the first cell of
    ; the text buffer). Each dword holds two cells: a character byte followed
    ; by the attribute byte 0x4f.
    mov dword [0xb8000], 0x4f654f6b
    mov dword [0xb8004], 0x4f694f78
    mov dword [0xb8008], 0x4f3f4f74
    mov dword [0xb800c], 0x4f3f4f21
    ; Disable interrupts and halt.
    cli
    hlt
    ; Just in case... something? happens.
    ; If hlt ever resumes, loop back and halt again.
    jmp halt

After I move the new page table into CR3, execution continues correctly. However, once I attempt to read a value from high memory in start64.asm, I get a page fault. The fault occurs on this line:

    mov rbx, [rdi]

The line before that, mov dword [0xb8004], 0xf021f021, correctly writes !! to the screen. [rdi] is a higher half address where the Multiboot2 information record can be found.

A copy of the complete code can be found in my GIT repository.

Upvotes: 2

Views: 884

Answers (1)

Michael Petch
Michael Petch

Reputation: 47573

I'm going to take an educated guess based on another experiment I did. I suspect that if you were to scroll back and look at the exceptions thrown (one of your images has the beginning of them cut off) that you'd see output like this in QEMU:

check_exception old: 0xffffffff new 0xe
0: v=0e e=0009 i=0 cpl=0 ... [snip]

In particular, you'll be looking for an exception that starts with new 0xe, which is the Page Fault Exception. I've snipped the rest of the output for brevity.

On the second line you are likely seeing e=0009. This is the error code that is pushed on the stack before entering your page fault handler. You don't have a page fault handler, so you triple fault and you'll get other exceptions afterwards.

What does an error code of 0x0009 mean? OSDev Wiki has a description:

31              4               0
+---+--  --+---+---+---+---+---+---+
|   Reserved   | I | R | U | W | P |
+---+--  --+---+---+---+---+---+---+

P 1 bit - Present - When set, the page fault was caused by a page-protection violation.
                    When not set, it was caused by a non-present page.
W 1 bit - Write   - When set, the page fault was caused by a page write.
                    When not set, it was caused by a page read.
U 1 bit - User    - When set, the page fault was caused while CPL = 3. This does not
                    necessarily mean that the page fault was a privilege violation.
R 1 bit - Reserved write    - When set, the page fault was caused by
                              reading a 1 in a reserved field.
I 1 bit - Instruction Fetch - When set, the page fault was caused by 
                              an instruction fetch.

Your value e=0009 is a bit mask of 01001. This would mean a page-protection violation occurred (and not a page not present error), and it means a 1 was read from a reserved field.

The reserved fields (bits) in question are in the actual Page Table Entries (PTE) of the Page Table (PT) in the bottom of the page table hierarchy. When using a 2MB page size with no Page Attribute Tables the PTEs in the Page Table must have bits 12 through 20 set to zero. Your current code uses 2MB pages, so this restriction applies to it. Reserved bits are special in that if they contain the value 1 then you will get a page fault with the reserved bit set in its error code, which shows up as e=0009 in the QEMU output.

To resolve this you have to make sure the page table entries (PTEs) in the actual Page Tables (PTs) have these bits set to 0. A quick hack might be to do something like this in macros64.asm:

; pte_write table, index, address, flags
; Stores (flags | address) into 8-byte entry `index` of `table`.
; Clobbers rax and flags.
%macro pte_write 4
    mov rax, %4
    or rax, %3
    mov qword [%1+8*%2], rax
%endmacro

; pte_write_res table, index, address, flags
; Same as pte_write, except that the address is first ANDed with
; 0x7fffffffffe00000, which clears bits 0-20 and bit 63 of it, so that none
; of those address bits end up set in the entry.
; Clobbers rax, r11 and flags.
%macro pte_write_res 4
    mov rax, %4
    mov r11, 0x7fffffffffe00000
    and r11, %3
    or  rax, r11
    mov qword [%1+8*%2], rax
%endmacro

The main difference is that pte_write_res specifically enforces the rule for reserved bits by making them 0. You'd then have to modify your code that uses these macros. In your case it would appear to be in these 2 locations within paging64.asm:

.kernel_loop:
    pte_write p2_krn, rcx, rsi, p_present | p_writable | p_pagesize
    inc rcx

Would now become:

.kernel_loop:
    pte_write_res p2_krn, rcx, rsi, p_present | p_writable | p_pagesize
    inc rcx

And

.mbi_loop:
    pte_write p2_mbi, rcx, rsi, p_present | p_pagesize | p_noexec
    inc rcx

Would now become:

.mbi_loop:
    pte_write_res p2_mbi, rcx, rsi, p_present | p_pagesize | p_noexec
    inc rcx

In both of these cases the entries are written from RSI, which may have bits set that need to be 0 in the page table entry.

Upvotes: 3

Related Questions