BullyWiiPlaza
BullyWiiPlaza

Reputation: 19225

ANTLR4 Grammar for parsing x86 Assembly

I wrote some C++ code and compiled it to Intel-style assembly using g++ -S -masm=intel -O3 main.cpp -o main.S. My issue can already be reproduced with the following simple hello-world program.

main.cpp:

#include <iostream>

int main()
{
    std::cout << "Hello World" << std::endl;
    return 0;
}

main.S:

    .file   "main.cpp"
    .intel_syntax noprefix
    .text
    .p2align 4,,15
    .def    ___tcf_0;   .scl    3;  .type   32; .endef
___tcf_0:
LFB2024:
    .cfi_startproc
    mov ecx, OFFSET FLAT:__ZStL8__ioinit
    jmp __ZNSt8ios_base4InitD1Ev
    .cfi_endproc
LFE2024:
    .def    ___main;    .scl    2;  .type   32; .endef
    .section .rdata,"dr"
LC0:
    .ascii "Hello World\0"
    .section    .text.startup,"x"
    .p2align 4,,15
    .globl  _main
    .def    _main;  .scl    2;  .type   32; .endef
_main:
LFB1547:
    .cfi_startproc
    push    ebp
    .cfi_def_cfa_offset 8
    .cfi_offset 5, -8
    mov ebp, esp
    .cfi_def_cfa_register 5
    and esp, -16
    sub esp, 16
    call    ___main
    mov DWORD PTR [esp+8], 11
    mov DWORD PTR [esp+4], OFFSET FLAT:LC0
    mov DWORD PTR [esp], OFFSET FLAT:__ZSt4cout
    call    __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
    mov DWORD PTR [esp], OFFSET FLAT:__ZSt4cout
    call    __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
    xor eax, eax
    leave
    .cfi_restore 5
    .cfi_def_cfa 4, 4
    ret
    .cfi_endproc
LFE1547:
    .p2align 4,,15
    .def    __GLOBAL__sub_I_main;   .scl    3;  .type   32; .endef
__GLOBAL__sub_I_main:
LFB2025:
    .cfi_startproc
    sub esp, 28
    .cfi_def_cfa_offset 32
    mov ecx, OFFSET FLAT:__ZStL8__ioinit
    call    __ZNSt8ios_base4InitC1Ev
    mov DWORD PTR [esp], OFFSET FLAT:___tcf_0
    call    _atexit
    add esp, 28
    .cfi_def_cfa_offset 4
    ret
    .cfi_endproc
LFE2025:
    .section    .ctors,"w"
    .align 4
    .long   __GLOBAL__sub_I_main
.lcomm __ZStL8__ioinit,1,1
    .ident  "GCC: (MinGW.org GCC-8.2.0-3) 8.2.0"
    .def    __ZNSt8ios_base4InitD1Ev;   .scl    2;  .type   32; .endef
    .def    __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i; .scl    2;  .type   32; .endef
    .def    __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_;    .scl    2;  .type   32; .endef
    .def    __ZNSt8ios_base4InitC1Ev;   .scl    2;  .type   32; .endef
    .def    _atexit;    .scl    2;  .type   32; .endef

Now I want to use an ANTLR grammar to parse the assembly code in main.S. I tried the one provided here since it seemed promising. However, I got plenty of parse errors. Is it not meant to be used for this task? Is there any other grammar more appropriate for parsing GCC generated assembly?

Parse Tree Inspector:

Lexer output:

line 1:7 token recognition error at: '"'
line 5:6 token recognition error at: '_'
line 5:7 token recognition error at: '_'
line 5:8 token recognition error at: '_'
line 6:0 token recognition error at: '_'
line 6:1 token recognition error at: '_'
line 6:2 token recognition error at: '_'
line 9:22 token recognition error at: '_'
line 9:23 token recognition error at: '_'
line 10:5 token recognition error at: '_'
line 10:6 token recognition error at: '_'
line 13:6 token recognition error at: '_'
line 13:7 token recognition error at: '_'
line 13:8 token recognition error at: '_'
line 14:17 token recognition error at: '"'
line 16:8 token recognition error at: '"'
line 16:20 token recognition error at: '\'
line 16:22 token recognition error at: '"'
line 17:24 token recognition error at: '"'
line 19:8 token recognition error at: '_'
line 20:6 token recognition error at: '_'
line 21:0 token recognition error at: '_'
line 31:6 token recognition error at: '_'
line 31:7 token recognition error at: '_'
line 31:8 token recognition error at: '_'
line 34:34 token recognition error at: '_'
line 34:35 token recognition error at: '_'
line 35:6 token recognition error at: '_'
line 35:7 token recognition error at: '_'
line 36:34 token recognition error at: '_'
line 36:35 token recognition error at: '_'
line 37:6 token recognition error at: '_'
line 37:7 token recognition error at: '_'
line 46:6 token recognition error at: '_'
line 46:7 token recognition error at: '_'
line 47:0 token recognition error at: '_'
line 47:1 token recognition error at: '_'
line 52:22 token recognition error at: '_'
line 52:23 token recognition error at: '_'
line 53:6 token recognition error at: '_'
line 53:7 token recognition error at: '_'
line 54:34 token recognition error at: '_'
line 54:35 token recognition error at: '_'
line 54:36 token recognition error at: '_'
line 55:6 token recognition error at: '_'
line 61:17 token recognition error at: '"'
line 63:7 token recognition error at: '_'
line 63:8 token recognition error at: '_'
line 64:7 token recognition error at: '_'
line 64:8 token recognition error at: '_'
line 65:8 token recognition error at: '"'
line 66:6 token recognition error at: '_'
line 66:7 token recognition error at: '_'
line 67:6 token recognition error at: '_'
line 67:7 token recognition error at: '_'
line 68:6 token recognition error at: '_'
line 68:7 token recognition error at: '_'
line 69:6 token recognition error at: '_'
line 69:7 token recognition error at: '_'
line 70:6 token recognition error at: '_'
line 1:17 no viable alternative at input 'main.cpp"\r\n'
line 2:23 no viable alternative at input 'noprefix\r\n'
line 4:10 mismatched input '4' expecting {'!', EOL}

Upvotes: 1

Views: 1056

Answers (1)

Margaret Bloom
Margaret Bloom

Reputation: 44126

Reading that grammar and looking at the generated AST, it's easy to see that a line like .p2align 4,,15 matches the rule lbl for the token .p2align, and that 4,,15 then doesn't match any assemblydirective or instruction.

So, no, that's not the grammar you are looking for.

I don't know whether an ANTLR grammar for GAS exists; however, the syntax is mostly (completely?) line-based, with directives starting with a dot.
It's easy to parse that.

If you need to parse the code too, consider using AT&T syntax (oh boy, I cannot believe I'm saying this!), as it is made to be easily parseable. If you want to stick with Intel syntax, I don't think you need an LL parser in the first place, as the assembly language should be regular.
Using a simple FSM or a regex may be easier and more effective.

Upvotes: 4

Related Questions