Reputation: 546
I am trying to run my self-made compiler for a simplified java language, however I always end up with a segmentation fault. The goal is to build an abstract syntax tree. I also tried to debug with valgrind, but it looks like the tool has some problems with bison and flex on my PC. Please help me.
The Lexfile:
%{
#include <stdio.h>
#include "abst.h"
#include "yacc.tab.h"
int yylineno;
int column = 0;
int nesting_comments = 0;
int max_comment_level = 0;
int total_comments_number = 0;
void count() {
int i;
for(i = 0; yytext[i] != '\0'; i++){
if(yytext[i] == '\n')
column = 0;
else if(yytext[i] == '\t')
column += 8 - (column % 8);
else
column++;
}
//printf("%s", yytext); Uncomment this Line for printing the input stream
}
void yyerror(char const *s);
%}
DIGIT [0-9]
LETTER [a-zA-Z]
WHITESPACE [ \t\r]
VINT {DIGIT}+
VFLOAT {VINT}+"."{DIGIT}*
VDOUBLE {VFLOAT}E{VINT}
VSTRING \".*\"
COMMENT1 "//".*
%x COMMENT2
COMMENT3 "/""*"([^{"*""/"}])*"*""/"
REL_OP "<"|"<="|"=="|"=>"|">"|"&&"|"||"|"&"|"|"|"!="
ADD_OP "+"|"-"|"%"
MUL_OP "*"|"/"
UNARY_OP "++"|"--"
ASSIGN_OP "+="|"-="|"*="|"/="|"%="
TYPE "boolean"|"int"|"float"|"double"
ID {LETTER}({LETTER}|{DIGIT})*
%%
"if" { count(); return IF; }
"else" { count(); return ELSE; }
"while" { count(); return WHILE; }
"for" { count(); return FOR; }
"read" { count(); return READ; }
"write" { count(); return WRITE; }
"final" { count(); return FINAL; }
{VINT} { count(); yylval.ival = atoi(yytext); return IVALUE; }
{VFLOAT}|{VDOUBLE} { count(); yylval.fval = atof(yytext); return FVALUE; }
{VSTRING} { count(); yylval.identifier = strdup(yytext); return STRING; }
"true" { count(); return TRUE; }
"false" { count(); return FALSE; }
"boolean" { count(); return BOOLEAN; }
"int" { count(); return INT; }
"float" { count(); return FLOAT; }
"double" { count(); return DOUBLE; }
{ID} { count(); yylval.identifier = strdup(yytext); return IDENT; }
{REL_OP} { count(); yylval.ival = getEnum(yytext); return RELOP; }
{ADD_OP} { count(); yylval.ival = getEnum(yytext); return ADDOP; }
{MUL_OP} { count(); yylval.ival = getEnum(yytext); return MULOP; }
{UNARY_OP} { count(); yylval.ival = getEnum(yytext); return UNARYOP; }
"=" { count(); return '='; }
"!" { count(); return '!'; }
"(" { count(); return '('; }
")" { count(); return ')'; }
"[" { count(); return '['; }
"]" { count(); return ']'; }
"{" { count(); return '{'; }
"}" { count(); return '}'; }
"," { count(); return ','; }
";" { count(); return ';'; }
{COMMENT1} {
total_comments_number++;
count();
}
"(*" {
total_comments_number++;
BEGIN(COMMENT2);
printf("(* [BEGIN NESTED %d]\n", nesting_comments++);
count();
}
<COMMENT2>[^*()]* {
printf("%s", yytext); /* Eat other characters */
count();
}
<COMMENT2>"(*" {
printf("(* [BEGIN NESTED %d]\n", nesting_comments);
if(++nesting_comments > max_comment_level) {
max_comment_level = nesting_comments;
}
count();
}
<COMMENT2>"*)" {
printf("\n*) [END NESTED %d]", --nesting_comments);
if (nesting_comments == 0){
BEGIN(INITIAL);
}
count();
}
<COMMENT2>[*()] {
printf("%s", yytext); /* Eat the other characters if it is not a comment delimiter */
count();
}
{COMMENT3} {
total_comments_number++;
count();
}
{WHITESPACE}+ { count(); }
\n { count(); yylineno++; }
. { yyerror("Lex"); }
%%
The Yaccfile:
%{
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include "abst.h"
extern char *yytext;
extern int yylineno;
extern FILE *yyin;
extern int column;
extern int nesting_comments;
extern int max_comment_level;
extern int total_comments_number;
int yylex(void);
//int yydebug=1;
node *root;
void yyerror (char const *s){
fprintf (stderr, "%s Error at line %d:%d\n", s, yylineno, column);
fprintf (stderr, "Text was: %s\n", yytext);
exit(EXIT_FAILURE);
}
%}
%union {
char *identifier;
int ival;
float fval;
struct _node *node;
}
%type <identifier> IDENT STRING
%type <ival> IVALUE MULOP RELOP ADDOP UNARYOP
%type <fval> FVALUE
%type <node> program compStmt varDecList varListType type varList ident identArray stmtList
%type <node> statement assignStmt ifStmt elsePart whileStmt forStmt ioStmt ioCall paramList
%type <node> expr simpleExpr term factor mulop addop relop
%token IF ELSE WHILE FOR WRITE READ FINAL
%token IVALUE FVALUE TRUE FALSE STRING
%token BOOLEAN INT FLOAT DOUBLE
%token IDENT
%right UNARYOP '!' '='
%left MULOP ADDOP RELOP
%left '[' ']' '(' ')'
%start program
%%
program
: compStmt { $$ = $1; root = $1;}
;
compStmt
: '{' varDecList stmtList '}' { $$ = new_node(COMP_STMT, (node*[]){ $2, $3, NULL });}
;
varDecList
: varListType varDecList { $$ = new_node(DECLARATION, (node*[]){ $1, $2, NULL }); }
|
;
varListType
: type varList { $$ = new_node(VAR_LIST_TYPE, (node*[]){ $1, $2, NULL }); }
| FINAL type varList { $$ = new_node(VAR_LIST_TYPE, (node*[]){ new_string_node(TYPE, "final") , $2, $3, NULL }); }
;
type
: BOOLEAN { $$ = new_string_node(TYPE, "boolean"); }
| INT { $$ = new_string_node(TYPE, "int"); }
| FLOAT { $$ = new_string_node(TYPE, "float"); }
| DOUBLE { $$ = new_string_node(TYPE, "double"); }
;
varList
: ident ',' varList { $$ = new_node(VAR_LIST, (node*[]){ $1, $3, NULL }); }
| ident ';' { $$ = new_node(VAR_LIST, (node*[]){ $1, NULL }); }
| ident '=' expr ';' { $$ = new_node(VAR_LIST, (node*[]){ $1, $3, NULL }); }
;
ident
: IDENT { $$ = new_string_node(VAR, $1); }
| identArray '[' simpleExpr ']' { $$ = new_node(ARRAY, (node*[]){ $1, $3, NULL }); }
;
identArray
: IDENT { $$ = new_string_node(VAR, $1); }
;
stmtList
: statement stmtList { $$ = new_node(STATEMENT_LIST, (node*[]){ $1, $2, NULL }); }
| statement { $$ = new_node(STATEMENT_LIST, (node*[]){ $1, NULL }); }
;
statement
: assignStmt ';' { $$ = $1; }
| compStmt { $$ = $1; }
| ifStmt { $$ = $1; }
| whileStmt { $$ = $1; }
| forStmt { $$ = $1; }
| ioStmt { $$ = $1; }
;
assignStmt
: ident '=' expr { $$ = new_node(ASSIGN_STMT, (node*[]){ $1, $3, NULL }); }
| ident UNARYOP { $$ = new_node(ASSIGN_STMT, (node*[]){ $1, new_int_node(OP, $2), NULL }); }
;
ifStmt
: IF '(' expr ')' statement elsePart {
if($6->type != NONE){
$$ = new_node(IF_STMT, (node*[]){ $3, $5, $6, NULL });
}
else {
$$ = new_node(IF_STMT, (node*[]){ $3, $5, NULL });
}
}
;
elsePart
: ELSE statement { $$ = $2; }
| { $$ = new_none(); }
;
whileStmt
: WHILE '(' expr ')' statement { $$ = new_node(WHILE_STMT, (node*[]){ $3, $5, NULL }); }
;
forStmt
: FOR '(' assignStmt ';' expr ';' assignStmt ')' statement {
$$ = new_node(FOR_STMT, (node*[]){ $3, $5, $7, $9, NULL });
}
;
ioStmt
: ioCall '(' paramList ')' ';' { $$ = new_node(IOFUNC, (node*[]){ $1, $3, NULL }); }
;
ioCall
: READ { $$ = new_int_node(IOFUNCCALL, 0); }
| WRITE { $$ = new_int_node(IOFUNCCALL, 1); }
;
paramList
: ident ',' paramList { $$ = new_node(PARAM_LIST, (node*[]){ $1, $3, NULL }); }
| STRING ',' paramList { $$ = new_node(PARAM_LIST, (node*[]){ new_string_node(STRING_CONST, $1), $3, NULL }); }
| ident { $$ = new_node(PARAM_LIST, (node*[]){ $1, NULL }); }
| STRING { $$ = new_string_node(STRING_CONST, $1); }
;
expr
: simpleExpr relop simpleExpr { $$ = new_node(EXPR, (node*[]){ $1, $2, $3, NULL }); }
| simpleExpr { $$ = new_node(EXPR, (node*[]){ $1, NULL }); }
;
simpleExpr
: term addop simpleExpr { $$ = new_node(EXPR, (node*[]){ $1, $2 ,$3, NULL }); }
| term { $$ = new_node(EXPR, (node*[]){ $1, NULL }); }
;
term
: factor mulop term { $$ = new_node(EXPR, (node*[]){ $1, $2, $3, NULL }); }
| factor { $$ = new_node(EXPR, (node*[]){ $1, NULL }); }
;
factor
: IVALUE { $$ = new_int_node(INT_CONST, $1); }
| FVALUE { $$ = new_float_node(REAL_CONST, $1); }
| TRUE { $$ = new_int_node(BOOL_CONST, 1); }
| FALSE { $$ = new_int_node(BOOL_CONST, 0); }
| ident { $$ = $1; }
| '!' factor { $$ = new_node(EXPR, (node*[]){ $2, NULL }); }
| '-' factor { $$ = new_node(EXPR, (node*[]){ $2, NULL }); }
| '(' expr ')' { $$ = new_node(EXPR, (node*[]){ $2, NULL }); }
;
mulop
: MULOP { $$ = new_int_node(OP, $1); }
;
addop
: ADDOP { $$ = new_int_node(OP, $1); }
;
relop
: RELOP { $$ = new_int_node(OP, $1); }
;
%%
int main(int argc, char **argv) {
if ( argc > 1 )
yyin = fopen( argv[1], "r" );
else
yyin = stdin;
yyparse();
printf("\nCOMMENT-ANALYSIS:\n");
printf("Total number of comments: %d\n", total_comments_number);
printf("Maximum nested comment level: %d\n", max_comment_level);
printf("\n");
printNode(root);
return 0;
}
The Header in which I decleared some structs and enums:
#ifndef AST_TYPES_H
#define AST_TYPES_H
typedef enum {
COMP_STMT = 0,
ASSIGN,
ASSIGN_STMT,
IF_STMT,
FOR_STMT,
WHILE_STMT,
STATEMENT,
STATEMENT_LIST,
CONST,
VAR,
ARRAY,
VAR_LIST,
TYPE,
EXPR,
SIMPLE_EXPR,
INT_CONST,
REAL_CONST,
BOOL_CONST,
STRING_CONST,
IDENTIFIER,
OP,
IOFUNC,
IOFUNCCALL,
PARAM_LIST,
DECLARATION,
VAR_LIST_TYPE,
NONE,
} node_type;
typedef enum {
PLUS = 0,
MINUS,
MUL,
DIV,
MOD,
LT,
LE,
GT,
GE,
EQ,
NE,
AND,
OR,
PLUSPLUS,
MINUSMINUS,
NOT,
} operator;
typedef union _node_value{
int iValue; /*integer, true, false, compOp, addOp, mulOp */
float fValue; /*number*/
char* identifier; /*identifier*/
/* list of BNF right-hand side symbols of nonterminal type */
struct _node **body;
} node_value;
typedef struct _node {
node_type type;
int size;
node_value value;
} node;
void printNode(node *the_node);
operator getEnum(char *value);
char *printEnum(node_type type);
node* new_none();
node* new_int_node(node_type type, int value);
node* new_float_node(node_type type, float value);
node* new_string_node(node_type type, char *value);
node* new_node(node_type type, node **nodes);
#endif //AST_TYPES_H
The c-file which implements the headerfunctions:
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include "abst.h"
node* new_none(){
node *the_node = (node*) malloc(sizeof(node));
if (the_node == NULL) {
fprintf(stderr, "could not allocated memory for node\n");
exit(EXIT_FAILURE);
}
the_node->type = NONE;
the_node->size = 0;
return the_node;
}
operator getEnum(char *value){
switch (value[0]) {
case '+':
if(value[1] == '+')
return PLUSPLUS;
else
return PLUS;
case '-':
if(value[1] == '-')
return MINUSMINUS;
else
return MINUS;
case '*':
return MUL;
case '/':
return DIV;
case '%':
return MOD;
case '<':
if(value[1] == '=')
return LE;
else
return LT;
case '>':
if(value[1] == '=')
return GE;
else
return GT;
case '=':
if(value[1] == '!')
return NE;
else
return EQ;
case '&':
return AND;
case '|':
return OR;
case '!':
return NOT;
}
return -1;
}
node* new_int_node(node_type type, int value){
node *the_node = (node*) malloc(sizeof(node));
if (the_node == NULL) {
fprintf(stderr, "could not allocated memory for node\n");
exit(EXIT_FAILURE);
}
the_node->type = type;
the_node->size = 0;
switch(type){
case INT_CONST:
case OP:
case BOOL_CONST:
case IOFUNCCALL:
the_node->value.iValue = value;
break;
default:
fprintf (stderr, "Node-Type %s is wrong for int value %d\n", printEnum(type), value);
}
return the_node;
}
node* new_float_node(node_type type, float value){
node *the_node = (node*) malloc(sizeof(node));
if (the_node == NULL) {
fprintf(stderr, "could not allocated memory for node\n");
exit(EXIT_FAILURE);
}
the_node->type = type;
the_node->size = 0;
if(type == REAL_CONST)
the_node->value.fValue = value;
else
fprintf (stderr, "Node-Type %s is wrong for float value %f\n", printEnum(type), value);
return the_node;
}
node* new_string_node(node_type type, char* value){
node *the_node = (node*) malloc(sizeof(node));
if (the_node == NULL) {
fprintf(stderr, "could not allocated memory for node\n");
exit(EXIT_FAILURE);
}
the_node->type = type;
the_node->size = 0;
switch(type){
case VAR:
case STRING_CONST:
case TYPE:
the_node->value.identifier = value;
break;
default:
fprintf (stderr, "Node-Type %s is wrong for String %s\n", printEnum(type), value);
}
return the_node;
}
node* new_node(node_type type, node **nodes){
int counter = 0;
while(nodes[counter] != NULL){
counter++;
if(counter > 4){
fprintf (stderr, "Could not find NULL for %s\n", printEnum(type));
exit(EXIT_FAILURE);
}
}
node *the_node = (node*) malloc(sizeof(node));
the_node->type = type;
/*
switch(type){
case STATEMENT_LIST:
case DECLARATION:
case VAR_LIST:
if(counter == 2 && nodes[1]->type != EXPR){
int size = nodes[1]->size + 1;
the_node->size = size;
the_node->value.body = malloc(size*sizeof(node*));
the_node->value.body[0] = nodes[0];
for (int i = 1; i < size; i++) {
the_node->value.body[i] = nodes[1]->value.body[i-1];
}
return the_node;
}
}
*/
the_node->value.body = (node**) malloc(counter*sizeof(node*));
the_node->size = counter;
for (int i = 0; i < counter; i++) {
the_node->value.body[i] = nodes[i];
}
return the_node;
}
void printNode(node *the_node){
if(the_node->type > NONE){
fprintf (stderr, "Wrong Type %d\n", the_node->type);
return;
}
printEnum(the_node->type);
switch(the_node->type){
case INT_CONST:
printf("%s with INT-value: %d\n", printEnum(the_node->type), the_node->value.iValue);
break;
case REAL_CONST:
printf("%s with FLOAT-value: %f\n", printEnum(the_node->type), the_node->value.fValue);
break;
case BOOL_CONST:
printf("%s with BOOL-value: %d\n", printEnum(the_node->type), the_node->value.iValue);
break;
case VAR:
printf("%s with Identifier: %s\n", printEnum(the_node->type), the_node->value.identifier);
break;
case OP:
printf("%s with Operation: %d\n", printEnum(the_node->type), the_node->value.iValue);
break;
default:
printf("%s with size %d\n", printEnum(the_node->type), the_node->size);
for (int i = 0; i < the_node->size; i++) {
printNode(the_node->value.body[i]);
}
}
}
char *printEnum(node_type type){
switch (type) {
case COMP_STMT:
return "COMP_STMT";
case ASSIGN:
return "ASSIGN";
case ASSIGN_STMT:
return "ASSIGN_STMT";
case IF_STMT:
return "IF_STMT";
case FOR_STMT:
return "FOR_STMT";
case WHILE_STMT:
return "WHILE_STMT";
case STATEMENT:
return "STATEMENT";
case STATEMENT_LIST:
return "STATEMENT_LIST";
case CONST:
return "CONST";
case VAR:
return "VAR";
case ARRAY:
return "ARRAY";
case VAR_LIST:
return "VAR_LIST";
case TYPE:
return "TYPE";
case EXPR:
return "EXPR";
case INT_CONST:
return "INT_CONST";
case REAL_CONST:
return "REAL_CONST";
case BOOL_CONST:
return "BOOL_CONST";
case STRING_CONST:
return "STRING_CONST";
case IDENTIFIER:
return "IDENTIFIER";
case OP:
return "OP";
case IOFUNC:
return "IOFUNC";
case IOFUNCCALL:
return "IOFUNCCALL";
case PARAM_LIST:
return "PARAM_LIST";
case DECLARATION:
return "DECLARATION";
case VAR_LIST_TYPE:
return "VAR_LIST_TYPE";
case NONE:
return "NONE";
default:
printf("\nUNDEFINED: %d\n", type);
return "UNDEFINED";
}
}
I compile the program like this:
bison -d --debug --verbose yacc.y
flex lex.l
gcc yacc.tab.c lex.yy.c abst.c -std=c11 -Wall -o run -ll -g
./run < sample.java
This is the sample-file for the java grammar, which should be compiled:
/* This program that implements a few well-known algorithms */
{ int x, y, z;
final int n = 10;
int a[n];
// greatest common divisor
{ read(x, y);
while(x * y != 0)
if(x > y)
x = x - y;
else y = y - x;
if(x == 0)
write("gcd = ", y);
else write("gcd = ", x);
}
// factorial calculation
{ read(x);
y = 1;
for(z = 2; z <= x; z++)
y = y * z;
write(x, " != ", y);
}
// prime number verification
{ int k;
boolean prime;
read(x);
k = 2;
prime = true;
while((k < x/2) && prime) {
if(x % k == 0)
prime = false;
k++;
}
write(x, " is prime is ", prime);
}
// maximum array element
{ int i, max;
for(i = 0; i < n; i++)
read(a[i]);
max = a[0];
for(i = 0; i < n; i++)
if(max > a[i])
max = a[i];
write("max = ", max);
}
// bubble sort
{ boolean k;
float r;
int i;
k = true;
while(k) {
k = false;
for(i = 0; i < n-1; i++)
if(a[i] > a[i+1]) {
b = a[i];
a[i] = a[i+1];
a[i+1] = b;
k = true;
}
}
write("sorted array: ");
for(i = 0; i < n; i++)
write(a[i]);
}
}
Upvotes: 1
Views: 2434
Reputation: 126536
Instead of running the program directly from the command line, do:
gdb run
then at the (gdb)
prompt do
run < sample.java
which will produce output ending with something like:
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff7a66c36 in vfprintf () from /lib/x86_64-linux-gnu/libc.so.6
(gdb)
telling you that the problem is in something passed to printf
. From here, use the bt
command to get a stack backtrace:
#0 0x00007ffff7a66c36 in vfprintf () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007ffff7a6f299 in printf () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x0000000000404b14 in printNode (the_node=0x60c870) at abst.c:178
#3 0x0000000000404b94 in printNode (the_node=0x60c930) at abst.c:186
#4 0x0000000000404b94 in printNode (the_node=0x60c970) at abst.c:186
#5 0x0000000000404b94 in printNode (the_node=0x60ca70) at abst.c:186
#6 0x0000000000404b94 in printNode (the_node=0x60d190) at abst.c:186
#7 0x0000000000404b94 in printNode (the_node=0x60d6d0) at abst.c:186
#8 0x0000000000404b94 in printNode (the_node=0x60d710) at abst.c:186
#9 0x0000000000404b94 in printNode (the_node=0x60d750) at abst.c:186
#10 0x0000000000404b94 in printNode (the_node=0x613500) at abst.c:186
#11 0x0000000000404b94 in printNode (the_node=0x613540) at abst.c:186
#12 0x00000000004024a7 in main (argc=1, argv=0x7fffffffe0a8) at yacc.y:208
to see how you got into this state. Now use the up
command twice (to go two frames "up" in the stack trace -- down on the list above), and you're at:
#2 0x0000000000404b14 in printNode (the_node=0x60c870) at abst.c:178
178 printf("%s with Identifier: %s\n", printEnum(the_node->type), the_node->value.identifier);
from here you can print the values of things with the p
command:
(gdb) p the_node->value
$1 = {
iValue = 2,
fValue = 2.80259693e-45,
identifier = 0x2 <error: Cannot access memory at address 0x2>,
body = 0x2
}
which tells you what the (immediate) problem is -- you're accessing identifier
from the union when it actually holds what is probably an iValue
, so you're passing a garbage pointer to printf
which causes the crash.
For more information on gdb commands, try the help
command...
Upvotes: 6
Reputation: 311054
You can't rely on yytext
anywhere in the .y file. As yacc is an LALR(1) parser, it's already gone. You need to copy it to a piece of dynamic memory in the lexer, and assign it to a member of the yylval
union:
yylval.text = strdup(yytext);
As you are writing C++, you should not use node-type variables. You should create a family of node classes, that inherit from node
, and equipment them with constructors and virtual methods to do whatever they need to do in each specific case.
Upvotes: 3
Reputation: 241931
That's really a lot of code, too much for a simple visual inspection. You should probably try executing the program with a debugger, enable bison tracing, and/or write some test code to verify that your node-building functions work as expected. (I'd go with all three.)
One thing which does leap out is that you have a number of rules like this:
BOOLEAN { $$ = new_node(TYPE, (node*[]){ $1, NULL }); }
which assume that there is some semantic value associated with a terminal token representing a simple keyword. However, the flex rule which returns the keyword token type
"boolean" { count(); return BOOLEAN; }
does not set yylval
. In fact, no flex rule ever sets yylval, so it will always be NULL. (It's a global variable, so it will be zero-initialized.)
So if the node*
used to construct the new type
node in the bison rule above is ever dereferenced, you'll get a segfault. I didn't trace through the code to see if it is ever dereferenced, so that may or may not explain your issue.
Upvotes: 3