Reputation: 417
I am trying to print an AST and printing an Identifier's actual name in the tree. I am using lex and yacc.
For some reason yacc reads ahead all the tokens in a single row until ';' and this prevents me from using yytext to access an identifier's text value.
This is the lex file:
%{
%}
%%
"&&" {return OP_AND;}
\"|"\"|" {return OP_OR;}
!= {return OP_NE;}
== {return OP_EQ;}
> {return OP_GT;}
>= {return OP_GE;}
\< {return OP_LT;}
\<= {return OP_LE;}
! {return OP_NOT;}
= {return ASSIGN;}
\+ {return PLUS;}
\- {return MINUS;}
\* {return MULT;}
\/ {return DIV;}
\& {return ADRS;}
\^ {return PTR_VAL;}
\; {return SEMI;}
, {return COMMA;}
\{ {return CURLY_O;}
\} {return CURLY_C;}
\( {return PAREN_O;}
\) {return PAREN_C;}
\| {return BAR;}
\[ {return SQR_O;}
\] {return SQR_C;}
else {return ELSE;}
if {return IF;}
do {return DO;}
for {return FOR;}
bool {return BOOL;}
void {return VOID;}
int { return INT;}
charp {return CHARP;}
intp {return INTP;}
string {return STRING;}
while {return WHILE;}
true {return TRUE_;}
false {return FALSE_;}
return {return RETURN;}
null {return NULL_;}
0|[1-9][0-9]* {return CONST_INT;}
[a-zA-Z][a-zA-Z0-9]* { return IDENT;}
\'.\' {return CONST_CHAR; }
\".*\" {return CONST_STRING; }
\[\$([^\$]*\$+[^\]])*[^\$]*\$+\] { /*comment*/}
[ \n\t] { /* skip whitespace */}
. {return ILLEAGAL;}
%%
This is the yacc file: * note that some of the rules are not completely ready but are not used by this input file and are not relevant to this case.
%{
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
extern char * yytext;
typedef struct node
{
char * token;
struct node *b1;
struct node *b2;
struct node *b3;
struct node *b4;
}node;
int yylex();
int yyerror();
struct node* mknode2(char* token,node* b1,node* b2);
struct node* mknode3(char* token,node*b1,node*b2,node*b3);
struct node* mknode4(char* token,node*b1,node*b2,node*b3,node*b4);
int printtree(node* tree);
#define YYSTYPE node*
%}
%token SEMI COMMA CURLY_O CURLY_C PAREN_O PAREN_C SQR_O SQR_C BAR
%token BOOL INT CHAR STRING INTP CHARP VOID
%token IF DO WHILE FOR RETURN
%token CONST_INT CONST_CHAR CONST_STRING NULL_ IDENT TRUE_ FALSE_
%token ILLEAGAL
%right ASSIGN
%left OP_OR OP_AND OP_NE OP_EQ OP_GT OP_GE OP_LT OP_LE
%left PLUS MINUS
%left MULT DIV
%right OP_NOT
%right ADRS
%right PTR_VAL
%nonassoc LOWER_THAN_ELSE
%nonassoc ELSE
%start program
%%
program : functions {printtree($1);}
;
functions : functions function {$$ = mknode2("FUNC" ,$2,$1);}
| {$$ = NULL;}
;
function : type id PAREN_O f_params PAREN_C CURLY_O body CURLY_C { $$ = mknode3($1,$2,$4,$7);}
;
body : var_decls stmts ret_stmt {$$ = mknode3("BODY" ,$1 , $2 , $3);}
;
nest_block : stmts {$$=$1;}
;
f_params : f_params_ {$$ = $1;}
| {$$ = NULL;}
;
f_params_ : f_params_ COMMA param {$$ = mknode2("," , $3 , $1);}
| param {$$ = $1;}
;
param : type id {$$ = mknode2($1 , NULL , $2 );}
;
var_decls : var_decls var_decl { $$ = mknode2("DECL" , $2 , $1);}
| var_decls function {$$ = mknode2("DECL" , $2 , $1);}
| {$$ = NULL;}
;
var_decl : type var_list SEMI { $$ = mknode2($1 , NULL , $2); }
;
var_list : var_list COMMA dec_assign {$$ = mknode2(",", $3 , $1);}
| dec_assign { $$ = $1 ;}
;
dec_assign : id ASSIGN expr { $$ = mknode2("=", $1 , $3);}
| id {$$ = $1; }
;
type : BOOL {$$ = yytexy;}
| INT { $$ = yytext;}
| CHAR {$$ = yytext;}
| INTP {$$ = yytext;}
| CHARP {$$ = yytext;}
| STRING {$$ = yytext;}
| VOID {$$ = yytext;}
;
stmts : stmts stmt { $$ = mknode2("STMT" , $2 , $1 );}
| { $$ = NULL ;}
;
stmt : assignment SEMI {$$ = $1 ;}
| fct_call SEMI {$$ = $1; }
| IF PAREN_O expr PAREN_C opt_nest %prec LOWER_THAN_ELSE {$$ = mknode2("if" , $3 , $5 );}
| IF PAREN_O expr PAREN_C opt_nest ELSE opt_nest {$$ = mknode3("if" , $3 , $5 , $7 );}
| WHILE PAREN_O expr PAREN_C opt_nest {$$ = mknode2("while" , $3 , $5);}
| DO CURLY_O nest_block CURLY_C WHILE PAREN_O expr PAREN_C SEMI {$$ = mknode2("do" , $3 , $7);}
| FOR PAREN_O assignment SEMI expr SEMI assignment PAREN_C opt_nest {$$ = mknode4("for" , $3 , $5 , $7 , $9 );}
;
opt_nest : CURLY_O nest_block CURLY_C {$$ = $2;}
| stmt {$$ = $1;}
;
ret_stmt : RETURN expr SEMI { $$ = mknode2("return" , NULL , $2);}
| { $$ = NULL; }
;
assignment : id ASSIGN expr {$$ = mknode2("=" , $1, $3);}
| id SQR_O expr SQR_C ASSIGN expr {$$ = mknode3("=" , $1 , $3, $6 );}
;
fct_call : id PAREN_O expr_list PAREN_C {$$ = mknode2("FUNCALL" , $1 ,$3 );}
;
expr_list : expr_list_ expr {$$ = mknode2("AGRS" , $2 , $1);}
| {$$ = NULL;}
;
expr_list_ : expr_list_ expr COMMA { $$ = mknode2("," , $2 , $1);}
| {$$ = NULL;}
;
id : IDENT { $$ = mknode2(strdup(yytext) , NULL , NULL);}
;
expr : expr PLUS expr { $$ = mknode2("+" , $1 , $3);}
| expr MINUS expr { $$ = mknode2("-" , $1 , $3);}
| expr MULT expr { $$ = mknode2("*" , $1 , $3);}
| expr DIV expr { $$ = mknode2("/" , $1 , $3);}
| expr OP_AND expr { $$ = mknode2("&&" , $1 , $3);}
| expr OP_OR expr { $$ = mknode2("||" , $1 , $3);}
| expr OP_NE expr { $$ = mknode2("!=" , $1 , $3);}
| expr OP_EQ expr {$$=mknode2("==",$1,$3);}
| expr OP_GT expr {$$=mknode2(">",$1,$3);}
| expr OP_GE expr {$$=mknode2(">=",$1,$3);}
| expr OP_LT expr {$$=mknode2("<",$1,$3);}
| expr OP_LE expr {$$=mknode2("<=",$1,$3);}
| OP_NOT expr {$$=mknode2("!",NULL,$2);}
| PTR_VAL expr {$$=mknode2("^",NULL,$2);}
| ADRS expr {$$=mknode2("&",NULL,$2);}
| MINUS expr {$$=mknode2("-",NULL,$2);}
| literal {$$=mknode2($1,NULL,NULL);}
| fct_call {$$= $1 ;}
| id {$$= $1;}
| PAREN_O expr PAREN_C {$$=mknode2("(_)",NULL,$2);}
| BAR expr BAR {$$=mknode2("|_|",NULL,$2);}
;
literal : CONST_INT {$$ = "const_int";}
| TRUE_ { $$ = "true" ;}
| FALSE_ { $$ = "false" ;}
| CONST_CHAR { $$ = "const_char" ;}
| CONST_STRING { $$ = "const_string" ;}
| NULL_ { $$ = "null" ;}
;
%%
#include "lex.yy.c"
void main(){
return yyparse();
}
int tabCount =0;
struct node* mknode2(char* token,node*b1,node*b2)
{
node* newnode=(node*)malloc(sizeof(node));
char* newstr;
if( token ){
printf("token: %s \n" , token);
newstr=(char*)malloc(sizeof(token)+1);
newstr[sizeof(token)]='\0';
strcpy(newstr,token);
}
else{
newstr=(char*)malloc(3);
strcpy(newstr,"AA");
}
newnode->b1=b1;
newnode->b2=b2;
newnode->b3=NULL;
newnode->b4=NULL;
newnode->token=newstr;
return newnode;
}
struct node* mknode3(char* token,node*b1,node*b2,node*b3)
{
node* newnode= mknode2(token,b1,b2);
newnode->b3=b3;
return newnode;
}
struct node* mknode4(char* token,node*b1,node*b2,node*b3,node*b4)
{
node* newnode= mknode3(token,b1,b2,b3);
newnode->b4=b4;
return newnode;
}
void printTabs(){
int i;
for(i=0;i<tabCount;i++){
printf("\t");
}
}
int printtree(node* tree)
{
tabCount++;
printTabs();
printf("%s\n",tree->token);
if(tree->b1)printtree(tree->b1);
if(tree->b2)printtree(tree->b2);
if(tree->b3)printtree(tree->b3);
if(tree->b4)printtree(tree->b4);
tabCount--;
return 1;
}
int yyerror(){
printf("ERROR!\n");
return 0;
}
this is the input file:
void main(int x){
int y,w,r; int z;
}
this is the output AST in preorder:
FUNC
void
main
int x
x
BODY
DECL
int z;
;
DECL
int y,w,r;
,
;
,
,
,
the output i expected:
FUNC
void
main
int
x
BODY
DECL
int
z
DECL
int
,
y
,
w
r
As can be seen when I get to the rule: type in my yacc file yytext contains the whole sentence ("int y,w,r;" instead of just "int") and when I get to the rule: id it only contain one token, that happens to be one token forward then the expected token.
- Note that in the function name and the function arguments yytext works properly.
- I tried printing yytext from the lex file and the tokens seems to contain right values.
Upvotes: 0
Views: 1293
Reputation: 241721
yytext
points to the last lexeme scanned. Any other previously-saved pointer into the scanner's intrrnal buffer (such as a previous value of yytext
) is invalid, and xannot be assumed to point at anything useful.
In a parser action, you have really very little idea what the last token scanned is. Yacc/bison generate LALR(1) parsers, where the 1
indicates "one lookahead token". That means that the parser (can) always examine the next token, and if it does so that is the token yytext
will point at. Some implementations always read ahead; others, like bison, sometimes do not read ahead if the lookahead won't change parser behaviour. So which token yytext
points at during a parser action is unspecified and may change in future versions or with different implementations.
Bottom line: Never use yytext
in a parser action. And never pass the pointer value of yytext
from the scanner to the parser. If you will need the string value of the token in the parser, make a copy of the string and pass the address of the copy to the parser through yylval
. If you use strdup
or similar, don't forget to free
the copy when you no longer need it.
Upvotes: 1