Lev
Lev

Reputation: 101

Flex/Bison sometimes misses Re

I am building a CLI using flex/bison, and I find that flex sometimes fails to match the tokens.

My .l looks like this:

%{

#include <stdio.h>
#include <string.h>
#include "hmd.tab.h"
#include "cmd.h"
%}

%option debug
%option verbose
%option backup

%option noyywrap nounput noinput
%option reentrant bison-bridge



/* Named patterns for the rules section. Of these, only {uinteger},
   {alpha} and {any} are referenced by the rules shown below. */
digit [0-9]
integer [+-]?{digit}+
uinteger {digit}+
/* NOTE(review): the alternation is outside the sign's group, so the
   optional sign applies only to the first alternative -- confirm intent. */
real [+-]?({digit}+[.]{digit}*)|({digit}*[.]{digit}+)
exp [+-]?({integer}|{real})[eE]-?{integer}
/* NOTE(review): without an enclosing bracket pair this is a set of the
   literal characters ':', 'a', 'l', 'p', 'h', not the POSIX alphabetic
   class; [[:alpha:]]+ is presumably what was intended. */
alpha [:alpha:]+
/* One non-whitespace character followed by one or more characters of any
   kind, newlines included. */
any [^[:space:]](.|\n)+
printing [^[:space:]]+

%x ID ACTION ZONE_FIELD VALUE

%%

    /*subsystems: first keyword of a command, recognised only in INITIAL*/
<INITIAL>zone {
    /* Switch to ID, where only an unsigned integer is recognised. */
    BEGIN(ID);
    printf("ZONE '%s'\n", yytext);
    return (cmd_sys_zone);
}

<INITIAL>device {
    /* No start-condition change: the scanner stays in INITIAL. */
    return (cmd_sys_device);
}

<INITIAL>system {
    /* No start-condition change: the scanner stays in INITIAL. */
    return (cmd_sys_system);
}

<INITIAL>help {
    /* No start-condition change: the scanner stays in INITIAL. */
    return (cmd_sys_help);
}

<INITIAL>ver|version {
    /* Both spellings produce the same token. */
    return (cmd_sys_ver);
}


<ID>{uinteger} {
    /* Numeric id following a subsystem keyword; afterwards an action
       keyword is expected, so the scanner moves to ACTION. */
    printf("ID '%s'\n", yytext);
    /* cmd_id is declared <unumber> in the grammar, so store through that
       member. Base 10 is explicit: with base 0 a leading zero selects
       octal, which turns "010" into 8 and "08" into 0. */
    yylval->unumber = strtoull(yytext, NULL, 10);
    BEGIN (ACTION);
    return (cmd_id);
}

    /*actions: verb following the id; every verb moves the scanner to ZONE_FIELD*/
<ACTION>set {
    BEGIN (ZONE_FIELD);
    printf("SET '%s'\n", yytext);
    return (cmd_action_set);
}
<ACTION>get {
    BEGIN (ZONE_FIELD);
    return (cmd_action_get);
}
<ACTION>start {
    /* NOTE(review): no grammar alternative shown uses cmd_action_start. */
    BEGIN (ZONE_FIELD);
    return (cmd_action_start);
}
<ACTION>stop {
    /* NOTE(review): no grammar alternative shown uses cmd_action_stop. */
    BEGIN (ZONE_FIELD);
    return (cmd_action_stop);
}

<ZONE_FIELD>{alpha} {
    /* Field name: a heap copy of the matched text is handed to the parser
       in yylval->name, then the scanner moves to VALUE. */
    printf("ZONE_FIELD '%s'\n", yytext);
    yylval->name = strdup(yytext);
    BEGIN (VALUE);
    return (cmd_field);
}

<VALUE>{any} {
        /* Value text: {any} also matches newlines, so the match runs from
           the first non-whitespace character to the end of the input. A heap
           copy is handed to the parser in yylval->name. The start condition
           is not changed here, so the scanner remains in VALUE. */
        yylval->name = strdup(yytext);
        printf("VALUE '%s'\n", yytext);
        return(cmd_value);
    }

%%


/*
 * Parse the command held in command->buffer using a private reentrant
 * scanner instance.
 *
 * The text is scanned in place with yy_scan_buffer(). Per the flex manual
 * that call returns NULL unless the last two bytes of the buffer are NUL,
 * so command->len is expected to include those two terminating bytes.
 *
 * Syntax errors are reported by yyerror() through the command's response
 * fields; they do not affect the return value.
 *
 * Returns 0 when the parser was run, the yylex_init() error code when the
 * scanner could not be created, or -1 when the input buffer was rejected.
 */
int cmd_parse(cmd_t *command) {
    yyscan_t scanner;
    YY_BUFFER_STATE buffer;
    int ret_val;

    ret_val = yylex_init(&scanner);
    if (ret_val != 0) {
        /* Nothing has been allocated yet, so there is nothing to destroy. */
        return ret_val;
    }

    printf("INPUT '%s'\n", command->buffer);

    buffer = yy_scan_buffer(command->buffer, command->len, scanner);
    if (buffer == NULL) {
        /* Without an input buffer the scanner would read from its default
           stream instead of the command text, so do not run the parser. */
        ret_val = -1;
    } else {
        (void) yyparse(command, scanner);
        yy_delete_buffer(buffer, scanner);
    }

    yylex_destroy(scanner);
    return ret_val;
}

and my .y looks like this:

%{
#define YYDEBUG 1
#include <stdio.h>
#include <stdint.h>
#include "cmd.h"
#include "hmd.tab.h"

int yylex();
int yyerror(void *userdata, void *scanner, const char *s);

%}

%debug
%define api.pure

%define parse.error verbose

/*System tokens*/
%token cmd_sys_zone cmd_sys_device cmd_sys_system cmd_sys_ver cmd_sys_help

%token cmd_num cmd_unum cmd_real cmd_other

/*ID token*/
%token cmd_id

/*Fields*/
%token cmd_field

/*Action tokens*/
%token cmd_action_set cmd_action_get cmd_action_start cmd_action_stop

/*Value*/
%token cmd_value

%type <number> cmd_num
%type <unumber> cmd_unum
%type <real> cmd_real
%type <unumber> cmd_id
%type <name> cmd_field
%type <name> cmd_value
%type <name> cmd_other

/* Release the heap string carried by a <name> semantic value when the
   parser discards it (for example during error recovery). */
%destructor {
    /* free(NULL) is a no-op, so no guard is needed. The previous guard
       freed only NULL pointers, which leaked every discarded string. */
    free($$);
} <name>

/* Semantic value carried by tokens; the member used by each token is
   selected by the %type declarations. */
%union {
    char *name;       /* text value: cmd_field, cmd_value, cmd_other */
    int64_t number;   /* signed integer: cmd_num */
    uint64_t unumber; /* unsigned integer: cmd_unum, cmd_id */
    double real;      /* floating point: cmd_real */
}


%parse-param {void *user_data}
%param {void *scanner}

%%

/* Start symbol: a whole input is a (possibly empty) list of statements. */
prog:
  stmts
;

/* Either empty, or one statement followed by more statements. */
stmts:
        | stmt stmts

/* One command. Each alternative calls its handler and then releases the
   string values it received with cmd_free(). */
stmt:
        /* zone <id> set <field> <value> */
        cmd_sys_zone cmd_id cmd_action_set cmd_field cmd_value {
            cmd_zone_set(user_data, $2, $4, $5);
            cmd_free($4);
            cmd_free($5);
        } |
        /* zone <id> get <field> */
        cmd_sys_zone cmd_id cmd_action_get cmd_field {
            cmd_zone_get($2, $4);
            cmd_free($4);
        } |
        cmd_sys_ver {
            cmd_ver(user_data);
        } |
        cmd_sys_help {
            cmd_help();
        } |
        /* NOTE(review): no scanner rule shown returns cmd_other. Its text
           is reported through yyerror() with a NULL scanner argument. */
        cmd_other {
            yyerror(user_data, NULL, $1);
            cmd_free($1);
        }

%%

int yyerror(void *userdata, void *scanner, const char *s)
{
    (void) scanner;
    cmd_t *cmd;
    cmd = (cmd_t*) userdata;

    cmd->response_len = sprintf(cmd->response, "ERR: %s\r\n", s);
    return 0;}

Two similar test cases:

INPUT 'zone 2 set haha some good result
'
ZONE 'zone'
 ID '2'
 SET 'set'
 ZONE_FIELD 'haha'
 VALUE 'some good result
'
2022-05-17T04:31:43 I CMD_SET_ZONE '2' 'haha' 'some good result /*Output of the handler*/
'
INPUT 'zone 2 set blab some bad result
'
ZONE 'zone'
 ID '2'
 SET 'set'
 bZONE_FIELD 'la' /*b is missed by Flex*/
VALUE 'b some bad result /*That b should be part of ZONE_FIELD*/
'
2022-05-17T04:31:59 I CMD_SET_ZONE '2' 'la' 'b some bad result /*Output of the handler*/
'

As you can see, I give the parser almost the same input in both cases, yet the outcome differs. The second time, some bytes are not matched, and the whole grammar collapses.

Upvotes: 0

Views: 89

Answers (1)

rici
rici

Reputation: 241791

If you use the --debug (or -d) command-line flag when generating your scanner, flex will insert code which logs all rule matches (and certain other significant events). In reentrant scanners, such as yours, you also need to insert a call to yyset_debug(1, scanner); to enable the logs; in non-reentrant scanners, the logs are enabled by default. This generally gives you better debugging information than inserting your own printf calls in your scanner actions, and is far less work. (Particularly when it comes time to turn it off.)

I suspect it would have given you enough information to see the typo in your code, which was to define

alpha [:alpha:]+

instead of the correct:

alpha [[:alpha:]]+

As written, {alpha} will match haha, papa and lala. But it won't match blabla, because b is neither one of the letters a, h, l, p nor a colon. With debugging enabled (as above), you would have seen something like this in your output:

--accepting rule at line 85 ("set")
 SET 'set'
--accepting default rule (" ")
--accepting default rule ("b")
--accepting rule at line 103 ("la")
 bZONE_FIELD 'la'

Aside from showing that b is not matched by {alpha}, it shows that you're not correctly handling whitespace; probably, you should add a pattern which matches and ignores horizontal whitespace (or maybe all whitespace):

<*>[ \t]+    ;

I also recommend not relying on the automatic fallback rule. Writing pattern sets which match all possibilities (and using %option nodefault to ensure that all possibilities are matched by some rule) also helps you catch simple pattern errors.

Upvotes: 1

Related Questions