Philipp Gesang

Token meaning dependent on context

I have a weird string syntax where the meaning of a delimiter depends on context. In the following sample input:

( (foo) (bar) )

the result is a list of two strings, ["foo"; "bar"]. The outer pair of parentheses enters list mode. Each subsequent pair of parentheses delimits a string. Inside strings, balanced pairs of parentheses are treated as part of the string.
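
For this input the lexer should accordingly emit the token stream below (my reading of the intended behaviour, not program output):

Enter
String "foo"
String "bar"
Leave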

Right now the lexer decides what to return depending on a global variable inside.

The delimiters are parentheses. If the lexer hits an opening parenthesis, it either enters list mode (setting inside and emitting an Enter token) or, if inside is already set, treats the parenthesis as the start of a string and hands over to a string scanner. If a closing parenthesis is encountered outside a string, a Leave token is emitted and inside is unset.

My question is: How do I rewrite the lexer without the global variable inside?

FWIW I use menhir, but AFAICT the same would be true for ocamlyacc. (Sorry if this sounds confused; I’m really a newbie to the yacc/lex approach. I can express all of the above without thinking as a PEG, but I haven’t got used to mentally keeping the lexer and parser separate. Feel free to point out other issues with the code!)

Simple example: *sample_lexer.mll*

{
  open Sample_parser
  exception Error of string
  let inside = ref false (* <= to be eliminated *)
}

let lpar  = "("
let rpar  = ")"
let ws    = [' ' '\t' '\n' '\r']

rule tokenize = parse
  | ws    { tokenize lexbuf }
  | lpar  { if not !inside then begin
              inside := true;
              Enter
            end else begin
              let buf = Buffer.create 20 in
              String (string_scanner
                        (Lexing.lexeme_start lexbuf)
                        0
                        buf
                        lexbuf)
            end }
  | rpar  { inside := false; Leave }
and string_scanner init depth buf = parse
  | rpar  { if depth = 0 then
              Buffer.contents buf
            else begin
              Buffer.add_char buf ')';
              string_scanner init (depth - 1) buf lexbuf
            end }
  | lpar  { Buffer.add_char buf '(';
            string_scanner init (depth + 1) buf lexbuf }
  | eof   { raise (Error (Printf.sprintf
                           "Unexpected end of file inside string, pos %d--%d]!\n"
                           init
                           (Lexing.lexeme_start lexbuf))) }
  | _ as chr { Buffer.add_char buf chr;
               string_scanner init depth buf lexbuf }

*sample_parser.mly*:

%token <string> String
%token Enter
%token Leave

%start <string list> process

%%

process:
  | Enter lst = string_list Leave { lst }

string_list:
  | elm = element lst = string_list { elm :: lst }
  | elm = element                   { [elm]      }

element:
  | str = String { str }
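
As an aside, menhir’s standard library provides parameterized rules for repetition, so string_list can be expressed with nonempty_list. A minimal sketch of the same grammar:

process:
  | Enter lst = nonempty_list(element) Leave { lst }

element:
  | str = String { str }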

*main.ml*:

open Batteries

let sample_input = "( (foo (bar) baz) (xyzzy) )"
(*                  EibssssssssssssseibssssseiL
 * where E := enter list mode (Enter token)
 *       L := leave list mode (Leave token)
 *       i := ignore (whitespace)
 *       b := begin string
 *       e := end string
 *       s := part of string
 *
 * desired result: [ "foo (bar) baz"; "xyzzy" ] (type string list)
 *)

let main () =
  let buf = Lexing.from_string sample_input in
  try
    List.print
      String.print stdout
      (Sample_parser.process Sample_lexer.tokenize buf);
    print_string "\n"
  with
  | Sample_lexer.Error msg   -> Printf.eprintf "%s%!" msg
  | Sample_parser.Error      -> Printf.eprintf
                                    "Invalid syntax at pos %d.\n%!"
                                    (Lexing.lexeme_start buf)

let _ = main ()
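
For reference, with Batteries’ default list delimiters ("[", "; ", "]") and String.print, which writes strings unquoted, this should print something like:

[foo (bar) baz; xyzzy]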

Answers (1)

Martin Jambon

You can pass the state as an argument to tokenize. It still has to be mutable, but not global.

rule tokenize inside = parse
  | ws    { tokenize inside lexbuf }
  | lpar  { if not !inside then begin
              inside := true;
              Enter
            end else begin
              let buf = Buffer.create 20 in
              String (string_scanner
                        (Lexing.lexeme_start lexbuf)
                        0
                        buf
                        lexbuf)
            end }
  | rpar  { inside := false; Leave }

And you call the parser as follows:

Sample_parser.process (Sample_lexer.tokenize (ref false)) buf
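
Note that string_scanner needs no change: it already threads its state (init, depth, buf) through its arguments in the same way. A fresh ref false per parse also keeps the flag local to that run, so separate parses cannot interfere with each other.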
