Victor
Victor

Reputation: 14593

Keep the LocatedSpan of an "outer" parser with nom in Rust

I am trying to write a parser using the nom crate (together with nom_locate) that can parse strings such as u{12a}, i.e. strings matching the regex:

u\{([0-9a-fA-F]{1,6})\}

I wrote the following parser combinator:

use nom::bytes::complete::{take_while_m_n};
use nom::character::complete::{char};
use nom::combinator::{map_opt, map_res};

use nom::sequence::{delimited, preceded};


/// Input span type used by the parsers: a `&str` wrapped in `nom_locate::LocatedSpan`.
pub type LocatedSpan<'a> = nom_locate::LocatedSpan<&'a str>;
/// `nom::IResult` specialised to [`LocatedSpan`] input.
pub type IResult<'a, T> = nom::IResult<LocatedSpan<'a>, T>;

/// A lexer diagnostic: the span it refers to, followed by a message.
#[derive(Clone, Debug)]
pub struct LexerError<'a>(LocatedSpan<'a>, String);

fn expect<'a, F, E, T>(
    mut parser: F,
    err_msg: E,
) -> impl FnMut(LocatedSpan<'a>) -> IResult<Option<T>>
where
    F: FnMut(LocatedSpan<'a>) -> IResult<T>,
    E: ToString,
{
    use nom::error::Error as NomError;
    move |input| match parser(input) {
        Ok((remaining, output)) => Ok((remaining, Some(output))),
        Err(nom::Err::Error(NomError { input, code: _ }))
        | Err(nom::Err::Failure(NomError { input, code: _ })) => {
            let err = LexerError(input, err_msg.to_string());
            // TODO Report error.
            println!("error: {:?}", err);
            Ok((input, None))
        }
        Err(err) => Err(err),
    }
}

/// Parses a `u{XXXXXX}` escape (1-6 hex digits) into the `char` it denotes.
///
/// Fails when the digits are missing, are not a valid number, or do not map
/// to a Unicode scalar value.
fn lit_str_unicode_char(input: LocatedSpan) -> IResult<char> {
    let hex_digits = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit());
    // FIXME Figure out a way to keep correct span here.
    let braced_hex = delimited(
        char('{'),
        expect(hex_digits, "expected 1-6 hex digits"),
        expect(char('}'), "expected closing brace"),
    );
    let escape = preceded(char('u'), braced_hex);
    // `None` means `expect` already reported the missing digits.
    let code_point = map_res(escape, |digits| {
        digits.ok_or("cannot parse number").and_then(|digits| {
            u32::from_str_radix(digits.fragment(), 16).map_err(|_| "invalid number")
        })
    });
    map_opt(code_point, std::char::from_u32)(input)
}

/// Runs the parser on a sample escape and pretty-prints the raw result.
fn main() {
    let input = LocatedSpan::new("u{61}");
    let outcome = lit_str_unicode_char(input);
    println!("{:#?}", outcome);
}

This works correctly, I am able to get the Unicode character out of the string. However, this approach does not keep the proper spans, i.e.:

u{123}
\..../  <--- the span I want
     \/ <--- the span I get

I figured I could wrap parse_delim_hex in recognize, which would keep the span correct, but then the subsequent parsers would no longer be able to "understand" the digits.

How should I get around this issue?

Upvotes: 1

Views: 358

Answers (1)

Finomnis
Finomnis

Reputation: 22601

I think you misunderstand the purpose of the first parameter of IResult.

Quote from the documentation:

The Ok side is a pair containing the remainder of the input (the part of the data that was not parsed) and the produced value.

The span you are looking at is not the data that was found, but instead the data that was left over afterwards.

I think what you were trying to achieve is something along these lines:

use nom::bytes::complete::take_while_m_n;
use nom::character::complete::char;
use nom::combinator::{map_opt, map_res};
use nom::{InputTake, Offset};

use nom::sequence::{delimited, preceded};

/// Input span type used by the parsers: a `&str` wrapped in `nom_locate::LocatedSpan`.
pub type LocatedSpan<'a> = nom_locate::LocatedSpan<&'a str>;
/// `nom::IResult` specialised to [`LocatedSpan`] input.
pub type IResult<'a, T> = nom::IResult<LocatedSpan<'a>, T>;

/// A lexer diagnostic: the span it refers to, followed by a message.
#[derive(Clone, Debug)]
pub struct LexerError<'a>(LocatedSpan<'a>, String);

fn expect<'a, F, E, T>(
    mut parser: F,
    err_msg: E,
) -> impl FnMut(LocatedSpan<'a>) -> IResult<Option<T>>
where
    F: FnMut(LocatedSpan<'a>) -> IResult<T>,
    E: ToString,
{
    use nom::error::Error as NomError;
    move |input| match parser(input) {
        Ok((remaining, output)) => Ok((remaining, Some(output))),
        Err(nom::Err::Error(NomError { input, code: _ }))
        | Err(nom::Err::Failure(NomError { input, code: _ })) => {
            let err = LexerError(input, err_msg.to_string());
            // TODO Report error.
            println!("error: {:?}", err);
            Ok((input, None))
        }
        Err(err) => Err(err),
    }
}

/// Parses a `u{XXXXXX}` escape (1-6 hex digits) and returns the decoded
/// `char` together with the span of input the escape occupied.
///
/// The first element of the returned `IResult` pair is the *remaining* input;
/// the consumed span is rebuilt from how far the parser advanced.
fn lit_str_unicode_char(input: LocatedSpan) -> IResult<(char, LocatedSpan)> {
    let hex_digits = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit());
    let braced_hex = delimited(
        char('{'),
        expect(hex_digits, "expected 1-6 hex digits"),
        expect(char('}'), "expected closing brace"),
    );
    let escape = preceded(char('u'), braced_hex);
    // `None` means `expect` already reported the missing digits.
    let code_point = map_res(escape, |digits| {
        digits.ok_or("cannot parse number").and_then(|digits| {
            u32::from_str_radix(digits.fragment(), 16).map_err(|_| "invalid number")
        })
    });

    // Do the actual parsing
    let (rest, ch) = map_opt(code_point, std::char::from_u32)(input)?;
    // The distance from the start of `input` to the start of `rest` is the
    // length of the escape; taking that many bytes from `input` gives its span.
    let consumed_len = input.offset(&rest);
    let escape_span = input.take(consumed_len);
    Ok((rest, (ch, escape_span)))
}

/// Parses a sample input and prints the leftover input, the decoded
/// character, and the span the escape covered.
fn main() {
    let input = LocatedSpan::new("u{62} bbbb");
    let (leftover, (character, parsed)) = lit_str_unicode_char(input).unwrap();
    println!("Leftover: {:?}", leftover);
    println!("Character: {:?}", character);
    println!("Parsed Span: {:?}", parsed);
}
Leftover: LocatedSpan { offset: 5, line: 1, fragment: " bbbb", extra: () }
Character: 'b'
Parsed Span: LocatedSpan { offset: 0, line: 1, fragment: "u{62}", extra: () }

Upvotes: 2

Related Questions