Reputation: 14593
I am trying to write a parser using the `nom` crate (together with `nom_locate`) that can parse strings such as `u{12a}`, i.e. strings matching the regular expression:
u\{([0-9a-fA-F]{1,6})\}
I wrote the following parser combinator:
use nom::bytes::complete::{take_while_m_n};
use nom::character::complete::{char};
use nom::combinator::{map_opt, map_res};
use nom::sequence::{delimited, preceded};
/// Parser input type: a `&str` wrapped in `nom_locate::LocatedSpan`.
pub type LocatedSpan<'a> = nom_locate::LocatedSpan<&'a str>;
/// `nom::IResult` with the input type fixed to [`LocatedSpan`]; the error
/// type parameter is left at nom's default.
pub type IResult<'a, T> = nom::IResult<LocatedSpan<'a>, T>;
/// A lexing error: the span taken from the failing nom error, plus a
/// human-readable message.
#[derive(Clone, Debug)]
pub struct LexerError<'a>(LocatedSpan<'a>, String);
fn expect<'a, F, E, T>(
mut parser: F,
err_msg: E,
) -> impl FnMut(LocatedSpan<'a>) -> IResult<Option<T>>
where
F: FnMut(LocatedSpan<'a>) -> IResult<T>,
E: ToString,
{
use nom::error::Error as NomError;
move |input| match parser(input) {
Ok((remaining, output)) => Ok((remaining, Some(output))),
Err(nom::Err::Error(NomError { input, code: _ }))
| Err(nom::Err::Failure(NomError { input, code: _ })) => {
let err = LexerError(input, err_msg.to_string());
// TODO Report error.
println!("error: {:?}", err);
Ok((input, None))
}
Err(err) => Err(err),
}
}
/// Parses a `u{...}` escape containing 1-6 hexadecimal digits and returns the
/// character with that code point.
///
/// Missing digits or a missing closing brace are reported through `expect`;
/// missing digits then surface as a "cannot parse number" error, and a value
/// that is not a valid `char` makes the final `map_opt` fail.
fn lit_str_unicode_char(input: LocatedSpan) -> IResult<char> {
    let hex_digits = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit());
    // FIXME Figure out a way to keep correct span here.
    let escape_body = preceded(
        char('u'),
        delimited(
            char('{'),
            expect(hex_digits, "expected 1-6 hex digits"),
            expect(char('}'), "expected closing brace"),
        ),
    );
    // `None` means `expect` already reported missing digits.
    let code_point = map_res(escape_body, |digits| {
        digits.ok_or("cannot parse number").and_then(|span| {
            u32::from_str_radix(span.fragment(), 16).map_err(|_| "invalid number")
        })
    });
    map_opt(code_point, std::char::from_u32)(input)
}
/// Runs the escape parser on a sample input and pretty-prints the raw result.
fn main() {
    let outcome = lit_str_unicode_char(LocatedSpan::new("u{61}"));
    println!("{:#?}", outcome);
}
This works correctly: I am able to get the Unicode character out of the string. However, this approach does not keep the proper spans, i.e.:
u{123}
\..../ <--- the span I want
\/ <--- the span I get
I figured I could wrap `parse_delim_hex` in a `recognize`, which would keep the span correctly, but then I couldn't use the following parsers to "understand" the digits.
How should I get around this issue?
Upvotes: 1
Views: 358
Reputation: 22601
I think you misunderstand the purpose of the first parameter of `IResult`.
Quote from the documentation:
The Ok side is a pair containing the remainder of the input (the part of the data that was not parsed) and the produced value.
The span you are looking at is not the data that was found, but instead the data that was left over afterwards.
I think what you were trying to achieve is something along these lines:
use nom::bytes::complete::take_while_m_n;
use nom::character::complete::char;
use nom::combinator::{map_opt, map_res};
use nom::{InputTake, Offset};
use nom::sequence::{delimited, preceded};
/// Parser input type: a `&str` wrapped in `nom_locate::LocatedSpan`.
pub type LocatedSpan<'a> = nom_locate::LocatedSpan<&'a str>;
/// `nom::IResult` with the input type fixed to [`LocatedSpan`]; the error
/// type parameter is left at nom's default.
pub type IResult<'a, T> = nom::IResult<LocatedSpan<'a>, T>;
/// A lexing error: the span taken from the failing nom error, plus a
/// human-readable message.
#[derive(Clone, Debug)]
pub struct LexerError<'a>(LocatedSpan<'a>, String);
fn expect<'a, F, E, T>(
mut parser: F,
err_msg: E,
) -> impl FnMut(LocatedSpan<'a>) -> IResult<Option<T>>
where
F: FnMut(LocatedSpan<'a>) -> IResult<T>,
E: ToString,
{
use nom::error::Error as NomError;
move |input| match parser(input) {
Ok((remaining, output)) => Ok((remaining, Some(output))),
Err(nom::Err::Error(NomError { input, code: _ }))
| Err(nom::Err::Failure(NomError { input, code: _ })) => {
let err = LexerError(input, err_msg.to_string());
// TODO Report error.
println!("error: {:?}", err);
Ok((input, None))
}
Err(err) => Err(err),
}
}
/// Parses a `u{...}` escape containing 1-6 hexadecimal digits.
///
/// On success the output is a pair: the decoded character and the span of
/// input that was consumed to produce it. The first element of the returned
/// tuple is, as usual for nom, the remaining unparsed input.
fn lit_str_unicode_char(input: LocatedSpan) -> IResult<(char, LocatedSpan)> {
    let hex_digits = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit());
    let escape_body = preceded(
        char('u'),
        delimited(
            char('{'),
            expect(hex_digits, "expected 1-6 hex digits"),
            expect(char('}'), "expected closing brace"),
        ),
    );
    // `None` means `expect` already reported missing digits.
    let code_point = map_res(escape_body, |digits| {
        digits.ok_or("cannot parse number").and_then(|span| {
            u32::from_str_radix(span.fragment(), 16).map_err(|_| "invalid number")
        })
    });
    // Do the actual parsing
    let (rest, decoded) = map_opt(code_point, std::char::from_u32)(input)?;
    // The consumed span is the prefix of `input` that ends where `rest` begins.
    let consumed = input.take(input.offset(&rest));
    Ok((rest, (decoded, consumed)))
}
/// Parses a sample input and prints the leftover input, the decoded
/// character, and the span that was consumed.
fn main() {
    let source = LocatedSpan::new("u{62} bbbb");
    let (leftover, (character, parsed)) = lit_str_unicode_char(source).unwrap();
    println!("Leftover: {:?}", leftover);
    println!("Character: {:?}", character);
    println!("Parsed Span: {:?}", parsed);
}
Leftover: LocatedSpan { offset: 5, line: 1, fragment: " bbbb", extra: () }
Character: 'b'
Parsed Span: LocatedSpan { offset: 0, line: 1, fragment: "u{62}", extra: () }
Upvotes: 2