Reputation: 209
I am learning to use boost::spirit. To do that, I wanted to create some simple lexers, combine them, and then start parsing using Spirit. But the result is quite confusing:
Here's the lexer:
// #define BOOST_SPIRIT_LEXERTL_DEBUG
#define BOOST_VARIANT_MINIMIZE_SIZE
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#include <boost/spirit/include/phoenix_container.hpp>
#include <iostream>
#include <string>
using namespace boost::spirit;
using namespace boost::spirit::ascii;
// Token identifiers exposed by the lexer below. Values start above
// lex::min_token_id so they do not collide with ids that Spirit.Lex
// reserves for internal use.
enum tokenids
{
IDANY = lex::min_token_id + 10, // catch-all: any single character not matched by the tokens below
T_USER,        // ({UNRESERVED})+  — alphanumerics and mark characters
T_DOMAINLABEL, // ({ALPHANUM})+    — alphanumerics only
T_CRLF         // literal "\r\n" line terminator
};
template <typename Lexer>
struct sip_token : lex::lexer<Lexer>
{
    // Defines the token set for a SIP-like input.
    //
    // NOTE on token ordering: lexertl tokenization is eager — it picks the
    // longest match, and among matches of equal length the token that was
    // added FIRST wins. Every DOMAINLABEL ({ALPHANUM}+) is also a valid
    // USER ({UNRESERVED}+), so {DOMAINLABEL} must be added before {USER};
    // otherwise T_DOMAINLABEL can never be produced. {USER} still wins
    // whenever the input contains a MARK character, because that match is
    // strictly longer than the pure-alphanumeric prefix.
    sip_token()
    {
        this->self.add_pattern
            ("ALPHANUM", "[0-9a-zA-Z]")
            ("MARK", "[-_.!~*'()]")
            ("UNRESERVED", "{ALPHANUM}|{MARK}")
            ("USER", "({UNRESERVED})+")
            ("DOMAINLABEL", "({ALPHANUM})+")
            // ("DOMAINLABEL", "{ALPHANUM}|({ALPHANUM}({ALPHANUM}|-)*{ALPHANUM})")
            ;
        this->self.add
            ("{DOMAINLABEL}", T_DOMAINLABEL) // must precede {USER} — see ordering note above
            ("{USER}", T_USER)
            ("\r\n", T_CRLF)
            (".", IDANY) // string literals will not be escaped by the library
            ;
    }
};
template <typename Iterator>
struct sip_grammar : qi::grammar<Iterator>
{
    // Grammar over the token stream: matches one T_DOMAINLABEL followed by
    // one T_CRLF, updating the counters via Phoenix semantic actions.
    //
    // tok is accepted for API symmetry with the Spirit.Lex examples; this
    // grammar matches on token ids directly, so it is not referenced here.
    template <typename TokenDef>
    sip_grammar(TokenDef const& /*tok*/)
        : sip_grammar::base_type(start)
        , c(0), w(0), l(0)
    {
        using boost::phoenix::ref;

        start = (
                    qi::token(T_DOMAINLABEL)[++ref(c), ++ref(l)] // count the label
                    >> qi::token(T_CRLF)     [++ref(w)]          // count the line terminator
                );
    }

    std::size_t c, w, l; // counters incremented by the semantic actions above
    qi::rule<Iterator> start;
};
int main(int argc, char* argv[])
{
typedef lex::lexertl::token<
char const*, boost::mpl::vector<std::string>
> token_type;
typedef std::string::const_iterator str_iterator_type;
typedef lex::lexertl::lexer<token_type> lexer_type;
typedef sip_token<lexer_type>::iterator_type iterator_type;
std::string str;
while (std::getline(std::cin, str))
{
if (str.empty() || str[0] == 'q' || str[0] == 'Q')
break;
else
str += "\r\n";
sip_token<lexer_type> siplexer;
sip_grammar<iterator_type > g(siplexer);
char const* first = str.c_str();
char const* last = &first[str.size()];
/*< Parsing is done based on the the token stream, not the character
stream read from the input. The function `tokenize_and_parse()` wraps
the passed iterator range `[first, last)` by the lexical analyzer and
uses its exposed iterators to parse the toke stream.
>*/
unsigned result = 0;
bool r = lex::tokenize_and_parse(first, last, siplexer, g);
if (r) {
std::cout << "Parsing OK" << g.l << ", " << g.w
<< ", " << g.c << "\n";
}
else {
std::string rest(first, last);
std::cerr << "Parsing failed\n" << "stopped at: \""
<< rest << "\"\n";
}
}
return 0;
}
//]
In the code, I added "T_DOMAINLABEL" after "T_USER", and T_DOMAINLABEL always gets a parsing failure. It seems the lexer will match T_USER first. Why is that? Does it mean I can't add these similar patterns together?
Upvotes: 2
Views: 283
Reputation: 393789
Well, T_USER matches:
("{USER}",T_USER)
// which is defined as
("USER", "({UNRESERVED})+" )
// which is defined as
("UNRESERVED","{ALPHANUM}|{MARK}")
So, it takes any series of alphanumeric characters (as well as 'marks', which is irrelevant now)
T_DOMAINLABEL matches:
("{DOMAINLABEL}", T_DOMAINLABEL)
// which is defined as
("DOMAINLABEL", "({ALPHANUM})+")
As you can see, any T_DOMAINLABEL token is always a valid T_USER token. So, there is no way it would ever get a T_DOMAINLABEL.
This is not because of "the token not matching", it's a result of tokenizing being eager and not doing backtracking (outside a single token).
Upvotes: 2