Reputation: 43
I have a huge number of files I am trying to parse using boost::spirit::qi. Parsing is not a problem, but some of the files contain noise that I want to skip. Building a simple parser (not using boost::spirit::qi) verifies that I can avoid the noise by skipping anything that doesn't match rules at the beginning of a line. So, I'm looking for a way to write a line-based parser that skips lines when they don't match any rule.
The example below allows the grammar to skip lines if they don't match at all, but the 'junk' rule still inserts an empty instance of V(), which is unwanted behaviour. The use of \r instead of \n in the example is intentional as I have encountered both \n, \r and \r\n in the files.
#include <iostream>
#include <string>
#include <vector>
#include <boost/foreach.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/std_tuple.hpp>
// Short aliases for the Boost.Spirit / Boost.Phoenix namespaces used below.
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phx = boost::phoenix;
// One parsed record: the tag string followed by three doubles.
// Spirit can fill a std::tuple because <boost/fusion/include/std_tuple.hpp> is included.
using V = std::tuple<std::string, double, double, double>;
namespace client {
// Grammar from the question: collects `v <double> <double> <double>` records into a
// std::vector<V> and tries to swallow everything else via `junk`.
// NOTE: this is the version that exhibits the problem being asked about -- a `junk`
// match still contributes a default-constructed V to the result, and because the
// grammar is parsed with the ascii::space skipper, line breaks and leading whitespace
// are skipped instead of being significant.
template <typename Iterator>
struct VGrammar : qi::grammar<Iterator, std::vector<V>(), ascii::space_type> {
VGrammar() : VGrammar::base_type(start) {
using namespace qi;
// A record: the literal "v" followed by three doubles. `>` is Spirit's expectation
// operator: once "v" has matched, a missing double is an error (handled by the
// on_error handler below) rather than an ordinary mismatch.
v %= string("v") > double_ > double_ > double_;
// One or more characters up to, but not including, the end of the line.
junk = +(char_ - eol);
// One or more records or junk runs, in any order.
start %= +(v | junk);
// Human-readable rule names (used by Spirit's debug/diagnostic output).
v.name("v");
junk.name("junk");
start.name("start");
using phx::val;
using phx::construct;
// On an expectation failure inside `start`: print what was expected (_4) and the
// input from the error position (_3) to the end (_2), then make the parse fail.
on_error<fail>(
start,
std::cout
<< val("Error! Expecting \n\n'")
<< qi::_4
<< val("'\n\n here: \n\n'")
<< construct<std::string>(qi::_3, qi::_2)
<< val("'")
<< std::endl
);
//debug(v);
//debug(junk);
//debug(start);
}
// `junk` declares neither an attribute nor a skipper.
qi::rule<Iterator> junk;
//qi::rule<Iterator, qi::unused_type()> junk; // Doesn't work either
//qi::rule<Iterator, qi::unused_type(), qi::unused_type()> junk; // Doesn't work either
// A single record; uses the same whitespace skipper as the grammar.
qi::rule<Iterator, V(), ascii::space_type> v;
// Entry rule: synthesizes the vector of records.
qi::rule<Iterator, std::vector<V>(), ascii::space_type> start;
};
} // namespace client
int main(int argc, char* argv[]) {
using iterator_type = std::string::const_iterator;
std::string input = "";
input += "v 1 2 3\r"; // keep v 1 2 3
input += "o a b c\r"; // parse as junk
input += "v 4 5 6 v 7 8 9\r"; // keep v 4 5 6, but parse v 7 8 9 as junk
input += " v 10 11 12\r\r"; // parse as junk
iterator_type iter = input.begin();
const iterator_type end = input.end();
std::vector<V> parsed_output;
client::VGrammar<iterator_type> v_grammar;
std::cout << "run" << std::endl;
bool r = phrase_parse(iter, end, v_grammar, ascii::space, parsed_output);
std::cout << "done ... r: " << (r ? "true" : "false") << ", iter==end: " << ((iter == end) ? "true" : "false") << std::endl;
if (r && (iter == end)) {
BOOST_FOREACH(V const& v_row, parsed_output) {
std::cout << std::get<0>(v_row) << ", " << std::get<1>(v_row) << ", " << std::get<2>(v_row) << ", " << std::get<3>(v_row) << std::endl;
}
}
return EXIT_SUCCESS;
}
Here's the output from the example:
run
done ... r: true, iter==end: true
v, 1, 2, 3
, 0, 0, 0
v, 4, 5, 6
v, 7, 8, 9
v, 10, 11, 12
And here is what I actually want the parser to return.
run
done ... r: true, iter==end: true
v, 1, 2, 3
v, 4, 5, 6
My main problem right now is to keep the 'junk' rule from adding an empty V() object. How do I accomplish this? Or am I overthinking the problem?
I have tried adding lit(junk) to the start rule, since lit() doesn't return anything, but this will not compile. It fails with: "static assertion failed: error_invalid_expression".
I have also tried to set the semantic action on the junk rule to qi::unused_type() but the rule still creates an empty V() in that case.
I am aware of the following questions, but they don't address this particular issue. I have tried out the comment skipper earlier, but it looks like I'll have to reimplement all the parse rules in the skipper in order to identify noise. My example is inspired by the solution in the last link:
How to skip line/block/nested-block comments in Boost.Spirit?
How to parse entries followed by semicolon or newline (boost::spirit)?
Version info:
Linux debian 4.9.0-7-amd64 #1 SMP Debian 4.9.110-3+deb9u2 (2018-08-13) x86_64 GNU/Linux
g++ (Debian 6.3.0-18+deb9u1) 6.3.0 20170516
#define BOOST_VERSION 106200
and:
Linux raspberrypi 4.14.24-v7+ #1097 SMP Mon Mar 5 16:42:05 GMT 2018 armv7l GNU/Linux
g++ (Raspbian 4.9.2-10+deb8u1) 4.9.2
#define BOOST_VERSION 106200
For those who wonder: yes I'm trying to parse files similar to Wavefront OBJ files and I'm aware that there is already a bunch of parsers available. However, the data I'm parsing is part of a larger data structure which also requires parsing, so it does make sense to build a new parser.
Upvotes: 4
Views: 220
Reputation: 3785
What you want to achieve is called error recovery.
Unfortunately, Spirit does not have a nice way of doing it (there are also some internal design decisions which make it hard to implement externally). However, in your case it is simple to achieve by rewriting the grammar.
#include <iostream>
#include <string>
#include <vector>
#include <boost/foreach.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/std_tuple.hpp>
// Short aliases for the Boost.Spirit / Boost.Phoenix namespaces used below.
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phx = boost::phoenix;
// One parsed record: the tag string followed by three doubles.
// Spirit can fill a std::tuple because <boost/fusion/include/std_tuple.hpp> is included.
using V = std::tuple<std::string, double, double, double>;
namespace client {
// Rewritten grammar: no grammar-wide skipper, so end-of-line and leading whitespace
// stay visible to the rules. The input is treated as a list of lines separated by eol.
template <typename Iterator>
struct VGrammar : qi::grammar<Iterator, std::vector<V>()> {
VGrammar() : VGrammar::base_type(start) {
using namespace qi;
// A record: "v" followed by three doubles, with blanks skipped locally via
// skip(blank). The no_skip[] around the "v" literal means no blanks are skipped
// in front of it, so an indented "v" does not start a record. `>` is the
// expectation operator: after "v", a missing double is an error (see on_error).
v = skip(blank)[no_skip[string("v")] > double_ > double_ > double_];
// One or more characters up to, but not including, the end of the line.
junk = +(char_ - eol);
// Each eol-separated line is a record optionally followed by junk, or junk alone;
// since `-junk` may match nothing, an empty line is accepted as well.
start = (v || -junk) % eol;
// Human-readable rule names (used by Spirit's debug/diagnostic output).
v.name("v");
junk.name("junk");
start.name("start");
using phx::val;
using phx::construct;
// On an expectation failure inside `start`: print what was expected (_4) and the
// input from the error position (_3) to the end (_2), then make the parse fail.
on_error<fail>(
start,
std::cout
<< val("Error! Expecting \n\n'")
<< qi::_4
<< val("'\n\n here: \n\n'")
<< construct<std::string>(qi::_3, qi::_2)
<< val("'")
<< std::endl
);
//debug(v);
//debug(junk);
//debug(start);
}
// `junk` declares neither an attribute nor a skipper.
qi::rule<Iterator> junk;
//qi::rule<Iterator, qi::unused_type()> junk; // Doesn't work either
//qi::rule<Iterator, qi::unused_type(), qi::unused_type()> junk; // Doesn't work either
// A single record (no rule-level skipper; skipping happens inside the rule body).
qi::rule<Iterator, V()> v;
// Entry rule: synthesizes the vector of records.
qi::rule<Iterator, std::vector<V>()> start;
};
} // namespace client
int main(int argc, char* argv[]) {
using iterator_type = std::string::const_iterator;
std::string input = "";
input += "v 1 2 3\r"; // keep v 1 2 3
input += "o a b c\r"; // parse as junk
input += "v 4 5 6 v 7 8 9\r"; // keep v 4 5 6, but parse v 7 8 9 as junk
input += " v 10 11 12\r\r"; // parse as junk
iterator_type iter = input.begin();
const iterator_type end = input.end();
std::vector<V> parsed_output;
client::VGrammar<iterator_type> v_grammar;
std::cout << "run" << std::endl;
bool r = parse(iter, end, v_grammar, parsed_output);
std::cout << "done ... r: " << (r ? "true" : "false") << ", iter==end: " << ((iter == end) ? "true" : "false") << std::endl;
if (r && (iter == end)) {
BOOST_FOREACH(V const& v_row, parsed_output) {
std::cout << std::get<0>(v_row) << ", " << std::get<1>(v_row) << ", " << std::get<2>(v_row) << ", " << std::get<3>(v_row) << std::endl;
}
}
return EXIT_SUCCESS;
}
Upvotes: 3
Reputation: 392989
I have tried adding lit(junk) to the start rule, since lit() doesn't return anything, but this will not compile. It fails with: "static assertion failed: error_invalid_expression".
What you're looking for would be omit[junk], but it should make no difference because it will still make the synthesized attribute optional<>.
First of all, you need newlines to be significant, which means you cannot skip space, because it eats newlines. What's worse, you need leading whitespace to be significant as well (to junk that last line, e.g.). You cannot even use qi::blank for the skipper then. (See Boost spirit skipper issues).
Just so you can still have whitespace inside the v rule, just have a local skipper (that doesn't eat newlines):
v %= &lit("v") >> skip(blank) [ string("v") > double_ > double_ > double_ ];
It engages the skipper only after establishing that there was no unexpected leading whitespace.
Note that the string("v") is a bit redundant this way, but that brings us to the second motive:
Second of all, I'm with you in avoiding semantic actions. However, this means you have to make your rules reflect your data structures.
In this particular instance, it means you should probably turn the line skipping a bit inside-out. What if you express the grammar as a straight repeat of v, interspersed with /whatever/, instead of just /newline/? I'd write that like:
junk = *(char_ - eol);
other = !v >> junk;
start = *(v >> junk >> eol % other);
Note that:
- operator% (list operator) itself: (eol % other). What this cleverly accomplishes is that it keeps eating newlines as long as they are only delimited by "other" lines (anything !v at this point).
- other is more constrained than junk, because junk may eat v, whereas other makes sure that never happens.
- v >> junk allows the third line of your sample to be correctly processed (the line that has v 4 5 6 v 7 8 9\r).
Now it all works: Live On Coliru:
run
done ... r: true, iter==end: true
v, 1, 2, 3
v, 4, 5, 6
You might be aware of the fact that this does not handle the case when the first line(s) are not v lines. Let's add that case to the sample and make sure it works as well:
//#define BOOST_SPIRIT_DEBUG
#include <iostream>
#include <string>
#include <vector>
#include <boost/foreach.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/std_tuple.hpp>
// Short aliases for the Boost.Spirit / Boost.Phoenix namespaces used below.
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
// One parsed record: the tag string followed by three doubles.
// Spirit can fill a std::tuple because <boost/fusion/include/std_tuple.hpp> is included.
using V = std::tuple<std::string, double, double, double>;
namespace client {

/// Line-based grammar that collects `v <double> <double> <double>` records into a
/// std::vector<V> and discards everything else without adding elements for it.
///
/// The grammar has no skipper, so end-of-line and leading whitespace are significant:
/// a record must start in the first column of a line; anything following a record on
/// the same line, and any line that is not a record, is consumed as junk.
///
/// Input may start either with a record or with non-record lines.
template <typename Iterator>
struct VGrammar : qi::grammar<Iterator, std::vector<V>()> {
    VGrammar() : VGrammar::base_type(start) {
        using namespace qi;

        // The and-predicate checks for a "v" in the first column *before* the blank
        // skipper is engaged, so an indented "v" is not a record. `>` is the
        // expectation operator: after "v", a missing double is an error (see on_error).
        v %= &lit("v") >> skip(blank) [ string("v") > double_ > double_ > double_ ];

        // Rest of the current line (possibly empty); never consumes the eol itself.
        junk = *(char_ - eol);

        // A whole line that is not a record. Unlike `junk`, this can never eat a `v`.
        other = !v >> junk;

        // `%` binds tighter than `>>`: `eol % other` is one or more eols separated only
        // by non-record lines.
        //
        // The leading non-record section is optional. With a mandatory `other` here
        // the grammar would fail outright on input whose first line is a record,
        // because `other` starts with `!v`.
        start =
            -(other >> eol % other) >>
            *(v >> junk >> eol % other);

        BOOST_SPIRIT_DEBUG_NODES((v)(junk)(start))

        // On an expectation failure inside `start`: print what was expected (_4) and
        // the input from the error position (_3) to the end (_2), then fail the parse.
        on_error<fail>(
            start,
            std::cout
                << phx::val("Error! Expecting \n\n'") << qi::_4
                << "'\n\n here: \n\n'" << phx::construct<std::string>(qi::_3, qi::_2)
                << "'\n"
        );
    }

private:
    // Attribute-less helper rules: they only consume input.
    qi::rule<Iterator> other, junk;
    // A single record.
    qi::rule<Iterator, V()> v;
    // Entry rule: synthesizes the vector of records.
    qi::rule<Iterator, std::vector<V>()> start;
};

} // namespace client
int main() {
using iterator_type = std::string::const_iterator;
std::string input = "";
input += "o a b c\r"; // parse as junk
input += "v 1 2 3\r"; // keep v 1 2 3
input += "o a b c\r"; // parse as junk
input += "v 4 5 6 v 7 8 9\r"; // keep v 4 5 6, but parse v 7 8 9 as junk
input += " v 10 11 12\r\r"; // parse as junk
iterator_type iter = input.begin();
const iterator_type end = input.end();
std::vector<V> parsed_output;
client::VGrammar<iterator_type> v_grammar;
std::cout << "run" << std::endl;
bool r = parse(iter, end, v_grammar, parsed_output);
std::cout << "done ... r: " << (r ? "true" : "false") << ", iter==end: " << ((iter == end) ? "true" : "false") << std::endl;
if (iter != end)
std::cout << "Remaining unparsed: '" << std::string(iter, end) << "'\n";
if (r) {
BOOST_FOREACH(V const& v_row, parsed_output) {
std::cout << std::get<0>(v_row) << ", " << std::get<1>(v_row) << ", " << std::get<2>(v_row) << ", " << std::get<3>(v_row) << std::endl;
}
}
return EXIT_SUCCESS;
}
Upvotes: 3