Reputation: 7232
I am working on a C++ string literal parser with Boost.Spirit.
This is what I have so far:
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/home/support/iterators/line_pos_iterator.hpp>
#include <boost/spirit/repository/include/qi_confix.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
using namespace boost::spirit;
#include <boost/fusion/include/adapt_struct.hpp>
////////////////////////////////
// extra facilities
// Function object that reports the line number of a position iterator.
// The grammar wraps it in boost::phoenix::function so it can be invoked
// lazily from semantic actions.
struct get_line_f
{
    // Result-type protocol: for any argument type the call yields size_t.
    template <typename>
    struct result
    {
        typedef size_t type;
    };

    // Delegates to get_line() through an unqualified call on the iterator.
    template <typename PosIterator>
    size_t operator()(PosIterator const& position) const
    {
        size_t const line = get_line(position);
        return line;
    }
};
namespace boost { namespace spirit { namespace traits
{
    // Qi attribute-transformation hook specialised for the type pair
    // (uint16_t, std::string). The bodies are placeholders that only
    // produce marker strings.
    // NOTE(review): Spirit documents the parameters as
    // <Exposed, Transformed, Domain>; the order used here may be the reverse
    // of what attr_cast(hex4) looks up inside a std::string rule - confirm.
    // NOTE(review): `type` is a reference while pre() returns by value -
    // verify this compiles if the specialisation is ever instantiated.
    template <>
    struct transform_attribute<uint16_t, std::string, qi::domain>
    {
        typedef std::string& type;

        static std::string pre(uint16_t& /*exposed*/)
        {
            return "pre16";
        }

        static void post(uint16_t& /*exposed*/, std::string& transformed)
        {
            transformed = "unicode16";
        }

        static void fail(uint16_t& /*exposed*/)
        {
        }
    };
}}}
namespace boost { namespace spirit { namespace traits
{
    // Qi attribute-transformation hook specialised for the type pair
    // (uint32_t, std::string). The bodies are placeholders that only
    // produce marker strings.
    // NOTE(review): Spirit documents the parameters as
    // <Exposed, Transformed, Domain>; the order used here may be the reverse
    // of what attr_cast(hex8) looks up inside a std::string rule - confirm.
    // NOTE(review): `type` is a reference while pre() returns by value -
    // verify this compiles if the specialisation is ever instantiated.
    template <>
    struct transform_attribute<uint32_t, std::string, qi::domain>
    {
        typedef std::string& type;

        static std::string pre(uint32_t& /*exposed*/)
        {
            return "pre32";
        }

        static void post(uint32_t& /*exposed*/, std::string& transformed)
        {
            transformed = "unicode32";
        }

        static void fail(uint32_t& /*exposed*/)
        {
        }
    };
}}}
//
////////////////////////////////
// Line span covered by a parsed element.
struct RangePosition
{
    size_t beginLine;  // line number taken at the start of the matched range
    size_t endLine;    // line number taken at the end of the matched range

    // Both fields start out as size_t(-1), i.e. the largest size_t value
    // (presumably a "not set" marker).
    RangePosition()
        : beginLine(static_cast<size_t>(-1))
        , endLine(static_cast<size_t>(-1))
    {
    }
};
// Result of parsing a string literal: the text produced by the grammar plus
// the raw matched source text, with the line span inherited from
// RangePosition.
struct String : public RangePosition
{
    std::string value;   // text synthesized by the string rule
    std::string source;  // input text exactly as it was matched

    // The base class and both strings are default-constructed.
    String()
    {
    }
};
// Expose String as a Fusion sequence. The member order below fixes the
// at_c<N> indices used by the grammar's semantic actions:
// 0 = value, 1 = source, 2 = beginLine, 3 = endLine.
BOOST_FUSION_ADAPT_STRUCT(String,
(std::string, value)
(std::string, source)
(size_t, beginLine)
(size_t, endLine)
)
// Qi grammar for one or more adjacent C++-style string literals.
// It synthesizes a String; the top-level rule uses qi::space as its skipper,
// while the inner rules are declared without a skipper.
template <typename Iterator>
struct source_string : qi::grammar<Iterator, String(), qi::space_type>
{
// Symbol table mapping each simple escape sequence (a backslash followed by
// one character) to the single character it denotes.
struct escape_symbols : qi::symbols<char, char>
{
escape_symbols()
{
add
("\\\'" , '\'')
("\\\"" , '\"')
("\\\?" , '\?')
("\\\\" , '\\')
("\\0" , '\0')
("\\a" , '\a')
("\\b" , '\b')
("\\f" , '\f')
("\\n" , '\n')
("\\r" , '\r')
("\\t" , '\t')
("\\v" , '\v')
;
}
} escape_symbol;
// Defines every rule of the grammar; `start` is the entry point.
source_string() : source_string::base_type(start)
{
using qi::raw;
using qi::_val;
using qi::_1;
using qi::space;
using qi::omit;
using qi::no_case;
using qi::attr_cast;
using qi::print;
namespace phx = boost::phoenix;
using phx::at_c;
using phx::begin;
using phx::end;
using phx::construct;
// NOTE(review): phx::ref does not appear to be used in this constructor.
using phx::ref;
// A simple escape, looked up in the symbol table.
escape %= escape_symbol;
// One item of a literal, tried in this order: "\x" or "\X" followed by hex12,
// a backslash followed by oct123, a simple escape, or any printable character
// other than the double quote and the backslash.
character %= (no_case["\\x"] >> hex12)
| ("\\" >> oct123)
| escape
| (print - (lit('"') | '\\'));
// Universal character names: "\u" followed by hex4, or "\U" followed by hex8,
// each numeric parser wrapped in attr_cast.
unicode %= ("\\u" >> attr_cast(hex4))
| ("\\U" >> attr_cast(hex8));
// One quoted section: opening quote, any number of items, closing quote.
string_section %= '"' >> *(unicode | character) >> '"';
// One or more sections separated by optional whitespace; the separator's
// attribute is discarded via omit[].
string %= string_section % omit[*space];
// raw[] exposes the matched iterator range to the outer action. The inner
// action stores the text synthesized by `string` in field 0 (value); the outer
// action stores the matched source text in field 1 (source) and the line
// numbers at the range's begin and end in fields 2 and 3.
start = raw[
string[at_c<0>(_val) = _1]
]
[
at_c<1>(_val) = construct<std::string>(begin(_1), end(_1)),
at_c<2>(_val) = get_line_(begin(_1)),
at_c<3>(_val) = get_line_(end(_1))
]
;
}
// Lazy Phoenix wrapper around get_line_f, used in the action of `start`.
boost::phoenix::function<get_line_f> get_line_;
qi::rule<Iterator, String(), qi::space_type> start;
qi::rule<Iterator, std::string()> escape;
// Numeric parsers; the template arguments are
// <value type, radix, minimum digits, maximum digits>.
// NOTE(review): where plain char is signed, values above 127 (for example
// "\xff" or "\377") may be rejected as overflow by hex12/oct123 - confirm,
// and consider unsigned char.
qi::uint_parser<char, 16, 1, 2> hex12;
qi::uint_parser<uint16_t, 16, 4, 4> hex4;
qi::uint_parser<uint32_t, 16, 8, 8> hex8;
qi::uint_parser<char, 8, 1, 3> oct123;
// Skipper-less rules, each synthesizing a std::string.
qi::rule<Iterator, std::string()> character;
qi::rule<Iterator, std::string()> unicode;
qi::rule<Iterator, std::string()> string_section;
qi::rule<Iterator, std::string()> string;
};
And my testing code is:
// Inputs fed to the grammar, one per iteration of the test loop.
std::string str[] =
{
"\"\\u1234\\U12345678\"", // \u and \U universal character names
"\"te\"\"st\"", // two adjacent sections with no separator
"\"te\" \"st\"", // two sections separated by a space
"\"te\" \n \"st\"", // two sections separated by a line break
"\"\"", // empty literal
"\"\\\"\"", // a single escaped double quote
"\"test\"", // plain literal
"\"test\" something", // literal followed by trailing text
"\"\\\'\\\"\\\?\\\\\\a\\b\\f\\n\\r\\t\\v\"", // simple escape sequences
"\"\\x61cd\\X3012\\x7z\"", // hex escapes with lower- and upper-case prefix
"\"\\141cd\\06012\\78\\778\"", // octal escapes
"\"te", // missing closing quote
"\"te\nst\"", // raw newline inside the literal
"\"test\\\"", // final quote is escaped, so the literal is never closed
"\"te\\st\"", // backslash followed by 's', which is not in the escape table
//
};
typedef line_pos_iterator<std::string::const_iterator> Iterator;
std::ostringstream result;
for (size_t i = 0; i < sizeof(str) / sizeof(str[0]); ++i)
{
source_string<Iterator> g;
Iterator iter(str[i].begin());
Iterator end(str[i].end());
String string;
bool r = phrase_parse(iter, end, g, qi::space, string);
if (r)
result << string.beginLine << "-" << string.endLine << ": " << string.value << " === " << string.source << "\n";
else
result << "Parsing failed\n";
}
Can somebody help me understand why, in this rule:
// "\u" is followed by hex4 (exactly 4 hex digits) and "\U" by hex8
// (exactly 8 hex digits); both numeric parsers are wrapped in attr_cast.
unicode %= ("\\u" >> attr_cast(hex4))
| ("\\U" >> attr_cast(hex8));
attr_cast does not invoke the transform_attribute specializations that I have defined?
namespace boost { namespace spirit { namespace traits
{
    // Qi attribute-transformation hook specialised for the type pair
    // (uint16_t, std::string). The bodies are placeholders that only
    // produce marker strings.
    // NOTE(review): Spirit documents the parameters as
    // <Exposed, Transformed, Domain>; the order used here may be the reverse
    // of what attr_cast(hex4) looks up inside a std::string rule - confirm.
    // NOTE(review): `type` is a reference while pre() returns by value -
    // verify this compiles if the specialisation is ever instantiated.
    template <>
    struct transform_attribute<uint16_t, std::string, qi::domain>
    {
        typedef std::string& type;

        static std::string pre(uint16_t& /*exposed*/)
        {
            return "pre16";
        }

        static void post(uint16_t& /*exposed*/, std::string& transformed)
        {
            transformed = "unicode16";
        }

        static void fail(uint16_t& /*exposed*/)
        {
        }
    };
}}}
namespace boost { namespace spirit { namespace traits
{
    // Qi attribute-transformation hook specialised for the type pair
    // (uint32_t, std::string). The bodies are placeholders that only
    // produce marker strings.
    // NOTE(review): Spirit documents the parameters as
    // <Exposed, Transformed, Domain>; the order used here may be the reverse
    // of what attr_cast(hex8) looks up inside a std::string rule - confirm.
    // NOTE(review): `type` is a reference while pre() returns by value -
    // verify this compiles if the specialisation is ever instantiated.
    template <>
    struct transform_attribute<uint32_t, std::string, qi::domain>
    {
        typedef std::string& type;

        static std::string pre(uint32_t& /*exposed*/)
        {
            return "pre32";
        }

        static void post(uint32_t& /*exposed*/, std::string& transformed)
        {
            transformed = "unicode32";
        }

        static void fail(uint32_t& /*exposed*/)
        {
        }
    };
}}}
Upvotes: 2
Views: 314
Reputation: 392999
Making built-in primitive types behave "strangely" seems like a VeryBadIdea™.
Assuming you just wish to decode the escape sequences, I suggest a simpler approach using semantic actions, e.g.:
https://github.com/sehe/spirit-v2-json/blob/master/JSON.cpp#L102
// Excerpt from the linked JSON grammar (the closing parentheses are not shown).
// Every alternative appends to the rule's attribute (qi::_val): a character
// other than the double quote and the backslash is appended as parsed, and a
// backslash escape appends the character it stands for.
char_ = +(
~encoding::char_(L"\"\\")) [ qi::_val += qi::_1 ] |
qi::lit(L"\x5C") >> ( // \ (reverse solidus)
qi::lit(L"\x22") [ qi::_val += L'"' ] | // " quotation mark U+0022
qi::lit(L"\x5C") [ qi::_val += L'\\' ] | // \ reverse solidus U+005C
qi::lit(L"\x2F") [ qi::_val += L'/' ] | // / solidus U+002F
qi::lit(L"\x62") [ qi::_val += L'\b' ] | // b backspace U+0008
qi::lit(L"\x66") [ qi::_val += L'\f' ] | // f form feed U+000C
qi::lit(L"\x6E") [ qi::_val += L'\n' ] | // n line feed U+000A
qi::lit(L"\x72") [ qi::_val += L'\r' ] | // r carriage return U+000D
qi::lit(L"\x74") [ qi::_val += L'\t' ] | // t tab U+0009
qi::lit(L"\x75") // uXXXX U+XXXX
>> _4HEXDIG [ qi::_val += qi::_1 ]
This appears easily adapted to your use case.
Now, if you insist: firstly, wrap the types (so you don't "redefine" essential types for Spirit), and secondly, customize the container insertion traits, since std::string (or rather std::vector<char>?) is a container type.
I wouldn't recommend this though. I like to keep things "simple" and the logic in one place. Obviously this is a "funny" thing to say when using a parser generator like Spirit, because so much appears to go on "magically" behind the scenes. However, that is the nature of abstraction. I don't think I'd want to "abstract" decoding unicode escapes here: they feel as if they belong in the problem domain, not the tooling.
Upvotes: 1