gsf
gsf

Reputation: 7232

C++ Boost.Spirit attr_cast does not invoke the expected transform_attribute

I am working on a C++ string literal parser with Boost.Spirit.

This is what I have so far:

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/home/support/iterators/line_pos_iterator.hpp>
#include <boost/spirit/repository/include/qi_confix.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>

using namespace boost::spirit;

#include <boost/fusion/include/adapt_struct.hpp>

////////////////////////////////
// extra facilities
// Function object that forwards to the unqualified get_line() found for the
// iterator type it is given. Wrapped in boost::phoenix::function elsewhere in
// this file so it can be called lazily from semantic actions.
struct get_line_f
{
    // Result-type protocol: the call always yields a size_t, regardless of
    // the signature it is queried with.
    template <typename>
    struct result
    {
        typedef size_t type;
    };

    // Returns whatever get_line() reports for the given iterator position.
    template <typename It>
    size_t operator()(It const& pos_iter) const
    {
        size_t const line = get_line(pos_iter);
        return line;
    }
};

// Specialization of Spirit's transform_attribute customization point for the
// type pair (uint16_t, std::string) in the Qi domain.
// pre() hands back the fixed placeholder "pre16" and post() overwrites the
// string with the fixed placeholder "unicode16"; no real decoding is done.
// NOTE(review): Spirit documents this trait as
// transform_attribute<Exposed, Transformed, Domain> with pre() returning
// `type`. Here `type` is std::string& while pre() returns std::string by
// value, and the argument order looks reversed for the intended use (the rule
// exposes std::string, the parser yields uint16_t) -- confirm against the
// Boost.Spirit attribute-transformation documentation.
namespace boost { namespace spirit { namespace traits
{
    template <>
    struct transform_attribute<uint16_t, std::string, qi::domain>
    {
        typedef std::string& type;
        static std::string pre(uint16_t& d) { return "pre16"; }
        static void post(uint16_t& val, std::string& attr) { attr = "unicode16"; }
        static void fail(uint16_t&) {}
    };
}}}

// Specialization of Spirit's transform_attribute customization point for the
// type pair (uint32_t, std::string) in the Qi domain.
// pre() hands back the fixed placeholder "pre32" and post() overwrites the
// string with the fixed placeholder "unicode32"; no real decoding is done.
// NOTE(review): Spirit documents this trait as
// transform_attribute<Exposed, Transformed, Domain> with pre() returning
// `type`. Here `type` is std::string& while pre() returns std::string by
// value, and the argument order looks reversed for the intended use (the rule
// exposes std::string, the parser yields uint32_t) -- confirm against the
// Boost.Spirit attribute-transformation documentation.
namespace boost { namespace spirit { namespace traits
{
    template <>
    struct transform_attribute<uint32_t, std::string, qi::domain>
    {
        typedef std::string& type;
        static std::string pre(uint32_t& d) { return "pre32"; }
        static void post(uint32_t& val, std::string& attr) { attr = "unicode32"; }
        static void fail(uint32_t&) {}
    };
}}}

//
////////////////////////////////

// Pair of line numbers delimiting a parsed element.
// Both fields start out as static_cast<size_t>(-1), i.e. the largest value a
// size_t can hold.
struct RangePosition
{
    size_t beginLine;
    size_t endLine;

    RangePosition()
        : beginLine(static_cast<size_t>(-1))
        , endLine(static_cast<size_t>(-1))
    {
    }
};

// Result of parsing a string literal: the line range inherited from
// RangePosition plus two strings, both empty after default construction.
struct String : public RangePosition
{
    // Filled from the attribute of the grammar's `string` rule.
    std::string value;
    // Filled with the raw input text matched by the grammar.
    std::string source;

    String() : RangePosition(), value(), source() {}
};

// Adapt String as a Fusion sequence so its members can be addressed by index:
// at_c<0> = value, at_c<1> = source, at_c<2> = beginLine, at_c<3> = endLine.
// The member order here must stay in sync with the at_c<N> uses in the
// source_string grammar.
BOOST_FUSION_ADAPT_STRUCT(String,
                            (std::string, value)
                            (std::string, source)
                            (size_t,      beginLine)
                            (size_t,      endLine)
                          )

// Qi grammar for C++-style string literals.
//
// Matches one or more double-quoted sections (qi::space is the skipper at the
// top level) and fills a String with: the attribute of the `string` rule, the
// raw matched source text, and the values get_line_ reports for the beginning
// and the end of the matched range.
// Iterator is presumably expected to be a line_pos_iterator so that
// get_line() is available for it -- verify against the caller.
template <typename Iterator>
struct source_string : qi::grammar<Iterator, String(), qi::space_type>
{
    // Symbol table mapping each simple backslash escape, as spelled in the
    // input (backslash + one character), to the single character it denotes.
    struct escape_symbols : qi::symbols<char, char>
    {
        escape_symbols()
        {
            add
                ("\\\'"    , '\'')
                ("\\\""    , '\"')
                ("\\\?"    , '\?')
                ("\\\\"    , '\\')
                ("\\0"     , '\0')
                ("\\a"     , '\a')
                ("\\b"     , '\b')
                ("\\f"     , '\f')
                ("\\n"     , '\n')
                ("\\r"     , '\r')
                ("\\t"     , '\t')
                ("\\v"     , '\v')
            ;
        }

    } escape_symbol;

    // Wires up all rules; `start` is registered as the grammar's entry rule.
    source_string() : source_string::base_type(start)
    {
        using qi::raw;
        using qi::_val;
        using qi::_1;
        using qi::space;
        using qi::omit;
        using qi::no_case;
        using qi::attr_cast;
        using qi::print;

        namespace phx = boost::phoenix;
        using phx::at_c;
        using phx::begin;
        using phx::end;
        using phx::construct;
        using phx::ref;

        // Simple escapes are looked up in the symbol table.
        escape %= escape_symbol;

        // One character of string content, tried in this order:
        //   \x or \X followed by hex12, a backslash followed by oct123,
        //   a symbol-table escape, or any printable character other than
        //   '"' and '\'.
        character %=   (no_case["\\x"] >> hex12)
                     | ("\\"  >> oct123)
                     | escape
                     | (print - (lit('"') | '\\'));

        // \u followed by hex4, or \U followed by hex8.
        // NOTE(review): attr_cast is called without explicit template
        // arguments, and the question text reports that the
        // transform_attribute specializations above are not invoked here.
        // Presumably explicit attr_cast<Exposed, Transformed>(...) arguments
        // and matching specializations are required -- confirm against the
        // Boost.Spirit attr_cast documentation.
        unicode %=   ("\\u" >> attr_cast(hex4))
                   | ("\\U" >> attr_cast(hex8));

        // A single double-quoted section made of unicode escapes and characters.
        string_section %= '"' >> *(unicode | character) >> '"';

        // One or more sections; whitespace between them is matched but its
        // attribute is discarded by omit[].
        string %= string_section % omit[*space];

        // raw[] exposes the matched iterator range to the outer semantic
        // action, which stores the source text and both line numbers; the
        // inner action stores the attribute of `string` in member 0 (value).
        start = raw[
                        string[at_c<0>(_val) = _1]
                   ]
                   [
                       at_c<1>(_val) = construct<std::string>(begin(_1), end(_1)),
                       at_c<2>(_val) = get_line_(begin(_1)),
                       at_c<3>(_val) = get_line_(end(_1))
                   ]
        ;
    }

    // Lazy (Phoenix) wrapper around get_line_f for use in semantic actions.
    boost::phoenix::function<get_line_f> get_line_;
    // Entry rule; the only rule declared with the space skipper.
    qi::rule<Iterator, String(), qi::space_type> start;
    qi::rule<Iterator, std::string()> escape;
    // Radix-16 parser, 1 to 2 digits, result type char.
    qi::uint_parser<char, 16, 1, 2> hex12;
    // Radix-16 parser, exactly 4 digits, result type uint16_t.
    qi::uint_parser<uint16_t, 16, 4, 4> hex4;
    // Radix-16 parser, exactly 8 digits, result type uint32_t.
    qi::uint_parser<uint32_t, 16, 8, 8> hex8;
    // Radix-8 parser, 1 to 3 digits, result type char.
    qi::uint_parser<char,  8, 1, 3> oct123;
    qi::rule<Iterator, std::string()> character;
    qi::rule<Iterator, std::string()> unicode;
    qi::rule<Iterator, std::string()> string_section;
    qi::rule<Iterator, std::string()> string;
};

and my testing code is

// Test inputs; each element is one complete input handed to the parser.
std::string str[] =
{
    // \u escape with 4 hex digits followed by \U escape with 8 hex digits
    "\"\\u1234\\U12345678\"",

    // two quoted sections: directly adjacent, separated by spaces,
    // separated by spaces and a newline
    "\"te\"\"st\"",
    "\"te\"  \"st\"",
    "\"te\" \n \"st\"",
    // empty quoted section
    "\"\"",
    // a single escaped double quote
    "\"\\\"\"",
    // plain text
    "\"test\"",
    // quoted section followed by unquoted trailing text
    "\"test\" something",
    // simple backslash escapes: \' \" \? \\ \a \b \f \n \r \t \v
    "\"\\\'\\\"\\\?\\\\\\a\\b\\f\\n\\r\\t\\v\"",
    // \x / \X hex escapes followed by further characters
    "\"\\x61cd\\X3012\\x7z\"",
    // backslash + octal digit escapes followed by further characters
    "\"\\141cd\\06012\\78\\778\"",
    // opening quote without a closing quote
    "\"te",
    // literal newline character inside the quotes
    "\"te\nst\"",
    // the only closing quote is preceded by a backslash
    "\"test\\\"",
    // backslash followed by 's', which is not in the escape symbol table
    "\"te\\st\"",
    //
};

typedef line_pos_iterator<std::string::const_iterator> Iterator;

std::ostringstream result;

for (size_t i = 0; i < sizeof(str) / sizeof(str[0]); ++i)
{
    source_string<Iterator> g;
    Iterator iter(str[i].begin());
    Iterator end(str[i].end());

    String string;
    bool r = phrase_parse(iter, end, g, qi::space, string);
    if (r)
        result << string.beginLine << "-" << string.endLine << ": " << string.value << " === " << string.source << "\n";
    else
        result << "Parsing failed\n";
}

Can somebody help me understand why, in this rule:

        unicode %=   ("\\u" >> attr_cast(hex4))
                   | ("\\U" >> attr_cast(hex8));

attr_cast does not invoke the transform_attribute specializations that I have defined?

// Specialization of Spirit's transform_attribute customization point for the
// type pair (uint16_t, std::string) in the Qi domain.
// pre() hands back the fixed placeholder "pre16" and post() overwrites the
// string with the fixed placeholder "unicode16"; no real decoding is done.
// NOTE(review): Spirit documents this trait as
// transform_attribute<Exposed, Transformed, Domain> with pre() returning
// `type`. Here `type` is std::string& while pre() returns std::string by
// value, and the argument order looks reversed for the intended use (the rule
// exposes std::string, the parser yields uint16_t) -- confirm against the
// Boost.Spirit attribute-transformation documentation.
namespace boost { namespace spirit { namespace traits
{
    template <>
    struct transform_attribute<uint16_t, std::string, qi::domain>
    {
        typedef std::string& type;
        static std::string pre(uint16_t& d) { return "pre16"; }
        static void post(uint16_t& val, std::string& attr) { attr = "unicode16"; }
        static void fail(uint16_t&) {}
    };
}}}

// Specialization of Spirit's transform_attribute customization point for the
// type pair (uint32_t, std::string) in the Qi domain.
// pre() hands back the fixed placeholder "pre32" and post() overwrites the
// string with the fixed placeholder "unicode32"; no real decoding is done.
// NOTE(review): Spirit documents this trait as
// transform_attribute<Exposed, Transformed, Domain> with pre() returning
// `type`. Here `type` is std::string& while pre() returns std::string by
// value, and the argument order looks reversed for the intended use (the rule
// exposes std::string, the parser yields uint32_t) -- confirm against the
// Boost.Spirit attribute-transformation documentation.
namespace boost { namespace spirit { namespace traits
{
    template <>
    struct transform_attribute<uint32_t, std::string, qi::domain>
    {
        typedef std::string& type;
        static std::string pre(uint32_t& d) { return "pre32"; }
        static void post(uint32_t& val, std::string& attr) { attr = "unicode32"; }
        static void fail(uint32_t&) {}
    };
}}}

Upvotes: 2

Views: 314

Answers (1)

sehe
sehe

Reputation: 392999

Making built-in primitive types behave "strangely" seems like a VeryBadIdea™.

Assuming you just wish to decode, I suggest a simpler approach using semantic actions, e.g.

  • https://github.com/sehe/spirit-v2-json/blob/master/JSON.cpp#L102

    // Excerpt: builds the rule's value with semantic actions that append to
    // qi::_val -- runs of characters other than '"' and '\' are appended
    // as-is, and each backslash escape appends the character it denotes.
    // The excerpt is cut off: the group opened after the reverse solidus is
    // not closed in the lines shown here.
    char_ = +(
            ~encoding::char_(L"\"\\")) [ qi::_val += qi::_1 ] |
               qi::lit(L"\x5C") >> (                    // \ (reverse solidus)
               qi::lit(L"\x22") [ qi::_val += L'"'  ] | // "    quotation mark  U+0022
               qi::lit(L"\x5C") [ qi::_val += L'\\' ] | // \    reverse solidus U+005C
               qi::lit(L"\x2F") [ qi::_val += L'/'  ] | // /    solidus         U+002F
               qi::lit(L"\x62") [ qi::_val += L'\b' ] | // b    backspace       U+0008
               qi::lit(L"\x66") [ qi::_val += L'\f' ] | // f    form feed       U+000C
               qi::lit(L"\x6E") [ qi::_val += L'\n' ] | // n    line feed       U+000A
               qi::lit(L"\x72") [ qi::_val += L'\r' ] | // r    carriage return U+000D
               qi::lit(L"\x74") [ qi::_val += L'\t' ] | // t    tab             U+0009
               qi::lit(L"\x75")                         // uXXXX                U+XXXX
                    >> _4HEXDIG [ qi::_val += qi::_1 ]
    

    This appears easily adapted to your use case.

Now if you insist, firstly wrap the types (so you don't "redefine" essential types for Spirit) and secondly, customize the container insertion traits, since std::string (or rather std::vector<char>?) is a container type.

I wouldn't recommend this, though. I like to keep things "simple" and the logic in one place. Obviously this is a "funny" thing to say when using a parser generator like Spirit, because so much appears to go on "magically" behind the scenes. However, that is the nature of abstraction. I don't think I'd want to "abstract" decoding unicode escapes here: they feel as if they belong in the problem domain, not the tooling.

Upvotes: 1

Related Questions