Viking
Viking

Reputation: 51

Split string into key-value pairs using C++

I have a string like this:

"CA: ABCD\nCB: ABFG\nCC: AFBV\nCD: 4567"

Now ": " splits key from value while \n separates the pairs. I want to add the key-value pairs to a map in C++.

Is there any efficient way of doing this considering optimization in mind?

Upvotes: 4

Views: 25060

Answers (9)

Galik
Galik

Reputation: 48625

Well I have two methods here. The first one is the easy, obvious method that I use all the time (performance is rarely an issue). The second method is likely more efficient but I have not done any formal timings.

In my tests the second method is about 3 times faster.

#include <map>
#include <string>
#include <sstream>
#include <iostream>

std::map<std::string, std::string> mappify1(std::string const& s)
{
    std::map<std::string, std::string> m;

    std::string key, val;
    std::istringstream iss(s);

    while(std::getline(std::getline(iss, key, ':') >> std::ws, val))
        m[key] = val;

    return m;
}

std::map<std::string, std::string> mappify2(std::string const& s)
{
    std::map<std::string, std::string> m;

    std::string::size_type key_pos = 0;
    std::string::size_type key_end;
    std::string::size_type val_pos;
    std::string::size_type val_end;

    while((key_end = s.find(':', key_pos)) != std::string::npos)
    {
        if((val_pos = s.find_first_not_of(": ", key_end)) == std::string::npos)
            break;

        if((val_end = s.find('\n', val_pos)) == std::string::npos)
            val_end = s.size();

        m.emplace(s.substr(key_pos, key_end - key_pos), s.substr(val_pos, val_end - val_pos));

        key_pos = val_end + 1;

        if(val_end == s.size())
            break;
    }

    return m;
}
 
int main()
{
    std::string s = "CA: ABCD\nCB: ABFG\nCC: AFBV\nCD: 4567";

    std::cout << "mappify1: " << '\n';

    auto m = mappify1(s);
    for(auto const& p: m)
        std::cout << '{' << p.first << " => " << p.second << '}' << '\n';

    std::cout << "mappify2: " << '\n';

    m = mappify2(s);
    for(auto const& p: m)
        std::cout << '{' << p.first << " => " << p.second << '}' << '\n';
}

Output:

mappify1: 
{CA => ABCD}
{CB => ABFG}
{CC => AFBV}
{CD => 4567}
mappify2: 
{CA => ABCD}
{CB => ABFG}
{CC => AFBV}
{CD => 4567}

Upvotes: 9

MathArt
MathArt

Reputation: 253

Have looked through the accepted answer and tried to extend a bit which seems to work in more general cases. The test run can be found here. All kind of comments or modification are welcome.

#include <iostream>
#include <string>
#include <sstream>
#include <map>
#include <algorithm>
#include <vector>

size_t find(const std::string& line, std::vector<std::string> vect, int pos=0) {
    int eol1;
    eol1 = 0;
    for (std::vector<std::string>::iterator iter = vect.begin(); iter != vect.end(); ++iter) {
        //std::cout << *iter << std::endl;
        int eol2 = line.find(*iter, pos);
        if (eol1 == 0 && eol2 > 0)
            eol1 = eol2;
        else if (eol2 > 0 && eol2 < eol1)
            eol1 = eol2;
    }
    return eol1;
}

std::map<std::string, std::string> mappify(std::string const& s, char delim='=') {
    std::map<std::string, std::string> m;

    std::string::size_type key_pos = 0, i, j;
    std::string::size_type key_end;
    std::string::size_type val_pos;
    std::string::size_type lim_pos;
    std::string::size_type val_end;

    while ((key_end = s.find(delim, key_pos)) != std::string::npos) {
        if ((val_pos = s.find_first_not_of(delim, key_end + 1)) == std::string::npos)break;
        while (key_end - 1 > 0 && (s[key_end - 1] <= 32 || s[key_end - 1] == ';'))
            key_end--;
        while (val_pos < s.size() && (s[val_pos] <= 32 || s[val_pos] == ';'))
            val_pos++;
        val_end = s.find('\n', val_pos);
        i = s.find('\"', val_pos);
        if (i != std::string::npos)
            j = s.find('\"', i + 1);
        else
            j = 0;
        lim_pos = find(s.substr(0, i), { " ",";","\t" }, val_pos + 1);
        //std::cout << "s.substr(j):" << s.substr(j)<<std::endl;
        if (lim_pos == 0 && j != std::string::npos)lim_pos = find(s.substr(j), { " ",";","\t" }) + j;
        if (lim_pos < val_pos)lim_pos = val_pos + 1;
        if (j > 0)val_end = j + 1;
        if (val_end > lim_pos)val_end = lim_pos;
        m.emplace(s.substr(key_pos, key_end - key_pos), s.substr(val_pos, val_end - val_pos));
        key_pos = val_end;
        while ((key_pos < s.size() && s[key_pos] <= 32 || s[key_pos] == ';'))
            ++key_pos;
        if (val_end == 0)break;
    }
    return m;
}

int main() {
    std::string s ="\
File=\"c:\\dir\\ocean\\\nCCS_test.txt\"\n\
iEcho=10000; iHrShift=0 rho_Co2 = 1.15d0;\n\
Liner=01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890";
  auto m = mappify(s);
    for (auto const& p : m)
      std::cout << '{' << p.first << " :=> " << p.second << '}' << '\n';

    return 0;
}

Upvotes: 0

brkeyal
brkeyal

Reputation: 1385

void splitString(std::map<std::string, std::string> &mymap, const std::string &text, char sep)
{
    int start = 0, end1 = 0, end2 = 0;
    while ((end1 = text.find(sep, start)) != std::string::npos && (end2 = text.find(sep, end1+1)) != std::string::npos) {
        std::string key = text.substr(start, end1 - start);
        std::string val = text.substr(end1 + 1, end2 - end1 - 1);
        mymap.insert(std::pair<std::string,std::string>(key, val));
        start = end2 + 1;
    }
}

For example:

std::string text = "key1;val1;key2;val2;key3;val3;";
std::map<std::string, std::string> mymap;
splitString(mymap, text, ';');

Will result in a map of size 3: { key1="val1", key2="val2", key3="val3" }

More examples:

"key1;val1;key2;" => {key1="val1"} (no 2nd val, so 2nd key doesn't count)

"key1;val1;key2;val2" => {key1="val1"} (no delim at end of the 2nd val, so it doesn't count)

"key1;val1;key2;;" => {key1="val1",key2=""} (key2 holds empty string)

Upvotes: 0

Alberto Navatta
Alberto Navatta

Reputation: 131

A very simple solution using boost is the following, it works also with partial tokens (e.g. key without values or empty pairs).

#include <string>
#include <list>
#include <map>
#include <iostream>

#include <boost/foreach.hpp>
#include <boost/algorithm/string.hpp>

using namespace std;
using namespace boost;

int main() {

    string s = "CA: ABCD\nCB: ABFG\nCC: AFBV\nCD: 4567";

    list<string> tokenList;
    split(tokenList,s,is_any_of("\n"),token_compress_on);
    map<string, string> kvMap;

    BOOST_FOREACH(string token, tokenList) {
        size_t sep_pos = token.find_first_of(": ");
        string key = token.substr(0,sep_pos);
        string value = (sep_pos == string::npos ? "" : token.substr(sep_pos+2,string::npos));
        kvMap[key] = value;

        cout << "[" << key << "] => [" << kvMap[key] << "]" << endl;
    }

    return 0;
}

Upvotes: 0

Scott Yeager
Scott Yeager

Reputation: 11

If worried about performance, you should probably rethink the need for the end result to be a map. That could end up being a lot of char buffers in memory. Ideally keeping track of just the char* and length of each sub string will be faster/smaller.

Upvotes: 1

JVApen
JVApen

Reputation: 11317

I doubt you should worry about optimization for reading this string and converting it in a std::map. If you really want to optimize this fixed-content map, change it to a std::vector<std::pair<>> and sort it once.

That said, the most elegant way of creating the std::map with standard C++ features is the following:

std::map<std::string, std::string> deserializeKeyValue(const std::string &sz) {
    constexpr auto ELEMENT_SEPARATOR = ": "s;
    constexpr auto LINE_SEPARATOR = "\n"s;

    std::map<std::string, std::string> result;
    std::size_t begin{0};
    std::size_t end{0};
    while (begin < sz.size()) {
        // Search key
        end = sz.find(ELEMENT_SEPARATOR, begin);
        assert(end != std::string::npos); // Replace by error handling
        auto key = sz.substr(begin, /*size=*/ end - begin);
        begin = end + ELEMENT_SEPARATOR.size();

        // Seach value
        end = sz.find(LINE_SEPARATOR, begin);
        auto value = sz.substr(begin, end == std::string::npos ? std::string::npos : /*size=*/ end - begin);
        begin = (end == std::string::npos) ? sz.size() : end + LINE_SEPARATOR.size();

        // Store key-value
        [[maybe_unused]] auto emplaceResult = result.emplace(std::move(key), std::move(value));
        assert(emplaceResult.second); // Replace by error handling
    }
    return result;
}

The performance of this might not be ideal, though every c++ programmer understands this code.

Upvotes: 0

Israel Unterman
Israel Unterman

Reputation: 13510

Here is a solution, using strtok as a splitting means. Please note that strtok changes your string, it puts '\0' at the split char.

#include <iostream>
#include <string>
#include <map>
#include <string.h>

using namespace std;



int main (int argc, char *argv[])
{
    char s1[] = "CA: ABCD\nCB: ABFG\nCC: AFBV\nCD: 4567";
    map<string, string> mymap;
    char *token;

    token = strtok(s1, "\n");
    while (token != NULL) {
        string s(token);
        size_t pos = s.find(":");
        mymap[s.substr(0, pos)] = s.substr(pos + 1, string::npos);
        token = strtok(NULL, "\n");
    }

    for (auto keyval : mymap) 
        cout << keyval.first << "/" << keyval.second << endl;

    return 0;
}

Upvotes: 0

bobah
bobah

Reputation: 18864

This format is called "Tag-Value".

The most performance critical place where such encoding is used in the industry is probably financial FIX Protocol (= for key-value separator, and '\001' as entries delimiter). So if you are on x86 hardware then your best bet would be to google 'SSE4 FIX protocol parser github' and reuse the open sourced findings of HFT shops.

If you still want to delegate the vectorization part to the compiler and can spare few nanoseconds for readability then the most elegant solution is to store the result in a std::string (data) + boost::flat_map<boost::string_ref, boost::string_ref> (view). Parsing is a matter of taste, while-loop or strtok would be easiest for the compiler to parse. Boost-spirit based parser would be easiest for a human (familiar with boost-spirit) to read.

C++ for-loop based solution

#include <boost/container/flat_map.hpp> 
#include <boost/range/iterator_range.hpp>

#include <boost/range/iterator_range_io.hpp> 
#include <iostream>

// g++ -std=c++1z ~/aaa.cc
int main()
{
    using range_t = boost::iterator_range<std::string::const_iterator>;
    using map_t = boost::container::flat_map<range_t, range_t>;

    char const sep = ':';
    char const dlm = '\n';

    // this part can be reused for parsing multiple records
    map_t result;
    result.reserve(1024);

    std::string const input {"hello:world\n bye: world"};

    // this part is per-line/per-record
    result.clear();
    for (auto _beg = begin(input), _end = end(input), it = _beg; it != _end;)
    {
        auto sep_it = std::find(it, _end, sep);
        if (sep_it != _end)
        {
            auto dlm_it = std::find(sep_it + 1, _end, dlm);
            result.emplace(range_t {it, sep_it}, range_t {sep_it + 1, dlm_it});
            it = dlm_it + (dlm_it != _end);
        }
        else throw std::runtime_error("cannot parse");
    }

    for (auto& x: result)
        std::cout << x.first << " => " << x.second << '\n';

    return 0;
}

Upvotes: 2

Matteo Italia
Matteo Italia

Reputation: 126817

The format is simple enough that doing the parsing "by hand" IMO is the best option, overall remains quite readable.

This should also be reasonably efficient (the key and value strings are always the same - albeit cleared, so the reallocations inside the main loop should just stop after a few iterations); ret also should qualify for NRVO, OTOH in case of problems with that you can always change to an output parameter.

Of course std::map may not be the fastest gun in the west, but it's a request in the problem text.

std::map<std::string, std::string> parseKV(const std::string &sz) {
    std::map<std::string, std::string> ret;
    std::string key;
    std::string value;
    const char *s=sz.c_str();
    while(*s) {
        // parse the key
        while(*s && *s!=':' && s[1]!=' ') {
            key.push_back(*s);
            ++s;
        }
        // if we quit due to the end of the string exit now
        if(!*s) break;
        // skip the ": "
        s+=2;
        // parse the value
        while(*s && *s!='\n') {
            value.push_back(*s);
            ++s;
        }
        ret[key]=value;
        key.clear(); value.clear();
        // skip the newline
        ++s;
    }
    return ret;
}

Upvotes: 1

Related Questions