Reputation: 51
I have a string like this:
"CA: ABCD\nCB: ABFG\nCC: AFBV\nCD: 4567"
Now ": "
splits key from value while \n
separates the pairs. I want to add the key-value pairs to a map in C++.
Is there any efficient way of doing this considering optimization in mind?
Upvotes: 4
Views: 25060
Reputation: 48625
Well I have two methods here. The first one is the easy, obvious method that I use all the time (performance is rarely an issue). The second method is likely more efficient but I have not done any formal timings.
In my tests the second method is about 3 times faster.
#include <map>
#include <string>
#include <sstream>
#include <iostream>
std::map<std::string, std::string> mappify1(std::string const& s)
{
std::map<std::string, std::string> m;
std::string key, val;
std::istringstream iss(s);
while(std::getline(std::getline(iss, key, ':') >> std::ws, val))
m[key] = val;
return m;
}
std::map<std::string, std::string> mappify2(std::string const& s)
{
std::map<std::string, std::string> m;
std::string::size_type key_pos = 0;
std::string::size_type key_end;
std::string::size_type val_pos;
std::string::size_type val_end;
while((key_end = s.find(':', key_pos)) != std::string::npos)
{
if((val_pos = s.find_first_not_of(": ", key_end)) == std::string::npos)
break;
if((val_end = s.find('\n', val_pos)) == std::string::npos)
val_end = s.size();
m.emplace(s.substr(key_pos, key_end - key_pos), s.substr(val_pos, val_end - val_pos));
key_pos = val_end + 1;
if(val_end == s.size())
break;
}
return m;
}
int main()
{
std::string s = "CA: ABCD\nCB: ABFG\nCC: AFBV\nCD: 4567";
std::cout << "mappify1: " << '\n';
auto m = mappify1(s);
for(auto const& p: m)
std::cout << '{' << p.first << " => " << p.second << '}' << '\n';
std::cout << "mappify2: " << '\n';
m = mappify2(s);
for(auto const& p: m)
std::cout << '{' << p.first << " => " << p.second << '}' << '\n';
}
Output:
mappify1:
{CA => ABCD}
{CB => ABFG}
{CC => AFBV}
{CD => 4567}
mappify2:
{CA => ABCD}
{CB => ABFG}
{CC => AFBV}
{CD => 4567}
Upvotes: 9
Reputation: 253
Have looked through the accepted answer and tried to extend a bit which seems to work in more general cases. The test run can be found here. All kind of comments or modification are welcome.
#include <iostream>
#include <string>
#include <sstream>
#include <map>
#include <algorithm>
#include <vector>
size_t find(const std::string& line, std::vector<std::string> vect, int pos=0) {
int eol1;
eol1 = 0;
for (std::vector<std::string>::iterator iter = vect.begin(); iter != vect.end(); ++iter) {
//std::cout << *iter << std::endl;
int eol2 = line.find(*iter, pos);
if (eol1 == 0 && eol2 > 0)
eol1 = eol2;
else if (eol2 > 0 && eol2 < eol1)
eol1 = eol2;
}
return eol1;
}
std::map<std::string, std::string> mappify(std::string const& s, char delim='=') {
std::map<std::string, std::string> m;
std::string::size_type key_pos = 0, i, j;
std::string::size_type key_end;
std::string::size_type val_pos;
std::string::size_type lim_pos;
std::string::size_type val_end;
while ((key_end = s.find(delim, key_pos)) != std::string::npos) {
if ((val_pos = s.find_first_not_of(delim, key_end + 1)) == std::string::npos)break;
while (key_end - 1 > 0 && (s[key_end - 1] <= 32 || s[key_end - 1] == ';'))
key_end--;
while (val_pos < s.size() && (s[val_pos] <= 32 || s[val_pos] == ';'))
val_pos++;
val_end = s.find('\n', val_pos);
i = s.find('\"', val_pos);
if (i != std::string::npos)
j = s.find('\"', i + 1);
else
j = 0;
lim_pos = find(s.substr(0, i), { " ",";","\t" }, val_pos + 1);
//std::cout << "s.substr(j):" << s.substr(j)<<std::endl;
if (lim_pos == 0 && j != std::string::npos)lim_pos = find(s.substr(j), { " ",";","\t" }) + j;
if (lim_pos < val_pos)lim_pos = val_pos + 1;
if (j > 0)val_end = j + 1;
if (val_end > lim_pos)val_end = lim_pos;
m.emplace(s.substr(key_pos, key_end - key_pos), s.substr(val_pos, val_end - val_pos));
key_pos = val_end;
while ((key_pos < s.size() && s[key_pos] <= 32 || s[key_pos] == ';'))
++key_pos;
if (val_end == 0)break;
}
return m;
}
int main() {
std::string s ="\
File=\"c:\\dir\\ocean\\\nCCS_test.txt\"\n\
iEcho=10000; iHrShift=0 rho_Co2 = 1.15d0;\n\
Liner=01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890";
auto m = mappify(s);
for (auto const& p : m)
std::cout << '{' << p.first << " :=> " << p.second << '}' << '\n';
return 0;
}
Upvotes: 0
Reputation: 1385
void splitString(std::map<std::string, std::string> &mymap, const std::string &text, char sep)
{
int start = 0, end1 = 0, end2 = 0;
while ((end1 = text.find(sep, start)) != std::string::npos && (end2 = text.find(sep, end1+1)) != std::string::npos) {
std::string key = text.substr(start, end1 - start);
std::string val = text.substr(end1 + 1, end2 - end1 - 1);
mymap.insert(std::pair<std::string,std::string>(key, val));
start = end2 + 1;
}
}
For example:
std::string text = "key1;val1;key2;val2;key3;val3;";
std::map<std::string, std::string> mymap;
splitString(mymap, text, ';');
Will result in a map of size 3: { key1="val1", key2="val2", key3="val3" }
More examples:
"key1;val1;key2;" => {key1="val1"} (no 2nd val, so 2nd key doesn't count)
"key1;val1;key2;val2" => {key1="val1"} (no delim at end of the 2nd val, so it doesn't count)
"key1;val1;key2;;" => {key1="val1",key2=""} (key2 holds empty string)
Upvotes: 0
Reputation: 131
A very simple solution using boost is the following, it works also with partial tokens (e.g. key without values or empty pairs).
#include <string>
#include <list>
#include <map>
#include <iostream>
#include <boost/foreach.hpp>
#include <boost/algorithm/string.hpp>
using namespace std;
using namespace boost;
int main() {
string s = "CA: ABCD\nCB: ABFG\nCC: AFBV\nCD: 4567";
list<string> tokenList;
split(tokenList,s,is_any_of("\n"),token_compress_on);
map<string, string> kvMap;
BOOST_FOREACH(string token, tokenList) {
size_t sep_pos = token.find_first_of(": ");
string key = token.substr(0,sep_pos);
string value = (sep_pos == string::npos ? "" : token.substr(sep_pos+2,string::npos));
kvMap[key] = value;
cout << "[" << key << "] => [" << kvMap[key] << "]" << endl;
}
return 0;
}
Upvotes: 0
Reputation: 11
If worried about performance, you should probably rethink the need for the end result to be a map. That could end up being a lot of char buffers in memory. Ideally keeping track of just the char* and length of each sub string will be faster/smaller.
Upvotes: 1
Reputation: 11317
I doubt you should worry about optimization for reading this string and converting it in a std::map
. If you really want to optimize this fixed-content map, change it to a std::vector<std::pair<>>
and sort it once.
That said, the most elegant way of creating the std::map
with standard C++ features is the following:
std::map<std::string, std::string> deserializeKeyValue(const std::string &sz) {
constexpr auto ELEMENT_SEPARATOR = ": "s;
constexpr auto LINE_SEPARATOR = "\n"s;
std::map<std::string, std::string> result;
std::size_t begin{0};
std::size_t end{0};
while (begin < sz.size()) {
// Search key
end = sz.find(ELEMENT_SEPARATOR, begin);
assert(end != std::string::npos); // Replace by error handling
auto key = sz.substr(begin, /*size=*/ end - begin);
begin = end + ELEMENT_SEPARATOR.size();
// Seach value
end = sz.find(LINE_SEPARATOR, begin);
auto value = sz.substr(begin, end == std::string::npos ? std::string::npos : /*size=*/ end - begin);
begin = (end == std::string::npos) ? sz.size() : end + LINE_SEPARATOR.size();
// Store key-value
[[maybe_unused]] auto emplaceResult = result.emplace(std::move(key), std::move(value));
assert(emplaceResult.second); // Replace by error handling
}
return result;
}
The performance of this might not be ideal, though every c++ programmer understands this code.
Upvotes: 0
Reputation: 13510
Here is a solution, using strtok
as a splitting means. Please note that strtok
changes your string, it puts '\0' at the split char.
#include <iostream>
#include <string>
#include <map>
#include <string.h>
using namespace std;
int main (int argc, char *argv[])
{
char s1[] = "CA: ABCD\nCB: ABFG\nCC: AFBV\nCD: 4567";
map<string, string> mymap;
char *token;
token = strtok(s1, "\n");
while (token != NULL) {
string s(token);
size_t pos = s.find(":");
mymap[s.substr(0, pos)] = s.substr(pos + 1, string::npos);
token = strtok(NULL, "\n");
}
for (auto keyval : mymap)
cout << keyval.first << "/" << keyval.second << endl;
return 0;
}
Upvotes: 0
Reputation: 18864
This format is called "Tag-Value".
The most performance critical place where such encoding is used in the industry is probably financial FIX Protocol (=
for key-value separator, and '\001'
as entries delimiter). So if you are on x86 hardware then your best bet would be to google 'SSE4 FIX protocol parser github' and reuse the open sourced findings of HFT shops.
If you still want to delegate the vectorization part to the compiler and can spare few nanoseconds for readability then the most elegant solution is to store the result in a std::string
(data) + boost::flat_map<boost::string_ref, boost::string_ref>
(view). Parsing is a matter of taste, while-loop or strtok would be easiest for the compiler to parse. Boost-spirit based parser would be easiest for a human (familiar with boost-spirit) to read.
C++ for-loop based solution
#include <boost/container/flat_map.hpp>
#include <boost/range/iterator_range.hpp>
#include <boost/range/iterator_range_io.hpp>
#include <iostream>
// g++ -std=c++1z ~/aaa.cc
int main()
{
using range_t = boost::iterator_range<std::string::const_iterator>;
using map_t = boost::container::flat_map<range_t, range_t>;
char const sep = ':';
char const dlm = '\n';
// this part can be reused for parsing multiple records
map_t result;
result.reserve(1024);
std::string const input {"hello:world\n bye: world"};
// this part is per-line/per-record
result.clear();
for (auto _beg = begin(input), _end = end(input), it = _beg; it != _end;)
{
auto sep_it = std::find(it, _end, sep);
if (sep_it != _end)
{
auto dlm_it = std::find(sep_it + 1, _end, dlm);
result.emplace(range_t {it, sep_it}, range_t {sep_it + 1, dlm_it});
it = dlm_it + (dlm_it != _end);
}
else throw std::runtime_error("cannot parse");
}
for (auto& x: result)
std::cout << x.first << " => " << x.second << '\n';
return 0;
}
Upvotes: 2
Reputation: 126817
The format is simple enough that doing the parsing "by hand" IMO is the best option, overall remains quite readable.
This should also be reasonably efficient (the key
and value
strings are always the same - albeit clear
ed, so the reallocations inside the main loop should just stop after a few iterations); ret
also should qualify for NRVO, OTOH in case of problems with that you can always change to an output parameter.
Of course std::map
may not be the fastest gun in the west, but it's a request in the problem text.
std::map<std::string, std::string> parseKV(const std::string &sz) {
std::map<std::string, std::string> ret;
std::string key;
std::string value;
const char *s=sz.c_str();
while(*s) {
// parse the key
while(*s && *s!=':' && s[1]!=' ') {
key.push_back(*s);
++s;
}
// if we quit due to the end of the string exit now
if(!*s) break;
// skip the ": "
s+=2;
// parse the value
while(*s && *s!='\n') {
value.push_back(*s);
++s;
}
ret[key]=value;
key.clear(); value.clear();
// skip the newline
++s;
}
return ret;
}
Upvotes: 1