Nick
Nick

Reputation: 10499

Parse string of numbers

I need to parse several C-style strings (around 500k) containing 4 floating point numbers separated by a single space character. Following is an example of a single string:

"90292 5879 89042.2576 5879"

I need to store these numbers in two structures representing two points. Considering that the string can be modified while parsed, and that 99.99% of the times the numbers are just unsigned integers, what's the fastest way to do it?

Following is my current implementation:

#include <iostream>
#include <cassert>
#include <chrono>
#include <algorithm>
#include <vector>
#include <string>
using namespace std;
using namespace chrono;



struct PointF
{
    float x;
    float y;
};


void parse_points(char* points, PointF& p1, PointF& p2)
{
    auto start = points;
    const auto end = start + strlen(points);

    // p1.x
    start = std::find(start, end, ' ');
    assert(start < end);
    *start = '\0';
    p1.x = static_cast<float>(atof(points));
    points = start + 1;

    // p1.y
    start = std::find(start, end, ' ');
    assert(start < end);
    *start = '\0';
    p1.y = static_cast<float>(atof(points));
    points = start + 1;

    // p2.x
    start = std::find(start, end, ' ');
    assert(start < end);
    *start = '\0';
    p2.x = static_cast<float>(atof(points));
    points = start + 1;

    // p2.y
    start = std::find(start, end, ' ');
    assert(start == end);
    p2.y = static_cast<float>(atof(points));
}



int main()
{
    const auto n = 500000;
    char points_str[] = "90292 5879 89042.2576 5879";
    PointF p1, p2;

    vector<string> data(n);

    for (auto& s : data)
        s.assign(points_str);

    const auto t0 = system_clock::now();

    for (auto i = 0; i < n; i++)
        parse_points(const_cast<char*>(data[i].c_str()), p1, p2);

    const auto t1 = system_clock::now();
    const auto elapsed = duration_cast<milliseconds>(t1 - t0).count();

    cout << "Elapsed: " << elapsed << " ms" << endl;

    cin.get();
    return 0;
}

Upvotes: 1

Views: 367

Answers (4)

Dmitriy Zapevalov
Dmitriy Zapevalov

Reputation: 1357

Here is my version without strlen but with use of strtok_s. On my machine it takes 1.1sec instead of 1.5sec.

void parse_points(char* points, PointF& p1, PointF& p2)
{
    char *next_token1 = nullptr;

    // p1.x
    points = strtok_s(points, " ", &next_token1);
    p1.x = points ? static_cast<float>(atof(points)) : 0.0f;

    // p1.y
    points = strtok_s(nullptr, " ", &next_token1);
    p1.y = points ? static_cast<float>(atof(points)) : 0.0f;

    // p2.x
    points = strtok_s(nullptr, " ", &next_token1);
    p2.x = points ? static_cast<float>(atof(points)) : 0.0f;

    // p2.y
    points = strtok_s(nullptr, " ", &next_token1);
    p2.y = points ? static_cast<float>(atof(points)) : 0.0f;
}



int main()
{
    const auto n = 500000;
    char points_str[] = "90292 5879 89042.2576 5879";
    PointF p1, p2;

    vector<string> data(n);

    for (auto& s : data)
        s.assign(points_str);

    const auto t0 = system_clock::now();

    for (auto i = 0; i < n; i++)
        parse_points(const_cast<char*>(data[i].c_str()), p1, p2);

    const auto t1 = system_clock::now();
    const auto elapsed = duration_cast<milliseconds>(t1 - t0).count();

    cout << "Elapsed: " << elapsed << " ms" << endl;

    //cin.get();
    return 0;
}

Upvotes: 0

lorro
lorro

Reputation: 10880

I see multiple issues with the code (and it's actually good that you've asked):

  • There's no error handling for the case when there are numbers (NB: as per discussion, you expect 0 in this case)
  • You create the PointF objects twice to be able to pass them
    • You pass them as reference, thus it's not trivial for a human reading the calling code that these are out params.
  • The parser you've created is available in C (albeit you might measure if it's faster or slower)

I'd suggest this: (note that std::experimental::optional<> is equivalent here to boost::optional<>)

#include <iostream>
#include <cstring>
#include <utility>
#include <experimental/optional>

struct PointF
{
    float x;
    float y;
};

std::experimental::optional<std::pair<PointF, PointF>> parse_points(char* pch)
{
    pch = strtok (pch, " ");
    if (pch != NULL)
    {
        float x0 = atof(pch);
        pch = strtok (NULL, " ");
        if (pch != NULL)
        {
            float y0 = atof(pch);
            pch = strtok (NULL, " ");
            if (pch != NULL)
            {
                float x1 = atof(pch);
                pch = strtok (NULL, " ");
                if (pch != NULL)
                {
                    float y1 = atof(pch);
                    PointF p0{x0, y0}, p1{x1, y1};
                    return std::make_pair(p0, p1);
                }
            }
        }
    }
    return std::experimental::nullopt;
}

int main() {
    const char str[] ="90292 5879 89042.2576 5879";
    char* pch0 = new char[sizeof(str)], *pch = pch0;
    memcpy(pch0, str, sizeof(str));

    std::experimental::optional<std::pair<PointF, PointF>> pOpt( parse_points(pch0) );
    if(pOpt)
        std::cout << pOpt->first.x  << " " << pOpt->first.y  << " "
                  << pOpt->second.x << " " << pOpt->second.y << " " << std::endl;
    delete pch;
}

Upvotes: 0

phyxnj
phyxnj

Reputation: 647

You can implement atof that return the pos of space. In this way, you need traverse each string only once.

eg.

char *atof(char *point, float &num) {
  num = 0;
  bool neg = false, dot = false;
  float decimal = 0, mul = 0.1;
  if (*point == '-') {
    neg = true;
    point++;
  } else if (*point == '+') {
    point++;
  }
  while (*point != ' ' && *point) {
    if (*point == '.') {
      dot = true;
    } else {
      if (dot) {
        decimal += (*point - '0') * mul;
        mul *= 0.1;
      } else {
        num = num * 10 + *point - '0';
      }
    }
    point++;
  }
  if (dot) {
    num += decimal;
  }
  if (neg) {
    num = -num;
  }
  return point;
}

Upvotes: -1

Thomas Matthews
Thomas Matthews

Reputation: 57678

Given a string with floating point values, space separated:

const std::string example_input = "90292 5879 89042.2576 5879";

You should profile to see what is faster, reading as floating point:

std::istringstream text_stream(example_input);
std::vector<double> container;
double value;
while (text_stream >> value)
{
  container.push_back(value);
}

Or reading as integers, taking a performance hit if there is an indication of floating point:

std::istringstream text_stream(example_input);
std::vector<double> container;
double value;
signed int int_value;
std::streampos position_before_read = text_stream.tellg();
while (text_stream >> int_value)
{
  // check the next character for possible floating point differences.
  char c;
  text_stream >> c;
  switch (c)
  {
    case '.':
    case 'E': case 'e':
      // Rewind to before the number and read as floating point
      text_stream.seekg(position_before_read);
      text_stream >> value;
      break;
    default:
      value = 1.0 * int_value;
      break;
    }
  container.push_back(value);
  position_before_read = text_stream.tellg();
}

My guess is that the standard libraries are optimized to read floating point, a lot better than the above example and account for all variances of floating point format.

Note: alternatively, you could read the decimals and exponents as integers, if present, then build a floating point value with all three pieces.

Upvotes: 0

Related Questions