Reputation: 63
I created this code to read and filter my csv files. It works like I want it to work for small files. But I just tried out a file of size 200k lines and it takes around 4 minutes, which is too long for my use case.
After testing a bit and fixing some quite stupid things, I got the time down a little, to 3 minutes. I found out that about half of the time is spent reading in the file and half of the time is spent generating the result vector.
Is there any way to improve the speed of my program, especially the part that reads from the CSV file? I do not really have an idea at the moment. I'd appreciate any help.
EDIT:The filter is filtering the data by either a timeframe or timeframe and filterword in specific columns and outputting the data into a resulting vector of strings.
My CSV files look like this->
Headers are:
ID;Timestamp;ObjectID;UserID;Area;Description;Comment;Checksum
Data is:
523;19.05.2021 12:15;####;admin;global;Parameter changed to xxx; Comment;x3J2j4
// Excerpt: reads a delimited text file into csv_contents and then starts a
// per-row pass over it. strComplPath, record, delimiter, items, csv_contents,
// time_lower_bound, time_upper_bound, m, timestampformat and the filter* flags
// are declared outside this excerpt.
std::ifstream input_file(strComplPath, std::ios::in);
int counter = 0;
// Read the file one line at a time; `record` first receives the whole line.
while (std::getline(input_file, record))
{
istringstream line(record);
// Split the line at `delimiter`; `record` is reused to hold each field.
while (std::getline(line, record, delimiter))
{
// Remove every double-quote character from the field.
record.erase(remove(record.begin(), record.end(), '\"'), record.end());
items.push_back(record);
//cout << record;
}
// Store a copy of this line's fields under the zero-based line index.
// NOTE(review): the type of csv_contents is not visible here; if it is a
// vector it must already have at least counter + 1 elements - confirm.
csv_contents[counter] = items;
items.clear();
++counter;
}
// Per-row pass over everything that was read.
for (int i = 0; i < csv_contents.size(); i++) {
// Copy of column 1 of the row (the Timestamp column according to the header).
string regexline = csv_contents[i][1];
string endtime = time_upper_bound;
string starttime = time_lower_bound;
bool checkline = false;
bool isInRange = false, isLater = false, isEarlier = false;
// Blank every one of the first 8 columns whose text starts with '#'
// (rfind("#", 0) == 0 is a "starts with" test).
// NOTE(review): assumes every row has at least 8 columns - confirm.
for (int oo = 0; oo < 8; oo++) {
if (csv_contents[i][oo].rfind("#", 0) == 0) {
csv_contents[i][oo] = "";
}
}
// Decide which time filter applies, depending on which of the two bounds
// matches timestampformat.
// NOTE(review): these regex_search calls only look at starttime/endtime, which
// do not depend on i, yet they run once per row - candidate for hoisting out
// of the loop.
if ((regex_search(starttime, m, timestampformat) && regex_search(endtime, m, timestampformat))) {
filtertimeboth = true;
}
else if (regex_search(starttime, m, timestampformat)) {
filterfromstart = true;
}
else if (regex_search(endtime, m, timestampformat)) {
filtertoend = true;
}
}
Upvotes: 1
Views: 1156
Reputation: 117643
I'm not sure exactly what the bottleneck is in your program (I copied your code from an earlier version of the question), but you use a lot of regexes and you mix reading records with post-processing. I suggest that you create a class
to hold one of these records, called record
, overload operator>>
for record
and then use std::copy_if
from the file with a filter that you can design separately from the reading. Do post processing after you've read the records that passes the filter.
I made a small test and it takes 2 seconds to read 200k records on my old spinning disk while doing filtering. I only used time_lower_bound
and time_upper_bound
to filter and additional checks will of course make it a little slower, but it should not take minutes.
Example:
#include <algorithm>
#include <chrono>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
// the suggested class to hold a record
// One line of the CSV file, one member per column, in file order:
// ID;Timestamp;ObjectID;UserID;Area;Description;Comment;Checksum
struct record {
    int ID{}; // zero-initialized so a default-constructed record is never read uninitialized
    std::chrono::system_clock::time_point Timestamp; // defaults to the clock's epoch
    std::string ObjectID;
    std::string UserID;
    std::string Area;
    std::string Description;
    std::string Comment;
    std::string Checksum;
};
// A free function to read a time_point from an `istream`:
// Parses a calendar time from `is` according to the std::get_time format
// `fmt` and converts it, interpreted as local time, to a system_clock
// time_point. When extraction fails the stream is left in a failed state and
// a default-constructed time_point is returned.
std::chrono::system_clock::time_point to_tp(std::istream& is, const char* fmt) {
    using clock = std::chrono::system_clock;
    // C++20 alternative:
    // std::chrono::from_stream(is, tp, fmt, nullptr, nullptr);

    // C++11 to C++17 version:
    std::tm parsed{};
    parsed.tm_isdst = -1; // let mktime work out whether DST applies
    is >> std::get_time(&parsed, fmt);
    if(!is) {
        return clock::time_point{};
    }
    return clock::from_time_t(std::mktime(&parsed));
}
// The operator>> overload to read one `record` from an `istream`:
std::istream& operator>>(std::istream& is, record& r) {
is >> r.ID;
r.Timestamp = to_tp(is, ";%d.%m.%Y %H:%M;"); // using the helper function above
std::getline(is, r.ObjectID, ';');
std::getline(is, r.UserID, ';');
std::getline(is, r.Area, ';');
std::getline(is, r.Description, ';');
std::getline(is, r.Comment, ';');
std::getline(is, r.Checksum);
return is;
}
// An operator<< overload to print one `record`:
std::ostream& operator<<(std::ostream& os, const record& r) {
std::ostringstream oss;
oss << r.ID;
{ // I only made a C++11 to C++17 version for this one:
std::time_t time = std::chrono::system_clock::to_time_t(r.Timestamp);
std::tm ts = *std::localtime(&time);
oss << ';' << ts.tm_mday << '.' << ts.tm_mon + 1 << '.'
<< ts.tm_year + 1900 << ' ' << ts.tm_hour << ':' << ts.tm_min << ';';
}
oss << r.ObjectID << ';' << r.UserID << ';' << r.Area << ';'
<< r.Description << ';' << r.Comment << ';' << r.Checksum << '\n';
return os << oss.str();
}
// The reading and filtering part of `main` would then look like this:
// Reads the records from the file "data", keeps those whose timestamp lies
// within [time_lower_bound, time_upper_bound] and prints them.
// Returns 1 when the boundaries cannot be parsed or the file cannot be opened.
int main() { // not "void main()"
    std::istringstream time_lower_bound_s("20.05.2019 16:40:00");
    std::istringstream time_upper_bound_s("20.05.2021 09:40:00");

    // Your time boundaries as `std::chrono::system_clock::time_point`s -
    // again using the `to_tp` helper function:
    const auto time_lower_bound = to_tp(time_lower_bound_s, "%d.%m.%Y %H:%M:%S");
    const auto time_upper_bound = to_tp(time_upper_bound_s, "%d.%m.%Y %H:%M:%S");

    // Verify that the boundaries were parsed ok:
    if(time_lower_bound == std::chrono::system_clock::time_point{} ||
       time_upper_bound == std::chrono::system_clock::time_point{}) {
        std::cerr << "failed to parse boundaries\n";
        return 1;
    }

    std::ifstream is("data"); // whatever your file is called
    if(!is) {
        // Report the problem instead of silently exiting with success
        std::cerr << "failed to open input file\n";
        return 1;
    }

    std::vector<record> recs; // a vector with all the records that pass the filter

    // create your filter
    auto filter = [&time_lower_bound, &time_upper_bound](const record& r) {
        // Only copy those `record`s within the set boundaries.
        // You can add additional conditions here too.
        return r.Timestamp >= time_lower_bound &&
               r.Timestamp <= time_upper_bound;
    };

    // Copy those records that pass the filter:
    std::copy_if(std::istream_iterator<record>(is),
                 std::istream_iterator<record>{}, std::back_inserter(recs),
                 filter);

    // .. post process `recs` here ...

    // print result
    for(const auto& r : recs) std::cout << r;
}
Upvotes: 1
Reputation: 15265
An answer has already been given by Ted. I wrote a solution at the same time, so let me show it in addition.
I created test data with 500k records, and all the parsing and related work was done in under 3 seconds on my machine.
Additionally, I also created classes.
Speed will be gained by using std::move
, increasing the input buffer size and using reserve
for the std::vector
.
Please see yet another solution below. I omitted filtering. Ted showed it already.
#include <iostream>
#include <fstream>
#include <iomanip>
#include <string>
#include <ctime>
#include <vector>
#include <chrono>
#include <sstream>
#include <algorithm>
#include <iterator>
// Number of records for which Data reserves space up front
constexpr size_t MaxLines = 600'000u;
// Number of lines that createFile() writes into the test file
constexpr size_t NumberOfLines = 500'000u;
// Name of the CSV file that createFile() writes and main() reads
const std::string fileName{ "test.csv" };
// Dummy routine for writing a test file
// Writes NumberOfLines synthetic ';'-separated records to fileName.
// Row k carries a timestamp of k * 1000 seconds after time_t zero, formatted
// as local time. Reports on std::cerr when the file cannot be opened.
void createFile() {
    std::ofstream out{ fileName };
    if (!out) {
        std::cerr << "\n*** Error: Could not open '" << fileName << "' for writing\n\n";
        return;
    }

    std::time_t stamp = 0;
    for (size_t row = 0; row < NumberOfLines; ++row, stamp += 1000) {
#pragma warning(suppress : 4996)
        const std::tm* local = std::localtime(&stamp);
        out << row << ';'
            << std::put_time(local, "%d.%m.%Y %H:%M") << ';'
            << row << ';'
            << "UserID" << row << ';'
            << "Area" << row << ';'
            << "Description" << row << ';'
            << "Comment" << row << ';'
            << "Checksum" << row << '\n';
    }
}
// We will create a bigger input buffer for our stream
// Size, in chars, of the storage below
constexpr size_t ifStreamBufferSize = 100'000u;
// Storage that Data's extractor passes to the stream buffer via pubsetbuf
static char buffer[ifStreamBufferSize];
// Object oriented Model. Class for one record
// Object oriented model: one line of the CSV file, with the columns
// ID;Timestamp;ObjectID;UserID;Area;Description;Comment;Checksum
struct Record {
    // Data
    long id{};
    std::tm time{};
    long objectId{};
    std::string userId{};
    std::string area{};
    std::string description{};
    std::string comment{};
    std::string checkSum{};

    // Methods

    // Extractor: reads one complete line from `is` and splits it at ';'.
    //
    // A column whose text starts with '#' is treated as faulty data and is
    // replaced by an empty string; the record itself is kept. (Setting failbit
    // for such a column would end an istream_iterator based read at the first
    // affected line and silently drop every record after it.)
    //
    // failbit is set on `is`, and `r` is left untouched, when the line does
    // not have exactly 8 columns or the timestamp does not match
    // "%d.%m.%Y %H:%M".
    friend std::istream& operator >> (std::istream& is, Record& r) {
        constexpr std::size_t expectedColumns = 8;

        // Read one complete line
        std::string line;
        if (!std::getline(is, line))
            return is;

        // Split the line into its columns
        std::vector<std::string> parts;
        parts.reserve(expectedColumns);
        std::istringstream iss{ line };
        for (std::string part; std::getline(iss, part, ';');) {
            // Faulty data: blank the column instead of failing the stream
            if (!part.empty() && part.front() == '#')
                part.clear();
            parts.push_back(std::move(part));
        }

        if (parts.size() != expectedColumns) {
            is.setstate(std::ios::failbit);
            return is;
        }

        // Parse the timestamp first, so that a bad one leaves `r` unchanged
        std::tm parsedTime{};
        std::istringstream ss{ parts[1] };
        ss >> std::get_time(&parsedTime, "%d.%m.%Y %H:%M");
        if (ss.fail()) {
            is.setstate(std::ios::failbit);
            return is;
        }

        // Convert the columns to the target data in the record.
        // strtol yields 0 for an empty (blanked) column.
        r.id = std::strtol(parts[0].c_str(), nullptr, 10);
        r.time = parsedTime;
        r.objectId = std::strtol(parts[2].c_str(), nullptr, 10);
        r.userId = std::move(parts[3]);
        r.area = std::move(parts[4]);
        r.description = std::move(parts[5]);
        r.comment = std::move(parts[6]);
        r.checkSum = std::move(parts[7]);
        return is;
    }

    // Simple inserter: writes all columns separated by single spaces,
    // without a trailing newline.
    friend std::ostream& operator << (std::ostream& os, const Record& r) {
        return os << r.id << " "
#pragma warning(suppress : 4996)
            << std::put_time(&r.time, "%d.%m.%Y %H:%M") << " "
            << r.objectId << " " << r.userId << " " << r.area << " " << r.description << " " << r.comment << " " << r.checkSum;
    }
};
// Data will hold all records
// Data holds all records read from one input stream
struct Data {
    // Data part
    std::vector<Record> records{};

    // Constructor reserves space for MaxLines records to avoid reallocation
    Data() { records.reserve(MaxLines); }

    // Simple extractor: appends Records read from `is` until Record's
    // extractor leaves the stream in a failed state (normally at end of file).
    friend std::istream& operator >> (std::istream& is, Data& d) {
        // Offer a bigger buffer to the stream.
        // NOTE(review): the effect of pubsetbuf is implementation-defined and
        // it may be ignored when the file is already open - confirm that it
        // helps on the target toolchain.
        is.rdbuf()->pubsetbuf(buffer, ifStreamBufferSize);
        std::copy(std::istream_iterator<Record>(is), {}, std::back_inserter(d.records));
        return is;
    }

    // Simple inserter: writes every record followed by '\n'
    friend std::ostream& operator << (std::ostream& os, const Data& d) {
        std::copy(d.records.begin(), d.records.end(), std::ostream_iterator<Record>(os, "\n"));
        return os;
    }

    // The inserter used to be spelled `operator >>` by mistake. That spelling
    // is kept as a forwarder so existing `os >> data` calls still compile.
    friend std::ostream& operator >> (std::ostream& os, const Data& d) {
        return os << d;
    }
};
// Reads and parses the whole test file, reports how long that took and
// prints up to the first 10 records as debug output.
int main() {
    // createFile();

    if (std::ifstream ifs{ fileName }; ifs) {
        Data data;

        // Start time measurement. steady_clock is used because it is
        // monotonic and therefore suited to measuring intervals.
        const auto start = std::chrono::steady_clock::now();

        // Read and parse complete data
        ifs >> data;

        // End of time measurement
        const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start);
        std::cout << "\nReading and splitting. Duration: " << elapsed.count() << " ms\n";

        // Some debug output
        std::cout << "\n\nNumber of read records: " << data.records.size() << "\n\n";

        // Never index past the end when fewer than 10 records were read
        const size_t shown = std::min<size_t>(10, data.records.size());
        for (size_t k{}; k < shown; ++k)
            std::cout << data.records[k] << '\n';
    }
    else std::cerr << "\n*** Error: Could not open '" << fileName << "' for reading\n\n";
}
And yes, I used "ctime".
Upvotes: 1