BrianR
BrianR

Reputation: 65

C++ Read large data, parse, then write data

I'm trying to read a large dataset, format it the way I need, and then write it to another file. I'm trying to use C++ over SAS or STATA for the speed advantage. The data file are usually around 10gigabytes. And my current code takes over an hour to run (and then I kill it because I'm sure that something is very inefficient with my code.

Is there a more efficient way to do this? Maybe read the file into memory and then analyze it using the switch statements? (I have 32gb ram linux 64bit). Is it possible that reading, and then writing within the loop slows it down since it is constantly reading, then writing? I tried to read it from one drive, and then write to another in an attempt to speed this up.

Are the switch cases slowing it down?

The process I have now reads the data using getline, uses the switch statement to parse it correctly, and then writes it to my outfile. And repeats for 300 million lines. There are about 10 more cases in the switch statement, but I didn't copy for brevity's sake.

The code is probably very ugly all being in the main function, but I wanted to get it working before I worked on attractiveness.

I've tried using read() but without any success. Please let me know if I need to clarify anything.

Thank you for the help!

 #include <iostream>
 #include <fstream>
 #include <string>
 #include <sstream>
 #include <stdio.h>
 //#include <cstring>
 //#include <boost/algorithm/string.hpp>

 #include <vector>

  using namespace std;
 //using namespace boost;


 struct dataline
{
char type[0];
double second;
short mill;
char event[1];
char ticker[6];
char marketCategory[1];
char financialStatus[1];
int roundLotSize;
short roundLotOnly;
char tradingState[1];
char reserved[1];
char reason[4];
char mpid[4];
char primaryMarketMaker[1];
char primaryMarketMode[1];
char marketParticipantState[1];
unsigned long orderNumber;
char buySell[0];
double shares;
float price;
int executedShares;
double matchNumber;
char printable[1];
double executionPrice;
int canceledShares;
double sharesBig;
double crossPrice;
char crossType[0];
double pairedShares;
double imbalanceShares;
char imbalanceDirection[1];
double fairPrice;
double nearPrice;
double currentReferencePrice;
char priceVariationIndicator[1];
};

  int main () 
{
string a; 
string b;
string c;
string d;
string e;
string f;
string g;
string h;
string k;
string l;
string times;
string smalltimes;

short time;     //counter to keep second filled
short smalltime;    //counter to keep millisecond filled
double N;
double NN;
double NNN;
int length;
char M; 
//vector<> fout;
string line;

ofstream fout ("/media/3tb/test.txt");
ifstream myfile;
myfile.open("S050508-v3.txt");

dataline oneline;

if (myfile.is_open())
    {
    while ( myfile.good() )
        {
        getline (myfile,line);
//      cout << line<<endl;;

        a=line.substr(0,1);
        stringstream ss(a);
        char type;
        ss>>type;


        switch (type)
            { 
            case 'T':
                {
                if (type == 'T')
                    {
                    times=line.substr(1,5);
                    stringstream s(times);
                    s>>time;
                    //oneline.second=time;
                    //oneline.second;
                    //cout<<time<<endl;
                    }
                else
                    {
                    time=time;
                    }
                break;
                }
            case 'M':
                {
                if (type == 'M')
                    {
                    smalltimes=line.substr(1,3);
                    stringstream ss(smalltimes);
                    ss>>smalltime;      //oneline.mill;
                //  cout<<smalltime<<endl;                            //smalltime=oneline.mill;
                    }
                else
                    {
                    smalltime=smalltime;
                    }
                break;
                }


            case 'R':
                {
                oneline.second=time;
                oneline.mill=smalltime;

                a=line.substr(0,1);
                stringstream ss(a);
                ss>>oneline.type;

                b=line.substr(1,6);
                stringstream sss(b);
                sss>>oneline.ticker;

                c=line.substr(7,1);
                stringstream ssss(c);
                ssss>>oneline.marketCategory;

                d=line.substr(8,1);
                stringstream sssss(d);
                sssss>>oneline.financialStatus;

                e=line.substr(9,6);
                stringstream ssssss(e);
                ssssss>>oneline.roundLotSize;

                f=line.substr(15,1);
                stringstream sssssss(f);
                sssssss>>oneline.roundLotOnly;

                *oneline.tradingState=0;
                *oneline.reserved=0;
                *oneline.reason=0;
                *oneline.mpid=0;
                *oneline.primaryMarketMaker=0;
                *oneline.primaryMarketMode=0;
                *oneline.marketParticipantState=0;
                oneline.orderNumber=0;
                *oneline.buySell=0;
                oneline.shares=0;
                oneline.price=0;
                oneline.executedShares=0;
                oneline.matchNumber=0;
                *oneline.printable=0;
                oneline.executionPrice=0;
                oneline.canceledShares=0;
                oneline.sharesBig=0;
                oneline.crossPrice=0;
                *oneline.crossType=0;
                oneline.pairedShares=0;
                oneline.imbalanceShares=0;
                *oneline.imbalanceDirection=0;
                oneline.fairPrice=0;
                oneline.nearPrice=0;
                oneline.currentReferencePrice=0;
                *oneline.priceVariationIndicator=0;

                break;
                }//End Case 
            }//End Switch
            }//end While
    myfile.close();

     }//End If
else cout << "Unable to open file"; 
cout<<"Junk"<<endl;

return 0;
}

UPDATE So I've been trying to use memory map, but now I'm getting a segmentation fault. I've been trying to follow different examples to piece together something that would work for mine. Why would I be getting a segmentation fault? I've taken the first part of my code, which looks like this:

int main (int argc, char** path) 
 {
 long i;
 int fd;
 char *map;
 char *FILEPATH = path;
 unsigned long FILESIZE;
 FILE* fp = fopen(FILEPATH, "/home/brian/Desktop/S050508-v3.txt");
 fseek(fp, 0, SEEK_END);
 FILESIZE = ftell(fp);
 fseek(fp, 0, SEEK_SET);
 fclose(fp);
 fd = open(FILEPATH, O_RDONLY);

 map = (char *) mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);

 char z;
 stringstream ss;

 for (long i = 0; i <= FILESIZE; ++i) 
    {
    z = map[i];
    if (z != '\n') 
        {
        ss << z;
            }
    else 
        {
            // c style tokenizing
            ss.str("");
            }
        }
 if (munmap(map, FILESIZE) == -1) perror("Error un-mmapping the file");
 close(fd);

Upvotes: 2

Views: 2246

Answers (1)

Brian Cain
Brian Cain

Reputation: 14619

The data file are usually around 10gigabytes. ... Are the switch cases slowing it down?

Almost certainly not, smells like you're I/O bound. But you should consider measuring it. Modern CPUs have performance counters which are pretty easy to leverage with the right tools. But let's start to partition the problems into some major domains: I/O to devices, load/store to memory, CPU. You can place some markers in your code where you read a clock in order to understand how long each of the operations are taking. On linux you can use clock_gettime() or the rdtsc instruction to access a clock with higher precision than the OS tick.

Consider mmap/CreateFileMapping, either of which might provide better efficiency/throughput to the pages you're accessing.

Consider large/huge pages if streaming through large amounts of data which has already been paged in.

From the manual for mmap():

Description

mmap() creates a new mapping in the virtual address space of the calling process. The starting address for the new mapping is specified in addr. The length argument specifies the length of the mapping.

Here's an mmap() example:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

#define FILEPATH "/tmp/mmapped.bin"
#define NUMINTS  (1000)
#define FILESIZE (NUMINTS * sizeof(int))

int main(int argc, char *argv[])
{
    int i;
    int fd;
    int *map;  /* mmapped array of int's */

    fd = open(FILEPATH, O_RDONLY);
    if (fd == -1) {
    perror("Error opening file for reading");
    exit(EXIT_FAILURE);
    }

    map = mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
    close(fd);
    perror("Error mmapping the file");
    exit(EXIT_FAILURE);
    }

    /* Read the file int-by-int from the mmap
     */
    for (i = 1; i <=NUMINTS; ++i) {
    printf("%d: %d\n", i, map[i]);
    }

    if (munmap(map, FILESIZE) == -1) {
    perror("Error un-mmapping the file");
    }
    close(fd);
    return 0;
}

Upvotes: 1

Related Questions