S..
S..

Reputation: 335

C++ what is the best sorting container and approach for large datasets (millions of lines)

I'm tackling an exercise which is supposed to benchmark the time complexity of such code exactly.

The data I'm handling is made up of pairs of strings like hbFvMF,PZLmRb. Each string is present twice in the dataset, once in position 1 and once in position 2, so the first string would point to zvEcqe,hbFvMF for example, and the list goes on....

example dataset of 50k pairs

I've been able to produce code which doesn't have much problem sorting these datasets up to 50k pairs, where it takes about 4-5 minutes. 10k gets sorted in a matter of seconds.

The problem is that my code is supposed to handle datasets of up to 5 million pairs. So I'm trying to see what more I can do. I will post my two best attempts, the initial one with vectors, which I thought I could upgrade by replacing vector with unordered_map because of the better time complexity when searching, but to my surprise, there was almost no difference between the two containers when I tested it. I'm not sure if my approach to the problem or the containers I'm choosing are causing the steep sorting times...

Attempt with vectors:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>

using namespace std;


// Backward half of the chain reconstruction: starting from `resume`,
// repeatedly scan `vec` for the pair whose .second equals the current
// string, prepend that string to `vec2`, and continue from the pair's .first.
//
// NOTE(review): every step is a full linear rescan of `vec`, so the whole
// walk is O(n^2); together with the O(n) vec2.insert(begin()) prepend this
// dominates the runtime on large inputs.
// NOTE(review): after a match, `index` is reset to 0 but the trailing
// `index++` runs before the next comparison, so vec[0] is never re-examined
// on the restarted scan -- looks like an off-by-one; confirm against the
// expected output.
template<typename T>
void search_bricks_backwards(string resume, vector<T>& vec, vector<string>& vec2) {
    int index = 0;
    int temp_index = 0;   // position of the most recently matched pair
    while (true) {
        // Scanned the whole vector without a match: the chain has ended;
        // prepend the .first of the last matched pair and stop.
        if (index == vec.size()) {
            vec2.insert(vec2.begin(), vec[temp_index].first); 
            cout << "end of backward search, exitting..." << endl;
            break;


        }
        
        if (vec[index].second == resume) {
            // O(n) prepend into a vector -- shifts every element right.
            vec2.insert(vec2.begin(), resume);
            

            resume = vec[index].first;
            //vec.erase(vec.begin() + index);
            temp_index = index;

            index = 0;   // restart the scan from the beginning
        }
        
        index++;
    }

}


// Forward half: starting from `start`, repeatedly scan `vec` for the pair
// whose .first equals the current string, append it to `vec2`, and continue
// from the pair's .second. When the forward chain ends, the part of the
// chain preceding the starting pair is collected by the backward search.
//
// NOTE(review): same O(n^2) rescan pattern (and the same restart
// off-by-one after `index = 0; index++`) as search_bricks_backwards.
template<typename T>
void search_bricks(string start, vector<T>& vec, vector<string>& vec2) {
    int index = 0;
    int temp_index = 0;   // position of the most recently matched pair
    while (true) {
        //cout << "iteration " << index << endl;
        if (index == vec.size()) {
            // End of the forward chain: append the final .second and stop.
            vec2.push_back(vec[temp_index].second);
            
            cout << "all forward bricks sorted" << endl;
            break;


        }
        if (vec[index].first == start) {
            vec2.push_back(vec[index].first);
            
            
            start = vec[index].second;
            //vec.erase(vec.begin() + index);
            temp_index = index;
            index = 0;   // restart the scan from the beginning
            
        }
        
        index++;
    }

    // The forward walk started from an arbitrary pair; anything preceding
    // vec[0].first still has to be found by searching backwards.
    search_bricks_backwards(vec[0].first, vec, vec2);

}

// Alternative entry point (appears unused by main() below): scans for a
// pair whose .first equals `start`, records it, and hands off to the
// iterative search_bricks(); once the end of `vec` is reached it also runs
// the backward search. Despite its name it recurses into search_bricks(),
// not into itself.
//
// NOTE(review): inside the match branch `pair.first == start`, so the
// "found" message effectively prints the same string twice.
template<typename T>
void search_bricks_recursion(string start, vector<T>& vec, vector<string>& vec2) {
    int index = 0;
    for (const auto& pair : vec) {
        //cout << "iteration " << index << endl;
        if (pair.first == start) {
            vec2.push_back(start);
            cout << "found " << start << " and " << pair.first << endl;
            search_bricks(pair.second, vec, vec2);
        }
        // Last element reached: collect the part of the chain that comes
        // before `start` as well.
        if (index + 1 == vec.size()) {
            search_bricks_backwards(start, vec, vec2);
            

        }
        index++;
    }
    
}

// Print every element of `vec` as "(first,second) " on one line,
// terminated by a newline.
template<typename T>
void printVectorElements(vector<T>& vec)
{
    for (const auto& element : vec) {
        cout << "(" << element.first << ","
            << element.second << ")" << " ";
    }
    cout << endl;
}

// Split `s` into the tokens separated by `delimiter`.
// Leading/trailing/adjacent delimiters yield empty tokens; the remainder
// after the last delimiter (possibly the whole string) is always appended.
//
// Fix vs. the original: both strings are taken by const reference instead
// of by value -- the original copied each line (and the delimiter) on every
// call, once per input line.
vector<string> split(const string& s, const string& delimiter) {
    size_t pos_start = 0, pos_end;
    const size_t delim_len = delimiter.length();
    vector<string> res;

    // Carve out each token up to the next occurrence of the delimiter.
    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
        res.push_back(s.substr(pos_start, pos_end - pos_start));
        pos_start = pos_end + delim_len;
    }

    // Remainder after the last delimiter (or the whole string).
    res.push_back(s.substr(pos_start));
    return res;
}

// Parse "key,value" lines from `s` into an unordered_map (key -> value).
// Reading stops as soon as either the key (up to the comma) or the value
// (rest of the line) cannot be extracted.
unordered_map<string, string> brick_to_map(string const& s)
{
    unordered_map<string, string> m;

    istringstream iss(s);
    string key, val;

    for (;;) {
        if (!getline(iss, key, ','))
            break;
        iss >> ws;                 // skip whitespace before the value
        if (!getline(iss, val))
            break;
        m[key] = val;
    }

    return m;
}

// Driver for the vector attempt: reads "left,right" pairs from
// input-pairs-50K.txt into a vector of string pairs, reconstructs the chain
// starting from the first pair's right-hand side, and prints the result.
int main()
{
    vector<pair<string, string>> bricks;   // raw pairs as read from the file
    
    vector<string> sorted_bricks;          // reconstructed chain

    ifstream inFile;
    inFile.open("input-pairs-50K.txt"); //open the input file

    stringstream strStream;
    strStream << inFile.rdbuf(); //read the file
    string str = strStream.str(); //str holds the content of the file

    //cout << str << endl;
    
    istringstream iss(str);
    
    // One "s1,s2" pair per line.
    for (string line; getline(iss, line); )
    {
     
        string delimiter = ",";
        string s = line;
        vector<string> v = split(s, delimiter);
        string s1 = v.at(0);
        string s2 = v.at(1);
        

        bricks.push_back(make_pair(s1, s2));
    }

   
    // NOTE(review): no check that the file opened or that each line actually
    // contained a comma -- v.at(1) throws on a malformed line, and
    // bricks[0] is undefined if the file was empty.
    search_bricks(bricks[0].second, bricks, sorted_bricks);
    
    
    //display the results
    for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
        cout << *i << " ";


    
 
}

Attempt with unordered map:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>

using namespace std;


// Walk the chain backwards from `resume`: repeatedly look for the pair
// whose .second equals the current string, prepend the current string to
// `vec2`, and continue from that pair's .first. The final (unmatched)
// string is prepended before the loop exits.
//
// Fixes vs. the original:
//  - the map is taken by const reference instead of by value, which avoids
//    deep-copying the entire container on every call;
//  - the redundant re-check of the found element was removed (find_if
//    already guarantees the predicate holds).
//
// NOTE(review): the lookup is still a linear find_if scan (the map is keyed
// by .first, so the hash cannot help for reverse lookups), and
// vec2.insert(begin()) is an O(n) prepend; a second map keyed by the
// right-hand string plus a std::deque would make both steps O(1).
void search_bricks_backwards(string resume, const unordered_map<string, string>& brick_map, vector<string>& vec2) {
    
    typedef unordered_map<string, string>::value_type map_value_type;
    while (true) {

        auto got = find_if(brick_map.begin(), brick_map.end(), [&resume](const map_value_type& vt)
            { return vt.second == resume; }
        );

        // The current string is part of the answer whether or not a
        // predecessor exists.
        vec2.insert(vec2.begin(), resume);

        if (got == brick_map.end()) {
            cout << "end of backward search, exitting..." << endl;
            break;
        }

        resume = got->first;
    }

}


// Walk the chain forwards from `start`, appending each visited string to
// `vec2`, then reconstruct the part of the chain before the starting
// element via search_bricks_backwards().
//
// Fixes vs. the original:
//  - brick_map.find(start) (average O(1) hash lookup) replaces the O(n)
//    linear find_if scan -- the map is keyed by the left-hand string, so
//    the hash table can be used directly;
//  - the map is taken by const reference instead of by value (the original
//    deep-copied the whole map on every call);
//  - the redundant re-check of the found element was removed.
void search_bricks(string start, const unordered_map<string, string>& brick_map, vector<string>& vec2) {
    
    while (true) {

        auto got = brick_map.find(start);

        // Record the current string, then step to its successor (if any).
        vec2.push_back(start);

        if (got == brick_map.end()) {
            cout << "all forward bricks sorted" << endl;
            break;
        }

        start = got->second;
    }

    // The forward pass started from an arbitrary element, so everything
    // preceding it still has to be found by searching backwards.
    auto it = brick_map.begin();
    search_bricks_backwards(it->first, brick_map, vec2);

}


// Print every element of `vec` as "(first,second) " on one line,
// terminated by a newline.
template<typename T>
void printVectorElements(vector<T>& vec)
{
    for (const auto& element : vec) {
        cout << "(" << element.first << ","
            << element.second << ")" << " ";
    }
    cout << endl;
}

// Split `s` into the tokens separated by `delimiter`.
// Leading/trailing/adjacent delimiters yield empty tokens; the remainder
// after the last delimiter (possibly the whole string) is always appended.
//
// Fix vs. the original: both strings are taken by const reference instead
// of by value -- the original copied each line (and the delimiter) on every
// call, once per input line.
vector<string> split(const string& s, const string& delimiter) {
    size_t pos_start = 0, pos_end;
    const size_t delim_len = delimiter.length();
    vector<string> res;

    // Carve out each token up to the next occurrence of the delimiter.
    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
        res.push_back(s.substr(pos_start, pos_end - pos_start));
        pos_start = pos_end + delim_len;
    }

    // Remainder after the last delimiter (or the whole string).
    res.push_back(s.substr(pos_start));
    return res;
}


// Driver for the unordered_map attempt: the pairs are stored keyed by
// their left-hand string, the chain is reconstructed starting from an
// arbitrary element, and the result is printed.
int main()
{
    unordered_map<string, string> bricks;   // left -> right
    
    vector<string> sorted_bricks;           // reconstructed chain

    ifstream inFile;
    inFile.open("input-pairs-50K.txt"); //open the input file

    // One "s1,s2" pair per line.
    for (string line; getline(inFile, line); )
    {

        string delimiter = ",";
        string s = line;
        vector<string> v = split(s, delimiter);
        string s1 = v.at(0);
        string s2 = v.at(1);


        bricks.insert(make_pair(s1, s2));
    }


    /*for (auto& x : bricks)
        std::cout << x.first << "," << x.second << " ";*/


    // Start the forward walk from an arbitrary element of the map.
    // NOTE(review): dereferencing bricks.begin() is undefined if the file
    // was empty or could not be opened.
    auto it = bricks.begin();
    search_bricks(it->second, bricks, sorted_bricks);


    // display results
    for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
        cout << *i << " ";




}

I'm looking to improve the time complexity of my code to be able to process the data more efficiently. If anyone can suggest what to improve in my code or container-wise, I'd be very thankful.

Upvotes: 5

Views: 602

Answers (3)

A M
A M

Reputation: 15277

Late answer, but with minor improvements we can speed up the processing time by 100%. Or reduce the runtime by half.

Basically the approach chosen by WhozCraig is already nearly optimal.

I could improve the performance by nearly 100%, by using std::string_view and selecting a different string split approach. Most of the time is consumed with reading the source file and splitting the data. With my differently selected approach, I could save the most time. I added some additional small improvements.

I checked both algorithms with a big file with 5.000.000 pairs. I created a module, to generate such a test file. Please have a look in the below source code.

I will show my source code and the code given by "WhozCraig", slightly modified for benchmark measurements.

But first, please see the benchmark results as screenshots.

Test with the original small file given in the question. My solution: enter image description here

Solution of "WhozCraig" with small test file

enter image description here


Then I created a 5M pairs test file and run the same test. My approach

enter image description here

Solution of "WhozCraig" with big test file

enter image description here

Please note. Sorting big data with "WhozCraig"'s approach is even faster.

But reading is slower.

I did not investigate why.


Of course, all this is machine and compiler dependent. I ran the test on a 10 years old Windows 7 machine. The compiler is Visual Studio 2019 community editions. Compiled in release mode.


Please see code examples below:

#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <unordered_map>
#include <random>
#include <string_view>
#include <deque>
#include <iterator>
#include <unordered_set>
#include <algorithm>
#include <array>
#include <utility>

// ----------------------------------------------------------------------------------------------------------------
// Generate test data
// We will generate that many strings
constexpr size_t MaxPairs = 5'000'000u;

// The strings will all be 6 characters long and consist of upper- and lowercase letters
constexpr size_t StringLength = 6u;
constexpr std::array<char, 52> Letter{ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                                       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

// Filename for test data
const std::string testFileName{"test.txt"};

// Create a file with test date following the given pattern
// Create a file with test data following the given pattern:
// MaxPairs lines of "left,right" forming one single chain of
// StringLength-character strings, written out in shuffled order.
void createTestData() {

    // We need random numbers to select a letter from above string
    std::random_device randomDevice;
    std::mt19937 randomGenerator(randomDevice());
    std::uniform_int_distribution<> randomIntDistribution(0, static_cast<int>(Letter.size()) - 1);

    // Function to create a random string
    // (fixed: use StringLength instead of a hard-coded 6, so the generator
    // stays consistent if the constant is ever changed)
    auto generateString = [&]() -> std::string
    {
        std::string s(StringLength, 0);
        for (size_t i{}; i < StringLength; ++i) 
            s[i] = Letter[randomIntDistribution(randomGenerator)]; 
        return s; };

    // We want to have unique, not sorted strings. So, use a std::unordered_set
    std::unordered_set<std::string> uniqueStrings{};

    // Now generate the requested numbers of unique strings (a chain of
    // MaxPairs links needs MaxPairs + 1 distinct nodes)
    while (uniqueStrings.size() < MaxPairs + 1) 
        uniqueStrings.insert(generateString());
    
    // Move them to a std::vector for random access possibility
    std::vector<std::string> sss(std::make_move_iterator(uniqueStrings.begin()), std::make_move_iterator(uniqueStrings.end()));

    // Now we create a vector of pairs of indices, like 0->1, 1->2, 2->3 and so on
    // (fixed: loop with size_t to avoid signed/unsigned comparison)
    std::vector<std::pair<int, int>> index(MaxPairs);
    for (size_t i = 0; i < MaxPairs; ++i) {
        index[i].first = static_cast<int>(i);
        index[i].second = static_cast<int>(i) + 1;
    }
    // This we will shuffle
    std::shuffle(index.begin(), index.end(), randomGenerator);

    // And then store the data in a text file
    if (std::ofstream ofs{ testFileName }; ofs) {
        for (size_t i = 0; i < MaxPairs; ++i)
            ofs << sss[index[i].first] << ',' << sss[index[i].second] << '\n';
    }
}

// We make the bucket size 3 times bigger than the number of elements.
// Maybe, but actually I do not know, this will reduce collisions.
constexpr int MaxBucket = 16'777'216u;

// A very small speed boost by a bigger input buffer
constexpr size_t ifStreamBufferSize = 1'000'000u;
static char buffer[ifStreamBufferSize];

// And a simpler and faster hash function
// Simple multiplicative string hash (base 101) over the bytes of the view.
// Cheaper than the default std::hash for these short, fixed-length keys.
struct MyHash {
    size_t operator()(const std::string_view& s) const {
        size_t h{};
        for (std::size_t i = 0; i < s.size(); ++i)
            h = h * 101 + s[i];
        return h;
    }
};

// Benchmark driver: reads the whole file into one string, splits it into
// string_views stored in two mirror hash maps (left->right, right->left),
// reconstructs the chain in both directions, and prints timings.
int main() {
    // Please uncomment, if you want to create test Data
    //createTestData();

    // Start runtime measurement
    auto tp1 = std::chrono::high_resolution_clock::now();

    // Open source file and check if it could be opened
    if (std::ifstream ifs{ testFileName }; ifs) {

        // Set bigger file read buffer. Not sure, if it has really an effect.
        ifs.rdbuf()->pubsetbuf(buffer, ifStreamBufferSize);

        // Read complete source file into a string. All string_views below
        // point into this buffer, so it must outlive the maps and the result.
        std::string text(std::istreambuf_iterator<char>(ifs), {});

        // We want to identify the pair. A pair of a relation consists of a left and right part. Start of course with left
        bool readLeftSideOfRelation = true;

        // Start value for data analysis
        char* currentCharPtr = text.data();
        // End of string
        const char* const textEndPtr = currentCharPtr + text.length();

        // Define string views for the left and the right side of the relation 
        std::string_view firstView(currentCharPtr, StringLength);
        std::string_view secondView;

        // This is the simulation of a bimap
        std::unordered_map<std::string_view, std::string_view, MyHash> map1{};
        std::unordered_map<std::string_view, std::string_view, MyHash> map2{};
        // A little bit of speed gain by reserving buffer space
        map1.reserve(MaxBucket);
        map2.reserve(MaxBucket);

        // Split the strings, create string_views and store in maps.
        // NOTE(review): `count` is the number of characters scanned since
        // the previous delimiter, i.e. the length of the word just read;
        // using it as the length of the *next* view is only correct because
        // every string in the file has the same length (StringLength).
        size_t count = 0;
        for (; currentCharPtr < textEndPtr; ++currentCharPtr, ++count) {

            // Split criterion: ',', '\n' and '\r' all compare below 'A'
            if (*currentCharPtr < 'A') {

                // Set pointer to one after the delimiting character
                ++currentCharPtr;

                // Either we are just reading the left or the right side
                if (readLeftSideOfRelation)
                    secondView = { currentCharPtr , count };
                else
                {
                    // Now we read the right side of the relation. And with that a pair
                    // Store relation in each direction
                    map1[firstView] = secondView;
                    map2[secondView] = firstView;

                    // Start of next left side
                    firstView = { currentCharPtr , count };
                }
                // Next, we want to read the other part of the relation
                readLeftSideOfRelation = not readLeftSideOfRelation;
                count = 0;
            }
        }
        // Here we will store the sorted result.
        // std::deque: O(1) insertion at both the front and the back.
        std::deque<std::string_view> result{};

        auto tp3 = std::chrono::high_resolution_clock::now();

        // Set parts of the relation
        std::string_view related = map1.begin()->second;
        std::string_view relatedSave = map1.begin()->first;

        // Go through the sequence (forward direction). Each visited entry
        // is erased so the walk terminates and memory shrinks as we go.
        for (;;) {
            auto it = map1.find(related);
            result.push_back(related);
            if (it == map1.end())
                break;
            related = it->second;
            map1.erase(it);
        }
        // We reached the end, search in the other direction
        related = relatedSave;
        for (;;) {
            auto it = map2.find(related);
            result.push_front(related);

            if (it == map2.end())
                break;
            related = it->second;
            map2.erase(it);
        }

        // Show timing result
        auto tp4 = std::chrono::high_resolution_clock::now();
        std::cout << "Sort: " << duration_cast<std::chrono::milliseconds>(tp4 - tp3).count() << "ms\n";

        auto tp2 = std::chrono::high_resolution_clock::now();
        std::cout << "Overall: " << duration_cast<std::chrono::milliseconds>(tp2 - tp1).count() << "ms\n";
    
        std::cout << "Count:  " << result.size() << '\n';
    }
}


Solution of "WhozCraig", instrumented for aditional runtime measurement and used for the tests.

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <deque>
#include <algorithm>
#include <chrono>

// Prepend `resume` and every predecessor reachable through `rbricks`
// (keyed right -> left) onto the front of `dst`, consuming each map entry
// as it is visited so the walk terminates and memory shrinks.
void search_bricks_backwards
(
    std::string resume,
    std::unordered_map<std::string, std::string>& rbricks,
    std::deque<std::string>& dst
)
{
    for (;;)
    {
        const auto hit = rbricks.find(resume);
        const bool chainEnded = (hit == rbricks.end());

        // The current key belongs to the answer either way.
        dst.emplace_front(std::move(resume));

        if (chainEnded)
            return;

        resume = hit->second;
        rbricks.erase(hit);
    }
}

// Forward walk: follow left -> right links starting from the second half
// of an arbitrary map entry, appending each visited string to `dst` and
// erasing consumed entries; then hand the remembered first half to the
// backward walk over the reverse-keyed map.
void search_bricks
(
    std::unordered_map<std::string, std::string>& bricks,
    std::unordered_map<std::string, std::string>& rbricks,
    std::deque<std::string>& dst
)
{
    // remember these before the map starts shrinking
    std::string forward = bricks.begin()->second;
    std::string backtrackFrom = bricks.begin()->first;

    for (;;)
    {
        const auto hit = bricks.find(forward);
        const bool chainEnded = (hit == bricks.end());

        dst.emplace_back(std::move(forward));

        if (chainEnded)
            break;

        forward = hit->second;
        bricks.erase(hit);
    }
    // same search, but different keyed map
    search_bricks_backwards(backtrackFrom, rbricks, dst);
}


// Benchmark driver: reads the test file into two mirror maps
// (left->right and right->left), reconstructs the chain, and prints the
// sort time, the overall time, and the element count.
int main()
{
    using namespace std::chrono;
    //createTestData();
    auto tp0 = high_resolution_clock::now();

    std::unordered_map<std::string, std::string> bricks, rbricks;
    std::deque<std::string> sorted_bricks;

    std::ifstream inFile("test.txt");
    if (inFile.is_open())
    {
        for (std::string line; std::getline(inFile, line);)
        {
            // make damn sure we get two keys
            std::istringstream iss(line);
            std::string s1, s2;
            if (std::getline(iss, s1, ',') &&
                std::getline(iss, s2) &&
                !s1.empty() &&
                !s2.empty())
            {
                // Store the relation in both directions (simulated bimap).
                bricks.emplace(std::make_pair(s1, s2));
                rbricks.emplace(std::make_pair(s2, s1));
            }
        }
        auto tp3 = high_resolution_clock::now();

        search_bricks(bricks, rbricks, sorted_bricks);

        auto tp4 = high_resolution_clock::now();
        auto tp1 = high_resolution_clock::now();
        std::cout << "\nSort:  "<< duration_cast<milliseconds>(tp4 - tp3).count() << "ms\n";
        std::cout << "Overall:  " << duration_cast<milliseconds>(tp1 - tp0).count() << "ms\n";

        std::cout << "Count:  " << sorted_bricks.size() << '\n';
    }
}


In an additional answer from user "subspring", a trie was proposed.

I think it is not feasible. Look at the potential memory consumption.

We have upper- and lowercase characters. So, overall 52 characters. For each we would need a 4 byte pointer + 1 pointer for related string. That is 53*4 for one level of the trie. The string length is 6. And we need 2 tries. So, the overall memory consumption for a classical tree would be:

((52+1)*4) ^ 6 * 2 = 181'570'446'368'768 ~182 TB.

The 128GB RAM that is in my computer is ridiculously small compared to this number.

For that reason I implemented a trie with a space-optimized design, using std::unordered_maps as link elements.

With that I could reduce the RAM memory consumption to 25GB.

But also here the result was disappointing.

Even with good will, it was at least 3 times slower . . .

See the example code:

#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <unordered_map>
#include <vector>
#include <random>
#include <unordered_set>
#include <array>
#include <deque>
#include <list>
// ----------------------------------------------------------------------------------------------------------------
// Generate test data
// We will generate that many strings
constexpr size_t MaxPairs = 1'000'000u;
//constexpr size_t MaxPairs = 500000u;

// The strings will all be 6 characters long and consist of upper- and lowercase letters
constexpr size_t StringLength = 6u;
constexpr std::array<char, 52> Letter{ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                                       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

// Filename for test data
const std::string testFileName{ "r:\\text1m.txt" };

// Create a file with test date following the given pattern
// Create a file with test data following the given pattern:
// MaxPairs lines of "left,right" forming one single chain of
// StringLength-character strings, written out in shuffled order.
void createTestData() {

    // We need random numbers to select a letter from above string
    std::random_device randomDevice;
    std::mt19937 randomGenerator(randomDevice());
    std::uniform_int_distribution<> randomIntDistribution(0, (int)Letter.size() - 1);

    // Function to create a random string
    // (fixed: use StringLength instead of a hard-coded 6, so the generator
    // stays consistent if the constant is ever changed)
    auto generateString = [&]() -> std::string
    {
        std::string s(StringLength, 0);
        for (size_t i{}; i < StringLength; ++i)
            s[i] = Letter[randomIntDistribution(randomGenerator)];
        return s; };

    // We want to have unique, not sorted strings. So, use a std::unordered_set
    std::unordered_set<std::string> uniqueStrings{};

    // Now generate the requested numbers of unique strings (a chain of
    // MaxPairs links needs MaxPairs + 1 distinct nodes)
    while (uniqueStrings.size() < MaxPairs + 1)
        uniqueStrings.insert(generateString());

    // Move them to a std::vector for random access possibility
    std::vector<std::string> sss(std::make_move_iterator(uniqueStrings.begin()), std::make_move_iterator(uniqueStrings.end()));

    // Now we create a vector of pairs of indices, like 0->1, 1->2, 2->3 and so on
    // (fixed: loop with size_t to avoid signed/unsigned comparison)
    std::vector<std::pair<int, int>> index(MaxPairs);
    for (size_t i = 0; i < MaxPairs; ++i) {
        index[i].first = static_cast<int>(i);
        index[i].second = static_cast<int>(i) + 1;
    }
    // This we will shuffle
    std::shuffle(index.begin(), index.end(), randomGenerator);

    // And then store the data in a text file
    if (std::ofstream ofs{ testFileName }; ofs) {
        for (size_t i = 0; i < MaxPairs; ++i)
            ofs << sss[index[i].first] << ',' << sss[index[i].second] << '\n';
    }
}


// One node of the character trie. `link` maps the next character to the
// child node; `related` holds a raw pointer into the file text buffer to
// the partner string of the word ending in this node (null for interior
// nodes, or until set by the caller of addWord()).
struct TrieNode;
using CharacterWithLink = std::unordered_map<char, TrieNode*>;
using LinkIter = CharacterWithLink::iterator;

struct TrieNode {
    const char* related{};      // partner string; null when not a word end
    CharacterWithLink link{};   // children, keyed by next character
};

// Space-optimized trie: children live in per-node std::unordered_maps
// instead of fixed 52-entry arrays, and all nodes come from one
// pre-allocated pool (`memory`) handed out via the bump index `index`.
// NOTE(review): `iter`, `tnp` and `beginOfWord` are member scratch
// variables, which makes addWord()/find() non-reentrant and not
// thread-safe.
struct Trie {
    TrieNode root{};
    const char* beginOfWord{};
    LinkIter iter{};
    TrieNode* tnp{};
    std::vector< TrieNode> memory{};   // node pool; sized once in the ctor
    size_t index{};                    // next free slot in `memory`

    // Empirically chosen pool size (~3.6 nodes per pair).
    static constexpr size_t MaxMemory = static_cast<size_t>((float)MaxPairs * 3.6);
    //static constexpr size_t MaxMemory = static_cast<size_t>((float)MaxPairs * 10);
    Trie() { memory.resize(MaxMemory); }
    ~Trie() {}

    // Insert the word starting at `characterOfWord` (terminated by the
    // first character < 'A', e.g. ',' or '\n') and return its terminal
    // node so the caller can attach the related string.
    // NOTE(review): memory[index++] is not bounds-checked against
    // MaxMemory; an input needing more nodes than the 3.6 factor allows
    // would run past the pool -- confirm the factor for real data.
    TrieNode* addWord(const char *characterOfWord) {
        tnp = &root;
        beginOfWord = characterOfWord;

        while (*characterOfWord >= 'A') {
            iter = tnp->link.find(*characterOfWord);

            // Create the child on first sight of this character.
            if (iter == tnp->link.end())
                tnp->link[*characterOfWord] = &memory[index++];
            
            tnp = tnp->link[*characterOfWord];
            ++characterOfWord;
        }
        //tnp->related = beginOfWord;
        return tnp;
    }
    // Look up the word starting at `characterOfWord` and return the stored
    // partner string. Returns null when the word is absent (early break
    // leaves `related` of an interior node, which is null) or when no
    // partner was attached.
    const char* find(const char* characterOfWord) {
        tnp = &root;
        const char* result{};
        while (*characterOfWord >= 'A') {
            iter = tnp->link.find(*characterOfWord);

            if (iter == tnp->link.end()) break;

            tnp = tnp->link[*characterOfWord];
            ++characterOfWord;
        }
        result = tnp->related;
        return result;
    }
};

// Two tries forming a bidirectional relation: `left` maps left -> right
// words and `right` maps right -> left. Both store raw pointers into the
// file text buffer passed to fill(), so that buffer must outlive the tries.
struct BiRelationalTrie {
    Trie left{};
    Trie right{};

    void fill(std::string& text);
};

// Parse the whole file text ("left,right\n" per line) and insert every
// pair into both tries: left->right into `left` and right->left into
// `right`. The tries keep raw pointers into `text`, so `text` must stay
// alive and unmodified for as long as the tries are used.
void BiRelationalTrie::fill(std::string & text) {

    const char* currentCharPtr{ text.data() };
    const char* textEndPtr{ text.data() + text.length() };

    // We want to identify the pair. A pair of a relation consists of a left and right part. Start of course with left
    bool readLeftSideOfRelation = true;

    const char* leftWordPtr{ text.data() };
    const char* rightWordPtr{};

    for(; currentCharPtr < textEndPtr; ++currentCharPtr) {

        // Split criterion: ',' and '\n' both compare below 'A', matching
        // addWord()'s terminator test.
        if (*currentCharPtr < 'A') {

            // Set pointer to one after the delimiting character
            ++currentCharPtr;

            // Either we are just reading the left or the right side
            if (readLeftSideOfRelation)

                // We were reading the left side; now set the start pointer for the right side
                rightWordPtr = currentCharPtr;
            else
            {
                // Now we have read the right side of the relation. And with that a complete pair
                // Store relation in each direction
                left.addWord(leftWordPtr)->related = rightWordPtr;
                right.addWord(rightWordPtr)->related = leftWordPtr;

                // Set start of next left side
                leftWordPtr = currentCharPtr;
            }
            // Next, we want to read the other part of the relation
            readLeftSideOfRelation = not readLeftSideOfRelation;
        }
    }
    // Diagnostic: number of trie nodes consumed by the left trie.
    std::cout << left.index << '\n';
}
// A very small speed boost by a bigger input buffer
constexpr size_t ifStreamBufferSize = 1'000'000u;
static char buffer[ifStreamBufferSize];



// Benchmark driver for the trie approach: read the file into one string,
// fill the bidirectional trie with pointers into it, then reconstruct the
// chain in both directions and print timings.
int main() {
    //createTestData();
    // Start runtime measurement 
    
    
    BiRelationalTrie biRelationalTrie{};

    // Here we will store the sorted result (pointers into `text` below)
    std::deque<const char *> result{};

    auto tp1 = std::chrono::high_resolution_clock::now();

    // Open source file and check if it could be opened
    if (std::ifstream ifs{ testFileName }; ifs) {

        // Set bigger file read buffer. Not sure, if it has really an effect.
        ifs.rdbuf()->pubsetbuf(buffer, ifStreamBufferSize);

        // Read complete source file into a string
        std::string text(std::istreambuf_iterator<char>(ifs), {});

        biRelationalTrie.fill(text);



        auto tp3 = std::chrono::high_resolution_clock::now();

        // Set parts of the relation.
        // NOTE(review): the +7 offset hard-codes StringLength (6) plus one
        // delimiter, i.e. the second word of the first line -- this breaks
        // if StringLength ever changes; confirm.
        const char *related = text.data()+7;
        const char* relatedSave = text.data();

        // Go through the sequence (forward direction)
        for (;;) {
            related = biRelationalTrie.left.find(related);
            if (related == nullptr)
                break;
            result.push_back(related);
        }
        // We reached the end, search in the other direction
        related = relatedSave;
        for (;;) {
            related = biRelationalTrie.right.find(related);
            if (related == nullptr)
                break;
            result.push_front(related);
        }

        // Show timing result
        auto tp4 = std::chrono::high_resolution_clock::now();
        std::cout << "Sort: " << std::chrono::duration_cast<std::chrono::milliseconds>(tp4 - tp3).count() << "ms\n";
        std::cout << result.size() << '\n';

        auto tp2 = std::chrono::high_resolution_clock::now();
        std::cout << "Overall: " << std::chrono::duration_cast<std::chrono::milliseconds>(tp2 - tp1).count() << "ms\n";
    }
}

Upvotes: 1

WhozCraig
WhozCraig

Reputation: 66194

First a purpose-driven analogy to what is really being done here. This problem sometimes comes up in interview questions. It is often phrased as a cluster of bus tickets:

You're walking down the street with a thick stack of bus tickets for your whirlwind tour of the surrounding countries. Each ticket has a starting city, and an ending city. You accidentally drop the tickets on the ground and they blow all over the place. You pick them up, but then realize they're out of order. Your task is to put them back in order so you don't have to search the stack each time you need to use the next ticket.

An example, where Sn is a bus station ID, and Sn->Sm denotes a trip from station Sn to station Sm. Given the following four tickets covering five stations in no particular order:

S4->S2
S1->S5
S3->S4
S5->S3

the proper order can be thought of like this:

S1->S5
    S5->S3
        S3->S4
            S4->S2

And therefore, the correct "sorted" order is

S1
S5
S3
S4
S2

Algorithm Analysis

  • The biggest killer in your algorithm are those sequence scans. Maps offer key lookup operations. A map find member is what makes them tick (and shine). If you want to make this scream, you need to use keying to do so.
  • A close runner-up to that punishing sequential scan is the exorbitant expense of pre-pending to a std::vector, which can get ridiculously expensive fast. Vectors are sequential contiguous storage. Appending to them is not quite as bad because their sub-allocators tend to over-allocate on expansion to leave room for a few more push-backs before having to reallocate. But prepending to them is dreadful.

I scanned the source test you provided. There are actually 49999 rows in the data, not 50000. After some trial, error, head scratching, etc., I discovered this:

  • bWyUVV appears only once, as a left side.
  • EZkYGM appears in the file only once, as a right side

These must be the terminals of the sorted list. If everything plays out and the data is pristine, the final list will start with bWyUVV, and end with EZkYGM. Not helpful for writing the algorithm, but definitely helpful for validating we did something right.


Performance Improvements All Around

Stripping a lot of code out of this while still keeping the basic premise, consider the following:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <deque>
#include <algorithm>
#include <chrono>

// Walk the reverse-keyed map from `resume` towards the start of the chain,
// pushing each visited string onto the FRONT of `dst` (O(1) on a deque)
// and erasing consumed entries so the walk terminates.
void search_bricks_backwards
(
    std::string resume, 
    std::unordered_map<std::string, std::string>& rbricks, 
    std::deque<std::string>& dst
) 
{
    for (;;)
    {
        const auto entry = rbricks.find(resume);
        const bool atStart = (entry == rbricks.end());

        // The current key is part of the result in either case.
        dst.emplace_front(std::move(resume));

        if (atStart)
            return;

        resume = entry->second;
        rbricks.erase(entry);
    }
}

// Forward walk: starting from the second half of an arbitrary map entry,
// follow left -> right links with O(1) hash lookups, appending each visited
// string to `dst` and erasing consumed entries; then hand the remembered
// first half to the backward walk over the reverse-keyed map.
void search_bricks
(
    std::unordered_map<std::string, std::string>& bricks, 
    std::unordered_map<std::string, std::string>& rbricks, 
    std::deque<std::string>& dst
) 
{
    // remember these before the map starts shrinking
    std::string nextKey = bricks.begin()->second;
    std::string firstKey = bricks.begin()->first;

    for (;;)
    {
        const auto entry = bricks.find(nextKey);
        const bool chainEnded = (entry == bricks.end());

        dst.emplace_back(std::move(nextKey));

        if (chainEnded)
            break;

        nextKey = entry->second;
        bricks.erase(entry);
    }

    // same search, but different keyed map
    search_bricks_backwards(firstKey, rbricks, dst);
}


// Load comma-separated pairs from input-pairs-50K.txt into a forward and a
// reverse keyed map, reassemble the chain, and print timing plus the
// ordered result.
int main()
{
    using namespace std::chrono;

    std::unordered_map<std::string, std::string> bricks, rbricks;
    std::deque<std::string> sorted_bricks;

    std::ifstream inFile("input-pairs-50K.txt");
    if (!inFile.is_open())
    {
        // Report the failure instead of silently producing no output.
        std::cerr << "error: cannot open input-pairs-50K.txt\n";
        return 1;
    }

    for (std::string line; std::getline(inFile, line);)
    {
        // make damn sure we get two keys
        std::istringstream iss(line);
        std::string s1, s2;
        if (std::getline(iss, s1, ',') && 
            std::getline(iss, s2) &&
            !s1.empty() &&
            !s2.empty())
        {
            // Double-key each pair: s1==>s2 for the forward walk,
            // s2==>s1 for the backward walk. emplace constructs the
            // pair in place; the second insert can steal the strings.
            bricks.emplace(s1, s2);
            rbricks.emplace(std::move(s2), std::move(s1));
        }
    }

    auto tp0 = high_resolution_clock::now();
    search_bricks(bricks, rbricks, sorted_bricks);
    auto tp1 = high_resolution_clock::now();

    std::cout << duration_cast<milliseconds>(tp1-tp0).count() << "ms\n";

    // display results
    int n = 0;
    for (const auto& brick : sorted_bricks)
        std::cout << ++n << ". " << brick << '\n';
}

The most notable differences:

  • Using std::deque<std::string> as the sorted results. You were paying a dear price for those vector placements. All of your vector operations were either pushing on the back (ok for reserved vectors) or pushing on the front (dreadful performance in vectors). The std::deque is specialized for very fast front and back insertion and pruning. We're not doing the latter, but are heavily doing the former.

  • Two maps, keying s1==>s2 and s2==>s1 respectively. There are third-party containers that do this for you (ex: boost::bimap). For this, however, considering the key sizes, just storing two maps is easy.

  • Move semantics are used (though not much) where appropriate

  • During searches, each discovered key is deleted from the map just-searched. This ensures we stop when we're supposed to, and recovers memory footprint fairly quickly.

  • Reference parameters where applicable. These containers are effectively destroyed while building the result set, but that's ok (and in fact warranted to properly detect a terminal on one end or the other).

The double-keying makes a world of difference. A release build using your test data set on a puny little four-year-old macbook pro laptop produces the following (which you can verify with your known-answer tests).


All code is built with -O2 optimization running Apple clang version 13.0.0 (clang-1300.0.29.30)

std::unordered_map

34ms
1. bWyUVV
2. mPBGRC
3. WfkvWy
4. vjWNHY
5. HudtyD
6. DhxjdV
7. kdWhGX
8. tIsDXh
9. eMVeMX
10. fVQoeG

... 49980 lines omitted ...

49990. YfBDnP
49991. sHKVrT
49992. ZzhoZV
49993. Dyunmj
49994. KCQbpj
49995. rbMSgD
49996. WKOksU
49997. qqMTnq
49998. llrqUI
49999. XYpBnk
50000. EZkYGM

That timing figure is fairly consistent, though there were outliers as high as 42ms and as low as 28ms in my testing. The same harness using a regular std::map resulted in:

std::map

92ms
1. bWyUVV
2. mPBGRC
3. WfkvWy
4. vjWNHY
5. HudtyD
6. DhxjdV
7. kdWhGX
8. tIsDXh
9. eMVeMX
10. fVQoeG

... 49980 lines omitted ...

49990. YfBDnP
49991. sHKVrT
49992. ZzhoZV
49993. Dyunmj
49994. KCQbpj
49995. rbMSgD
49996. WKOksU
49997. qqMTnq
49998. llrqUI
49999. XYpBnk
50000. EZkYGM

So you're definitely using the right container with an unordered keying; you just weren't actually using the keying for a non-trivial portion of your algorithm, making the container basically no better than a sequential scan. That jibes with your assessment that it really wasn't any better than a vector sequential scan solution; you were basically doing that regardless.

I suspect performance shown above should be able to run sets of your expected 5-million pairs, so long as you can keep it all in memory (twice, sorry about that, but the results show pretty candidly that it is worth that price).


More Memory Friendly (at least a little)

The following does the same thing, but without all the added output, and uses a single called function with the map transposed mid-flight. This alleviates the need for two maps from inception. The overall performance is comparable to the code above; just a different, more memory-friendly, implementation.

#include <iostream>
#include <fstream>
#include <algorithm>
#include <string>
#include <sstream>
#include <deque>
#include <unordered_map>
#include <chrono>

using map_type = std::unordered_map<std::string, std::string>;

// Reassemble the full chain from an unordered pile of (left, right) pairs.
//
// Phase 1 walks forward from an arbitrary starting pair, appending bricks
// until the right-hand terminal is reached. The leftover entries are then
// transposed (value ==> key) so phase 2 can walk the other direction,
// prepending bricks until the left-hand terminal is reached. Only one map
// exists at a time, reducing peak memory versus keeping forward and
// reverse maps from inception.
//
// bricks - forward-keyed map (s1 ==> s2); destructively consumed
// returns the bricks in chain order, or an empty deque for empty input
std::deque<std::string> sort_bricks( map_type& bricks )
{
    std::deque<std::string> dst;

    // Guard: dereferencing begin() of an empty map is undefined behavior.
    if (bricks.empty())
        return dst;

    // remember these
    std::string start = bricks.begin()->second;
    std::string resume = bricks.begin()->first;

    // process by append
    while (true) 
    {
        auto it = bricks.find(start);
        dst.emplace_back(std::move(start));

        if (it == bricks.end()) 
            break;

        start = std::move(it->second);
        bricks.erase(it);
    }

    // invert the remaining map (the keys are const, so they are copied;
    // the mapped strings can be moved)
    {
        map_type mres;
        for (auto& pr : bricks)
            mres.emplace(std::move(pr.second), pr.first);
        std::swap(bricks, mres);
    }
    
    // process by prepend
    while (true) 
    {
        auto it = bricks.find(resume);
        dst.emplace_front(std::move(resume));

        if (it == bricks.end()) 
            break;

        resume = std::move(it->second);
        bricks.erase(it);
    }

    return dst;
}

// Load comma-separated pairs from input-pairs-50K.txt, reassemble the
// chain with sort_bricks(), and report the result size and elapsed time.
int main()
{
    using namespace std::chrono;

    std::ifstream inFile("input-pairs-50K.txt");
    if (!inFile.is_open())
    {
        // Report the failure instead of silently producing no output.
        std::cerr << "error: cannot open input-pairs-50K.txt\n";
        return 1;
    }

    map_type bricks;
    for (std::string line; std::getline(inFile, line);)
    {
        std::istringstream iss(line);
        std::string s1, s2;
        if (std::getline(iss, s1, ',') && 
            std::getline(iss, s2) &&
            !s1.empty() &&
            !s2.empty())
        {
            // Construct the pair in place, stealing both strings.
            bricks.emplace(std::move(s1), std::move(s2));
        }
    }

    auto tp0 = high_resolution_clock::now();
    auto sorted_bricks = sort_bricks(bricks);
    auto tp1 = high_resolution_clock::now();

    std::cout << "Size: " << sorted_bricks.size() << '\n';
    std::cout << "Time: " << duration_cast<milliseconds>(tp1-tp0).count() << "ms\n";
}

Upvotes: 3

subspring
subspring

Reputation: 700

You can use a trie data structure, here's a paper that explains an algorithm to do that: https://people.eng.unimelb.edu.au/jzobel/fulltext/acsc03sz.pdf

But you have to implement the trie from scratch because as far as I know there is no default trie implementation in c++.

Upvotes: 1

Related Questions