Reputation: 2107
I am trying to solve the following problem: Given a vector V[] of integers, both positive and negative, a number N is paired with its negative counterpart, -N. For every such pair in V[], push the positive integer of the pair into a result vector and return it.
Example:
If input is V = [1,-1,0,2,-3,3]
return [1,3]
I tried to solve this problem in 3 flavors:
Single Threaded | Runtime: 404000
Multithreaded coarse-grained lock | Runtime: 39882000
My idea with fine-grained locking is to update memory at discrete memory locations based upon the input.
I see that my multithreaded coarse-grained lock performs worse than the single-threaded one (which is kind of expected). What I don't understand is why my multithreaded fine-grained lock performs, most of the time, even worse than the coarse-grained version, and poorly compared to the single-threaded one. I expected the multithreaded fine-grained lock to perform better than the single-threaded version.
What is wrong with my implementation? What am I doing wrong? How can I improve the performance of this code with multithreading?
#include <iostream>
#include <unordered_map>
#include <vector>
#include <atomic>
#include <mutex>
#include <thread>
#include <chrono>
#include <cstdlib>
#include <memory>

using namespace std;

class Solution
{
private:
    const static uint32_t THREAD_N = 5;
    unordered_map<uint32_t, int32_t> records;
    vector<uint32_t> results;
    vector<atomic<uint32_t>> atm_results;
    mutex mut[THREAD_N];
    mutex mutrec;
    bool bzero;
public:
    Solution(): bzero(true) {
        records.reserve(100);
    }
    void InsertVal(const vector<int32_t> &vin)
    {
        for (auto iter : vin) {
            if (iter < 0)
            {
                if (records[0-iter] > 0) results.emplace_back(0-iter);
                records[0-iter]--;
            }
            else if (iter > 0)
            {
                if (records[iter] < 0) results.emplace_back(iter);
                records[iter]++;
            }
            else
            {
                bzero = !bzero;
                if (bzero) {
                    results.emplace_back(0);
                }
            }
        }
    }
    void InsertValEach(const int32_t &val)
    {
        lock_guard<mutex> lock(mutrec); // single block of lock
        if (val < 0)
        {
            if (records[0-val] > 0) results.emplace_back(0-val);
            records[0-val]--;
        }
        else if (val > 0)
        {
            if (records[val] < 0) results.emplace_back(val);
            records[val]++;
        }
        else
        {
            bzero = !bzero;
            if (bzero) {
                results.emplace_back(0);
            }
        }
    }
    void InsertValEachFree(const int32_t &val)
    {
        if (val < 0)
        {
            lock_guard<mutex> lock(mut[(0-val)%THREAD_N]); // finer lock based on input
            if (records[0-val] > 0)
            {
                lock_guard<mutex> l(mutrec); // yet another finer lock to update results
                results.emplace_back(0-val);
            }
            records[0-val]--;
        }
        else if (val > 0)
        {
            lock_guard<mutex> lock(mut[(val)%THREAD_N]);
            if (records[val] < 0)
            {
                lock_guard<mutex> l(mutrec);
                results.emplace_back(val);
            }
            records[val]++;
        }
        else
        {
            lock_guard<mutex> lock(mut[0]);
            bzero = !bzero;
            if (bzero) {
                lock_guard<mutex> l(mutrec);
                results.emplace_back(0);
            }
        }
    }
    vector<uint32_t> GetResult()
    {
        lock_guard<mutex> l(mutrec);
        return results;
    }
    void reset()
    {
        lock_guard<mutex> l(mutrec);
        results = vector<uint32_t>();
    }
};

void Display(Solution &s)
{
    auto v = s.GetResult();
    // for (auto &iter : v) {
    //     cout<<iter<<" ";
    // }
    cout<<v.size()<<"\n";
}

size_t SingleThread(Solution &s, const vector<int32_t> &vec)
{
    chrono::time_point<chrono::system_clock> start, stop;
    start = chrono::system_clock::now();
    s.InsertVal(vec);
    stop = chrono::system_clock::now();
    chrono::duration<double> elapse_time = stop - start;
    Display(s);
    s.reset();
    return chrono::duration_cast<chrono::nanoseconds>(elapse_time).count();
}

size_t CourseGrainLock(Solution &s, const vector<int32_t> &vec)
{
    chrono::time_point<chrono::system_clock> start, stop;
    vector<thread> vthreads;
    auto vsize = vec.size();
    start = chrono::system_clock::now();
    for (size_t iter = 0; iter < vsize; iter++) {
        vthreads.push_back(thread(&Solution::InsertValEach, &s, vec[iter]));
    }
    stop = chrono::system_clock::now();
    for (auto &th : vthreads) {
        th.join();
    }
    chrono::duration<double> elapse_time = stop - start;
    Display(s);
    s.reset();
    return chrono::duration_cast<chrono::nanoseconds>(elapse_time).count();
}

size_t FineGrainLock(Solution &s, const vector<int32_t> &vec)
{
    chrono::time_point<chrono::system_clock> start, stop;
    vector<thread> vthreads;
    auto vsize = vec.size();
    start = chrono::system_clock::now();
    for (size_t iter = 0; iter < vsize; iter++) {
        vthreads.push_back(thread(&Solution::InsertValEachFree, &s, vec[iter]));
    }
    stop = chrono::system_clock::now();
    for (auto &th : vthreads) {
        th.join();
    }
    chrono::duration<double> elapse_time = stop - start;
    Display(s);
    s.reset();
    return chrono::duration_cast<chrono::nanoseconds>(elapse_time).count();
}

int main(int argc, const char * argv[]) {
    vector<int32_t> vec;
    int count = 1000;
    while (count--)
    {
        vec.emplace_back(rand()%50);
        vec.emplace_back(0-(rand()%50));
    }
    Solution s;
    auto nanosec = SingleThread(s, vec);
    cout<<"Time of Execution (nano) Single Thread: "<<nanosec<<"\n";
    nanosec = CourseGrainLock(s, vec);
    cout<<"Time of Execution (nano) Course Grain: "<<nanosec<<"\n";
    nanosec = FineGrainLock(s, vec);
    cout<<"Time of Execution (nano) Fine Grain: "<<nanosec<<"\n";
    return 0;
}
Upvotes: 0
Views: 105
Reputation: 5635
The major issue I see with your code is that the majority of the work is done while holding the mutex. That completely blocks the other threads, so there is no benefit. Even the fine-grained version does only a very small calculation outside the mutex compared to the cost of updating the map and the output vector.
I'm also not totally convinced your fine-grained locking is thread safe. Indexing the map with operator[] creates a node for a value that hasn't been seen before, and that modification can invalidate any other thread's simultaneous search of the same map. You could use a separate map for each locked value range, I think.
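For illustration, here is a minimal sketch of that sharding idea; ShardedRecords, SHARDS, recs, and adjust are names I made up, and it assumes the headers and using namespace std from the question:

struct ShardedRecords {
    static const uint32_t SHARDS = 5;
    mutex mut[SHARDS];                              // one lock per shard
    unordered_map<uint32_t, int32_t> recs[SHARDS];  // one map per shard

    // Returns the count before the update. operator[] may insert a node,
    // but only into the shard whose lock this thread currently holds.
    int32_t adjust(uint32_t key, int32_t delta) {
        const uint32_t idx = key % SHARDS;
        lock_guard<mutex> lock(mut[idx]);
        int32_t before = recs[idx][key];
        recs[idx][key] += delta;
        return before;
    }
};

Because every key maps to exactly one shard, no thread can search a map that another thread is in the middle of modifying.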
But to be honest, I think you are just doing too little work in each thread. Try creating a smaller number of threads and have each one process a range of the input values, calling the existing code for each entry in that range. For example:
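Something along these lines, reusing the question's InsertValEach unchanged (RunChunked and the fallback thread count are mine):

void RunChunked(Solution &s, const vector<int32_t> &vec)
{
    unsigned nthreads = thread::hardware_concurrency();
    if (nthreads == 0) nthreads = 4;                 // fallback if unknown
    const size_t chunk = (vec.size() + nthreads - 1) / nthreads;
    vector<thread> pool;
    for (unsigned t = 0; t < nthreads; ++t) {
        const size_t begin = t * chunk;
        size_t end = begin + chunk;
        if (end > vec.size()) end = vec.size();
        if (begin >= end) break;
        pool.emplace_back([&s, &vec, begin, end] {
            for (size_t i = begin; i < end; ++i)
                s.InsertValEach(vec[i]);             // existing code, per entry
        });
    }
    for (auto &th : pool) th.join();
}

With your test input of 2000 elements, that replaces 2000 thread creations per run with a handful.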
Upvotes: 0
Reputation: 32727
You're creating one thread for each number in vec. There is a considerable cost in creating a thread. You should create a few threads (no more than the number of execution units in your hardware) and have each thread process multiple entries of the vector. main can process one chunk itself, thus avoiding the creation of one thread.
With the locking in CourseGrainLock (in InsertValEach), since the first thing each thread does is grab a lock that is not released until the function is done, your code is effectively single-threaded, just with the added cost of creating all those threads.
The locking in your FineGrainLock (in InsertValEachFree) is not much better. You have several locks, but all of the threads modify the same records map while holding different mutexes. Adding elements to an unordered map (which you do with records[val] or records[0-val]) is not thread safe, and you risk Undefined Behavior with that code.
A reasonable approach here is to have each thread keep track of its own results independently, thus avoiding the need for locks at all, and combine them into the main results once all the threads are done.
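A rough sketch of that pattern follows; CountPairsUnlocked and the chunking scheme are mine, not drop-in code, and the zero case is left out for brevity:

vector<uint32_t> CountPairsUnlocked(const vector<int32_t> &vec)
{
    unsigned nthreads = thread::hardware_concurrency();
    if (nthreads == 0) nthreads = 4;
    // One private map per thread: no locks needed while counting.
    vector<unordered_map<int32_t, int32_t>> partial(nthreads);
    vector<thread> pool;
    const size_t chunk = (vec.size() + nthreads - 1) / nthreads;
    for (unsigned t = 0; t < nthreads; ++t) {
        const size_t begin = t * chunk;
        size_t end = begin + chunk;
        if (end > vec.size()) end = vec.size();
        if (begin >= end) break;
        pool.emplace_back([&vec, &partial, t, begin, end] {
            for (size_t i = begin; i < end; ++i)
                ++partial[t][vec[i]];                // thread-local, race-free
        });
    }
    for (auto &th : pool) th.join();

    // Single-threaded merge of the per-thread counts.
    unordered_map<int32_t, int32_t> counts;
    for (const auto &m : partial)
        for (const auto &kv : m) counts[kv.first] += kv.second;

    // Pair each positive N with a matching -N.
    vector<uint32_t> out;
    for (const auto &kv : counts) {
        if (kv.first <= 0) continue;
        auto neg = counts.find(-kv.first);
        if (neg == counts.end()) continue;
        const int32_t pairs = kv.second < neg->second ? kv.second : neg->second;
        for (int32_t p = 0; p < pairs; ++p) out.push_back(kv.first);
    }
    return out;
}

On the question's example [1,-1,0,2,-3,3] this yields 1 and 3, though in unspecified order, since the final loop walks an unordered_map.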
Upvotes: 3
Reputation: 8995
You probably can't improve it with multithreading. All of the threads have to access the same shared input vector and result vector. The tremendous slowdown that you see vs. the single-threaded solution is the overhead of serializing the access to the shared structure.
Multithreading is not a panacea. If you need to do something like this to an array, "just do it" in a single thread.
Upvotes: 0