Reputation: 13
I am developing a project in which I have to model (arbitrary) computations that happen in pipeline. The pipeline is made of stages, each stage takes the input from the previous stage (except the first, who directly receives tasks from the pipeline object), makes a computation and sends the result to the next stage. Each stage is implemented with a separate thread of execution.
The pipeline should have a basic load balancing capability: if (after a while) it recognizes that the sum of the execution times of two consecutive stages is smaller than the execution time of the slowest stage, it "collapses" those two stages, that is it makes both of them run sequentially, using a single thread.
There are three classes in the project: classes Pipeline and Stage are obvious, while class TSOHeap (Thread-Safe Ordered heap) is the buffer used in input by each stage. It has a maximum size and the capability to give highest priority to special messages indicating that a Stage has to be collapsed.
My question is: why if I compile without optimizations the code runs smoothly (or at least does not block), while if I compile with optimizations ( -O2, -O3
) the program blocks? If I run the program with the debugger it blocks few times; if I run the program "normally" from terminal it blocks almost always.
The strange thing is that a thread blocks on a line in which there is a simple print. Before I added that print (for debugging purpose), the program blocked on the previous line, which is the guard of a while loop.
I guess the problem is related to synchronization among threads, but I don't know how to discover the faulty part. The only constant is that the program blocks after the method collapse_next_stage()
has been invoked, that is after a thread has been stopped.
Any suggestion would be appreciated, even general procedures to discover bugs like these. I report the code to run an example:
Class "TSOHeap.hpp":
#include <mutex>
#include <queue>
#include <vector>
#include <atomic>
#include <climits>
using namespace std;
template<typename T>
struct Comparator{
bool operator()(pair<T,int> p1, pair<T,int> p2){
return p1.second > p2.second;
}
};
//Thread-Safe Ordered Heap
template<typename T>
struct TSOHeap
{
TSOHeap(int _max=10):size{0},max{_max}{};
~TSOHeap(){}
void push(T* item, int id){
while(size==max);
{
lock_guard<mutex> lock(heap_mutex);
heap.push(pair<T*,int>(item, id));
size++;
}
}
pair<T*,int> pop(){
while(size==0);
{
lock_guard<mutex> lock(heap_mutex);
pair<T*,int> p = heap.top();
heap.pop();
size--;
return p;
}
}
priority_queue<pair<T*,int>, vector<pair<T*,int>>,Comparator<T*>> heap;
atomic<int> size;
int max;
mutex heap_mutex;
};
Class "Stage.hpp":
#include "TSOHeap.hpp"
#include <iostream>
#include <thread>
#include <vector>
#include <chrono>
#include <mutex>
using namespace std;;
struct IStage{
virtual void run() = 0;
virtual void wait_end() = 0;
virtual void stage_func() = 0;
virtual double get_exec_time() = 0;
virtual void reset_exec_time()=0;
virtual void add_next(IStage&)=0;
virtual IStage* get_next() = 0;
virtual void* get_input_ptr() = 0;
virtual void set_input(void*) = 0;
virtual void collapse() = 0;
virtual bool is_collapsed() = 0;
virtual void collapse_next_stage() = 0;
virtual int num_collapsed() = 0;
~IStage(){};
};
template <typename Tin, typename Tf, typename Tout>
struct Stage : IStage{
Stage(Tf function, int ind):fun{function}, input_ptr{new(TSOHeap<Tin>)},_end{false},
next{nullptr}, collapsed{0}, i{ind}, exec_time{0.0},count{0},collapsing{false},c{0}{};
~Stage(){delete input_ptr;}
void stage_func(){
Tin * input = input_ptr->pop().first;
if (input!=nullptr){
auto start = chrono::system_clock::now();
Tout out = fun(*input);
auto end = chrono::system_clock::now();
chrono::duration<double> diff = end-start;
set_exec_time(diff.count());
if (next!=nullptr)
next->set_input(new Tout(out));
}
else
_end = true;
}
void run_thread(){
while(!_end){
cout << "t " << i << ", r " << ++c << endl; // BLOCKS HERE
while(collapsing); //waiting that next stage finishes the remaining tasks
stage_func();
if(collapsed==1 && !_end)
next->stage_func();
}
if(collapsed!=-1){
IStage * nptr = next;
if(nptr!=nullptr && nptr->is_collapsed())
nptr = nptr->get_next();
if(nptr!=nullptr)
nptr->set_input(nullptr);
}
else{
while((input_ptr->size)>0)
stage_func();
}
}
void run()
{
thread _t(&Stage::run_thread, this);
t = move(_t);
return;
}
void wait_end()
{
t.join();
}
void set_input(void * iptr)
{
input_ptr->push(static_cast<Tin*>(iptr), ++count);
}
void* get_input_ptr()
{
return input_ptr;
}
void add_next(IStage &n)
{
next = &n;
output_ptr = static_cast<TSOHeap<Tout>*>(n.get_input_ptr());
}
void collapse()
{
collapsed=-1;
input_ptr->push(nullptr, INT_MIN);
// First condition is to avoid deadlock, in case this thread finished the execution in the meanwhile
while(!_end && (input_ptr->size) > 0);
}
bool is_collapsed()
{
return collapsed==-1;
}
void collapse_next_stage()
{
collapsing = true;
next->collapse();
collapsed++;
collapsing = false;
cout << "Stage # " << i << " has collapsed the successive Stage" << endl;
}
IStage* get_next()
{
return next;
}
double get_exec_time()
{
return exec_time;
}
void reset_exec_time()
{
set_exec_time(0.0);
}
void set_exec_time(double value)
{
lock_guard<mutex> lock(et_mutex);
exec_time = value;
}
int num_collapsed()
{
return collapsed;
}
Tf fun;
TSOHeap<Tin> * input_ptr;
bool _end;
IStage * next;
int collapsed;
int const i;
double exec_time;
int count;
mutex et_mutex;
bool collapsing;
int c;
TSOHeap<Tout> * output_ptr;
thread t;
};
Class "Pipe.hpp":
#include "Stage.hpp"
#include <list>
#include <thread>
#include <algorithm>
using namespace std;;
template <typename Tin, typename Tout>
struct Pipe{
Pipe(list<IStage*>li, int n_samples=10):slowest{-1},end{false},num_samples{n_samples}
{
for(auto& s:li)
add_node(s);
}
void add_node(IStage* sptr)
{
if(!nodes.empty())
nodes.back()->add_next(*sptr);
nodes.push_back(sptr);
}
void set_input(void * in_ptr)
{
nodes.front()->set_input(in_ptr);
}
int num_nodes()
{
return nodes.size();
}
void run()
{
for(auto &x: nodes)
x->run();
}
void run(list<Tin>&& input)
{
thread t(&Pipe::run_manager, this, ref(input));
while(!end)
monitor_times();
t.join();
}
void run_manager(list<Tin>& input)
{
run();
for(auto& x:input)
set_input(&x);
set_input(nullptr);
end=true;
for(auto& s : nodes)
s->wait_end();
}
void monitor_times()
{ // initialization phase
vector<int> count;
vector<double> avg;
vector<priority_queue<pair<double,int>, vector<pair<double,int>>,Comparator<double>>> measures;
for(auto& x : nodes){
count.push_back(0);
avg.push_back(0);
measures.push_back(priority_queue<pair<double,int>,
vector<pair<double,int>>,Comparator<double>>());
}
while(!end){
// monitoring phase
for(int i=0; i<nodes.size(); i++){
if(nodes[i]->get_exec_time()!=0){
pair<double,int> measure = pair<double,int>(nodes[i]->get_exec_time(),++count[i]);
nodes[i]->reset_exec_time();
measures[i].push(measure);
if(count[i]<=num_samples){
avg[i] = (avg[i]*(count[i]-1) + measure.first) / count[i];
}
else
{
double old = measures[i].top().first;
// the ordering of the heap guarantees that I drop the oldest measure
measures[i].pop();
avg[i] = (avg[i] * num_samples - old + measure.first) / num_samples;
}
}
}
// updating phase
if(is_steady_state(count)){
int slowest = get_slowest_stage(avg);
for(int i=0; i<nodes.size()-1; i++){
if(avg[i]+avg[i+1]<avg[slowest]){
if(nodes[i]->num_collapsed()==0 && nodes[i+1]->num_collapsed()==0){
nodes[i]->collapse_next_stage();
break;
}
}
}
}
}
}
bool is_steady_state(vector<int>& count){
for(auto& c: count){
if(c < num_samples) return false;
}
return true;
}
int get_slowest_stage(vector<double>& avg){
double max = 0.0;
int index = -1;
for(int i=0; i<avg.size(); i++){
if(avg[i]>max){
max=avg[i];
index = i;
}
}
return index;
}
int slowest;
bool end;
int num_samples;
vector<IStage*> nodes;
};
Class "main.cpp":
#include<iostream>
#include<functional>
#include <chrono>
#include<cmath>
#include "Pipe.hpp"
using namespace std;;
auto f = [](int x){
int c = 0;
for(int i=0; i<300; i++)
c=sin(i);
return x;
};
auto fast = [] (int x) {return x;};
auto fast_init = [](int x){
if(x < 5)
return x;
int c=0;
for(int i=0; i<300; i++)
c=sin(i);
return x;
};
auto print = [] (int x) {
cout << "Result: " << x << " " << endl;
return x;
};
int main(int argc, char* argv[])
{
auto print_usage_msg = [&](){
cout << "Usage: " << argv[0] << " <func_type> \n" <<
"<func_type> = \n"
" 0 to have 2 consecutive stages running the fast function\n"
" 1 to have 2 consecutive stages running the fast function "
"but after a short time reaching steady state " << endl;
};
if(argc!=2){
print_usage_msg();
return 1;
}
int fun_code = atoi(argv[1]);
if (fun_code!=0 && fun_code!=1){
print_usage_msg();
return 1;
}
Stage<int,function<int(int)>,int> s1{f,1};
Stage<int,function<int(int)>,int> s2{f,2};
Stage<int,function<int(int)>,int> s3{f,3};
Stage<int,function<int(int)>,int> s4{f,4};
Stage<int,function<int(int)>,int> s5{f,5};
Stage<int,function<int(int)>,int> s6{f,6};
Stage<int,function<int(int)>,int> s7{f,7};
Stage<int,function<int(int)>,int> sp{print,8};
if(fun_code==0){
s2.fun = fast;
s3.fun = fast;
}
else{
s2.fun = fast_init;
s3.fun = fast_init;
}
Pipe<int,int> p ({&s1, &s2, &s3, &s4, &s5, &s6, &s7, &sp});
cout << "Pipe length: " << p.num_nodes() << endl;
list<int> li {};
for(int i=0; i<100; i++)
li.push_back(i);
p.run(move(li));
return 0;
}
Compile with:
g++ main.cpp -std=c++11 -pthread -O3 -o gpipe -g
Run with :
./gpipe 1
Thanks for any help!
Upvotes: 1
Views: 86
Reputation: 1702
Imagine the following code for a single-threaded program:
void func()
{
bool a = true;
while(a)
{
// busy wait...
}
}
Will this function ever return? Obviously not. If you were a compiler, how would you write optimized code for this?
1: NOP
2: GOTO 1
This is exactly what you're doing with this bit of code. Twice.
while(!_end){ // here #1
cout << "t " << i << ", r " << ++c << endl;
while(collapsing) // here #2
; // for the love of God, move your semicolon here or use braces
stage_func();
if(collapsed==1 && !_end)
next->stage_func();
}
Your compiler has absolutely no obligation to realize that you're doing multi-threading programming. (It's your job to tell it)
The compiler needs to know not to perform optimizations on _end
and collapsed
. DO NOT USE volatile
. Why? volatile
will keep the compiler from optimizing a variable, but... heh heh... the CPU can also potentially optimize away your writes to _end
and collapsed
from different threads (by keeping them in its cache and not writing to main memory). Compilers and CPU's will also re-order your instructions, which can cause similar problems.
Memory fences (aka memory barriers) can be used to instruct the CPU to do things like push out pending writes or re-update its cached value for reading. They also give guidelines for command re-ordering. AFAIK the std::atomic_thread_fence will prevent compiler reordering but I've read conflicting things about this...
By far the simplest, most-pragmatic, and easiest-to-prove-correct thing to do is just to switch all your inter-thread communicating variables to std::atomic<> types, which incorporate memory barriers. So
std::atomic<bool> _end;
std::atomic<int> collapsed;
As a general rule, any data that is shared between threads should be protected by a mutex OR be an std::atomic<> if race conditions are not an issue (as you are doing with the simple signaling). You can break this rule if you really know what you're doing and really know the architecture, compiler, and standard implementation really well, but that's a tall order even for an expert.
By the way, a mutex's lock and unlock operation both incorporate a memory barrier, in case you were worried about that. So when you get a pointer from the TSOHeap, that's fine (assuming your TSOHeap implementation is correct...I didn't look at it).
Upvotes: 1
Reputation: 136306
You have race conditions in TSOHeap
when using size
. While size
is atomic, it is a part of larger state that is not atomic, so that changes in size
are not synchronized with changes to the rest of the state.
Make size
non-atomic and access it only when the mutex is locked. Add condition variables to notify threads waiting in push
and pop
.
Alternatively, remove size
entirely. Example:
template<typename T>
struct TSOHeap
{
TSOHeap(size_t _max=10): max{_max}{}
void push(T* item, int id){
unique_lock<mutex> lock(heap_mutex);
while(heap.size() == max)
cnd_pop.wait(lock);
heap.push(pair<T*,int>(item, id));
cnd_push.notify_one();
}
pair<T*,int> pop() {
pair<T*,int> result = {};
{
unique_lock<mutex> lock(heap_mutex);
while(heap.empty())
cnd_push.wait(lock);
bool notify = heap.size() == max;
result = heap.top();
heap.pop();
if(notify)
cnd_pop.notify_one();
}
return result;
}
mutex heap_mutex;
condition_variable cnd_push, cnd_pop;
priority_queue<pair<T*,int>, vector<pair<T*,int>>,Comparator<T*>> heap;
size_t const max;
};
Upvotes: 1