Reputation: 3598
Hardware: Darwin Kernel Version 13.2.0: Thu Apr 17 23:03:13 PDT 2014; root:xnu-2422.100.13~1/RELEASE_X86_64 x86_64
atomics.hpp

#ifndef ATOMIC_UTILS_H
#define ATOMIC_UTILS_H

#include <cstddef>

#define BARRIER() __asm__ volatile ( "": : :"memory" )

#define CPU_RELAX() __asm__ volatile( "pause\n\t": : :"memory" )

#define STORE_FENCE() __asm__ volatile( "mfence" ::: "memory" )

class AtomicUtils
{
public:

    /**
     * Check if the value at addr is equal to oldval; if so, replace it with
     * newval. Returns the value previously stored at addr.
     */
    inline static size_t compareAndExchange( volatile size_t* addr, size_t oldval, size_t newval )
    {
        size_t ret;
        __asm__ volatile( "lock cmpxchgq %2, %1\n\t"
                          : "=a"(ret), "+m"(*addr)
                          : "r"(newval), "0"(oldval)
                          : "memory" );
        return ret;
    }

    /**
     * Atomically stores x into addr and returns the value previously
     * stored in addr.
     */
    inline static size_t loadAndStore( size_t x, volatile size_t* addr )
    {
        size_t ret;
        __asm__ volatile( "lock xchgq %1, %0\n\t"
                          : "+m"(*addr), "=r"(ret)
                          : "1"(x) );
        return ret;
    }

};

#endif
mcs.hpp

#ifndef MCS_LOCK_H
#define MCS_LOCK_H

#include "atomics.hpp"
#include <boost/thread/tss.hpp>

class MCSLock
{
    struct mcs_lock_t
    {
        mcs_lock_t() : next(0), locked(false) {}
        struct mcs_lock_t* next;
        bool locked;
    };

public:
    typedef struct mcs_lock_t mcs_lock;

private:
    mcs_lock** tail;
    static boost::thread_specific_ptr<mcs_lock> tls_node;

public:
    MCSLock( mcs_lock** lock_tail ) : tail( lock_tail )
    {
        if( tls_node.get() == 0 )
            tls_node.reset( new mcs_lock() );
    }

    void lock()
    {
        mcs_lock* thread_node = tls_node.get();
        thread_node->next = 0;
        thread_node->locked = true;

        volatile mcs_lock* pred = reinterpret_cast<volatile mcs_lock*>(
            AtomicUtils::loadAndStore(
                reinterpret_cast<size_t>( thread_node ),
                reinterpret_cast<volatile size_t*>( tail )
            )
        );
        if( pred != 0 )
        {
            pred->next = *tail;

            STORE_FENCE();
            //BARRIER(); // Required to prevent reordering between pred->next = *tail and the read of thread_node->locked ( WR hazard ).

            // Spin on a local variable. Someone unlock me plz !!
            while( thread_node->locked )
                CPU_RELAX();
        }
    }

    void unlock()
    {
        mcs_lock* thread_node = tls_node.get();
        if( thread_node->next == 0 )
        {
            // If the CAS succeeds, no other thread has requested the lock,
            // so there is nothing to hand over.
            if(
                AtomicUtils::compareAndExchange(
                    reinterpret_cast<volatile size_t*>( tail ),
                    reinterpret_cast<size_t>( thread_node ),
                    0
                ) == reinterpret_cast<size_t>( thread_node )
            )
            {
                return;
            }

            // The CAS failed: a new thread has requested the lock but has
            // not linked itself in yet. Wait for it, then release it.
            while( thread_node->next == 0 )
                CPU_RELAX();
        }

        thread_node->next->locked = false;
    }
};

boost::thread_specific_ptr<MCSLock::mcs_lock> MCSLock::tls_node;
#endif
mcs_test.cpp
#include "mcs.hpp"
#include <iostream>
#include <pthread.h>
#include <vector>

#define NUM_THREADS 16
#define NUM_ITERATIONS 100

std::vector<int> elements;
MCSLock::mcs_lock *tail = 0;

void* thread_run( void* data )
{
    MCSLock lock( &tail );
    for( int i = 0; i < NUM_ITERATIONS; ++i )
    {
        lock.lock();
        elements.push_back( i );
        lock.unlock();
    }

    return 0;
}

int main()
{
    pthread_t threads[ NUM_THREADS ];
    elements.reserve( NUM_THREADS * NUM_ITERATIONS );

    for( int i = 0; i < NUM_THREADS; ++i )
        pthread_create( &threads[i], NULL, thread_run, NULL );

    for( int i = 0; i < NUM_THREADS; ++i )
        pthread_join( threads[i], NULL );

    std::cout << "\nExiting main thread" << std::endl;
}
The above code is compiled using clang
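For reference, the build command would be something like the following (the exact boost library name is my assumption; on some systems it is boost_thread-mt):

    clang++ mcs_test.cpp -o mcs_test -lboost_thread -lpthread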
Problem:
I see that 1 or 2 threads are stuck in the spin loop in lock() ( while( thread_node->locked ) ). Apart from the main thread, no threads other than the ones stuck in lock() are alive. This means that when the other threads invoked unlock(), they somehow exited without setting locked = false on the waiting threads' nodes.
Any pointers on debugging this, please?
I have been stuck on this for many hours with no clues.
Upvotes: 1
Views: 481
Reputation: 7483
First, doesn't clang have builtins for these inline-asm blocks (like gcc's __sync_val_compare_and_swap)? Why re-invent the wheel?
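For instance, the whole of atomics.hpp could be expressed with the legacy __sync builtins, which both gcc and clang support (a sketch, assuming x86-64, where __sync_lock_test_and_set is a full atomic exchange):

    // Sketch: AtomicUtils expressed with compiler builtins instead of inline asm.
    class AtomicUtils
    {
    public:
        inline static size_t compareAndExchange( volatile size_t* addr, size_t oldval, size_t newval )
        {
            // Returns the value previously stored at addr, like the cmpxchgq version.
            return __sync_val_compare_and_swap( addr, oldval, newval );
        }

        inline static size_t loadAndStore( size_t x, volatile size_t* addr )
        {
            // On x86 this compiles to an atomic exchange. The builtin is
            // documented as an acquire barrier only, hence the explicit
            // full barrier afterwards.
            size_t ret = __sync_lock_test_and_set( addr, x );
            __sync_synchronize();
            return ret;
        }
    };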
Second, I'd really think about adding the memory clobber to loadAndStore. You need to make sure that any writes the compiler is holding in registers get flushed to memory before doing the xchgq. Similarly, it will prevent gcc from moving memory reads to before the xchgq. Either would be bad.
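Concretely, that is the same asm with one more clobber (sketch):

    inline static size_t loadAndStore( size_t x, volatile size_t* addr )
    {
        size_t ret;
        // The "memory" clobber forces the compiler to flush writes it is
        // holding in registers before the xchgq, and to discard any reads
        // it has cached across it.
        __asm__ volatile( "lock xchgq %1, %0\n\t"
                          : "+m"(*addr), "=r"(ret)
                          : "1"(x)
                          : "memory" );
        return ret;
    }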
Third, I'd examine the asm output for your while loops (on thread_node->locked and thread_node->next). Since these variables are not volatile, gcc may optimize the loops to perform the read only once.
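If the generated asm shows the loads being hoisted out of the loops, marking the fields volatile in mcs_lock_t forces a fresh read on every iteration (a sketch against the struct as posted):

    struct mcs_lock_t
    {
        mcs_lock_t() : next(0), locked(false) {}
        struct mcs_lock_t* volatile next;  // spun on in unlock()
        volatile bool locked;              // spun on in lock()
    };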
These may not solve your problem, but that's where I'd start.
Upvotes: 1