pzk
pzk

Reputation: 19

Execution of code which is locked should take more time than its unlocked counterpart

I am trying to experiment with python threading and locking. So, I created 2 classes. Both of these classes use threads to increment and decrement a class level variable 'ref'.

In ThreadUnsafeClass, I am not using lock before incrementing and decrementing.

In ThreadSafeClass, I am using lock before incrementing and decrementing.

My assumption was that, since the lock forces some threads to wait, the ThreadSafeClass case should take more time.

The results point to ThreadSafeClass being faster.

This is my code (python 2.7)

import threading
import time


# NOTE(review): the indentation below was lost in transcription — these defs are
# logically methods of ThreadUnsafeClass (compare the correctly indented copy in
# the answer), but as pasted they sit at module level.
class ThreadUnsafeClass(object):
# Class-level counter shared by every thread; mutated with NO locking.
ref = 0

def __init__(self, count_tot=10000):
    self.all_threads = []          # threads launched by compute_ref_value
    self.count_tot = count_tot     # iterations per incrementer/decrementer thread
    ThreadUnsafeClass.ref = 0      # reset the shared counter for this run

def inc_ref(self):
    # Increment the shared counter count_tot times without synchronization.
    time.sleep(0.1)                # presumably lets the sibling threads start first — confirm
    for i in xrange(0, self.count_tot):
        ThreadUnsafeClass.ref += 1   # unsynchronized read-modify-write; races with dec_ref

def dec_ref(self):
    # Decrement counterpart of inc_ref, equally unsynchronized.
    time.sleep(0.1)
    for i in xrange(0, self.count_tot):
        ThreadUnsafeClass.ref -= 1


def compute_ref_value(self):
    # Launch 50 incrementer/decrementer pairs (100 threads), join them all,
    # print the elapsed wall-clock time, and return the final — racy — counter.
    start_time = time.time()
    for i in xrange(0, 50):
        t1 = threading.Thread(target=self.inc_ref, args=())
        t2 = threading.Thread(target=self.dec_ref, args=())
        t1.start()
        t2.start()
        self.all_threads.append(t1)
        self.all_threads.append(t2)

    for t in self.all_threads:
        t.join()

    print time.time() - start_time, " -> ",

    return ThreadUnsafeClass.ref

# NOTE(review): indentation lost in transcription here as well — the defs below
# are logically methods of ThreadSafeClass.
class ThreadSafeClass(object):
# NOTE(review): this attribute is never read or written — every method below
# operates on ThreadUnsafeClass.ref instead (likely a copy-paste slip).
ref = 0

def __init__(self, count_tot=10000):
    self.all_threads = []          # threads launched by compute_ref_value
    self.count_tot = count_tot     # iterations per incrementer/decrementer thread
    ThreadUnsafeClass.ref = 0      # resets the OTHER class's counter (see note above)
    self.lock = threading.Lock()   # guards the whole inc/dec loop, not each step

def inc_ref(self):
    # Increment the shared counter; the lock is held around the ENTIRE loop,
    # so each thread runs its 100k iterations with exclusive access.
    time.sleep(0.1)
    self.lock.acquire()
    for i in xrange(0, self.count_tot):
        ThreadUnsafeClass.ref += 1
    self.lock.release()

def dec_ref(self):
    # Decrement counterpart of inc_ref; same whole-loop locking.
    time.sleep(0.1)
    self.lock.acquire()
    for i in xrange(0, self.count_tot):
        ThreadUnsafeClass.ref -= 1
    self.lock.release()


def compute_ref_value(self):
    # Launch 50 incrementer/decrementer pairs, join them all, print the
    # elapsed wall-clock time, and return the final counter (0 when correct).
    start_time = time.time()
    for i in xrange(0, 50):
        t1 = threading.Thread(target=self.inc_ref, args=())
        t2 = threading.Thread(target=self.dec_ref, args=())
        t1.start()
        t2.start()
        self.all_threads.append(t1)
        self.all_threads.append(t2)

    for t in self.all_threads:
        t.join()

    print time.time() - start_time, " -> ",

    return ThreadUnsafeClass.ref

thread_unsafe_class = ThreadUnsafeClass(100000)
print "Value from un-safe threading ", 
thread_unsafe_class.compute_ref_value()

thread_safe_class = ThreadSafeClass(100000)
print "Value from safe threading ", thread_safe_class.compute_ref_value()

Here are my results:

Value from un-safe threading 3.54868483543 -> 30653

Value from safe threading 2.28372502327 -> 0

Please help me understand why the locking version is faster!

Upvotes: 0

Views: 122

Answers (1)

B. Morris
B. Morris

Reputation: 640

I believe the answer is that, by locking the way your code does, you are actually avoiding thread and cache thrashing, which makes it faster: the loop in each thread can complete without any other hardware resource contention. It's not really an apples-to-apples comparison, but if you move the lock inside the loop instead of around it:

def inc_ref(self):
    """Increment the shared counter, acquiring the lock for each single step."""
    time.sleep(0.1)
    for _ in xrange(self.count_tot):
        # Lock held only around the one read-modify-write, then released.
        self.lock.acquire()
        ThreadUnsafeClass.ref += 1
        self.lock.release()

def dec_ref(self):
    """Decrement the shared counter, acquiring the lock for each single step."""
    time.sleep(0.1)
    for _ in xrange(self.count_tot):
        # Lock held only around the one read-modify-write, then released.
        self.lock.acquire()
        ThreadUnsafeClass.ref -= 1
        self.lock.release()

I found the execution time to increase dramatically (as you had expected).

To further test this theory, I took your code and added some more detailed timing to capture exactly how much time was taken by the increment/decrement operations vs the locking:

import threading
import time
import operator

class ThreadUnsafeClass(object):
    # Shared class-level counter mutated by every thread without locking.
    ref = 0

    def __init__(self, count_tot=10000):
        """Prepare a run performing count_tot increments/decrements per thread."""
        self.all_threads = []
        self.count_tot = count_tot
        ThreadUnsafeClass.ref = 0

    def inc_ref(self, ndx):
        """Increment the counter count_tot times, timing only the += statements.

        ndx is this thread's slot in self.op_times.
        """
        time.sleep(0.1)  # presumably lets the sibling threads start first — confirm
        ref_time = 0
        for i in xrange(0, self.count_tot):
            op_start = time.time()
            ThreadUnsafeClass.ref += 1   # unsynchronized read-modify-write
            ref_time += time.time() - op_start
        self.op_times[ndx] = ref_time    # cumulative time spent inside += only

    def dec_ref(self, ndx):
        """Decrement counterpart of inc_ref with the same timing bookkeeping."""
        time.sleep(0.1)
        ref_time = 0
        for i in xrange(0, self.count_tot):
            op_start = time.time()
            ThreadUnsafeClass.ref -= 1
            ref_time += time.time() - op_start
        self.op_times[ndx] = ref_time


    def compute_ref_value(self):
        """Run 50 inc/dec thread pairs; print wall time and summed op time.

        Returns the final (racy) counter value.
        """
        start_time = time.time()
        self.op_times = [0]*100          # one slot per thread: 50 pairs = 100 threads
        for i in xrange(0, 50):
            t1 = threading.Thread(target=self.inc_ref, args=(i*2,))
            t2 = threading.Thread(target=self.dec_ref, args=(i*2+1,))
            t1.start()
            t2.start()
            self.all_threads.append(t1)
            self.all_threads.append(t2)

        for t in self.all_threads:
            t.join()

        # Total time spent inside the += / -= statements across all threads.
        op_total = reduce(operator.add, self.op_times)

        print time.time() - start_time, op_total, " -> ",

        return ThreadUnsafeClass.ref

class ThreadSafeClass(object):
    # NOTE(review): never read or written — all methods below use
    # ThreadUnsafeClass.ref instead (kept as in the question's code).
    ref = 0

    def __init__(self, count_tot=10000):
        """Prepare a locked run of count_tot increments/decrements per thread."""
        self.all_threads = []
        self.count_tot = count_tot
        ThreadUnsafeClass.ref = 0        # resets the OTHER class's counter (see note)
        self.lock = threading.Lock()     # held around each thread's whole loop

    def inc_ref(self, ndx):
        """Increment under the lock, timing lock acquisition and the += ops.

        ndx is this thread's slot in self.op_times / self.lock_times.
        """
        time.sleep(0.1)
        lock_start = time.time()
        self.lock.acquire()
        lock_time = time.time() - lock_start   # how long this thread waited for the lock
        ref_time = 0
        for i in xrange(0, self.count_tot):
            op_start = time.time()
            ThreadUnsafeClass.ref += 1
            ref_time += time.time() - op_start
        self.lock.release()
        self.op_times[ndx] = ref_time
        self.lock_times[ndx] = lock_time

    def dec_ref(self, ndx):
        """Decrement counterpart of inc_ref with identical timing bookkeeping."""
        time.sleep(0.1)
        lock_start = time.time()
        self.lock.acquire()
        lock_time = time.time() - lock_start
        ref_time = 0
        for i in xrange(0, self.count_tot):
            op_start = time.time()
            ThreadUnsafeClass.ref -= 1
            ref_time += time.time() - op_start
        self.lock.release()
        self.op_times[ndx] = ref_time
        self.lock_times[ndx] = lock_time


    def compute_ref_value(self):
        """Run 50 inc/dec thread pairs; print wall time, op time, lock-wait time.

        Returns the final counter value (0 when the locking is correct).
        """
        start_time = time.time()
        self.op_times = [0]*100          # per-thread time inside += / -=
        self.lock_times = [0]*100        # per-thread time blocked on acquire()
        for i in xrange(0, 50):
            t1 = threading.Thread(target=self.inc_ref, args=(i*2,))
            t2 = threading.Thread(target=self.dec_ref, args=(i*2+1,))
            t1.start()
            t2.start()
            self.all_threads.append(t1)
            self.all_threads.append(t2)

        for t in self.all_threads:
            t.join()

        op_total = reduce(operator.add, self.op_times)
        lock_total = reduce(operator.add, self.lock_times)

        print time.time() - start_time, op_total, lock_total, " -> ",

        return ThreadUnsafeClass.ref

thread_unsafe_class = ThreadUnsafeClass(100000)
print "Value from un-safe threading ", thread_unsafe_class.compute_ref_value()

thread_safe_class = ThreadSafeClass(100000)
print "Value from safe threading ", thread_safe_class.compute_ref_value()

The output was:

Value from un-safe threading  6.93944501877 297.449399471  ->  13057
Value from safe threading  4.08318996429 2.6313662529 197.359120131  ->  0

Showing that the cumulative time just for the increment and decrement (across all threads) was almost 300 seconds with the no-locking case, but less than 3 seconds for the locking case. The locking case did take almost 200 (cumulative) seconds to acquire the lock for all threads, but the total time of locking and increment/decrement is still less in this case.

The thrashing occurs because when you have shared memory that is being accessed by multiple threads running on multiple CPUs (as almost every system has these days), the hardware has to coordinate access to that shared memory between each CPU, and when you have numerous repeated accesses to the same memory (or memory within the same cache line) simultaneously from different sources, the CPUs end up spending a nontrivial amount of time waiting for each other.

When you introduce locking, you're spending time waiting for the lock, but within the lock each thread/CPU has exclusive access to the shared memory so there is no additional overhead to coordinate simultaneous accesses from multiple CPUs.

Upvotes: 1

Related Questions