DilithiumMatrix

Reputation: 18677

Why does my parallel performance top out?

I've been playing around with Python a lot lately, and in comparing numerous parallelization packages I noticed that the performance increase from serial to parallel seems to top out at 6 processes rather than 8, the number of cores my MacBook Pro (OS X 10.8.2) has.

The attached plot compares the timing of different tasks as a function of the number of processes (parallel or sequential). This example uses the built-in Python 'multiprocessing' package. 'Memory' vs. 'Processor' refers to memory-intensive functions (which just allocate large arrays) vs. computationally intensive functions (which perform many operations).

What is the cause of the top-out below 8 processes?

[Plot: average time per call vs. number of processes, for the serial and parallel memory and processor tests]

(The times are averaged over 100 function calls for each number of processes.)

import multiprocessing as mp
import time
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt

iters       = 100
mem_num     = 1000
pro_num     = 20000
max_procs   = 10

line_width  = 2.0
legend_size = 10
fig_name    = 'timing.pdf'

def UseMemory(num):
    """Memory-intensive task: allocate several large arrays, then discard them."""
    test1 = np.zeros([num, num])
    test2 = np.arange(num*num)
    test3 = np.array(test2).reshape([num, num])
    test4 = np.empty(num, dtype=object)
    return

def UseProcessor(num):
    """CPU-intensive task: array math plus a deliberately slow element-wise copy."""
    test1 = np.arange(num)
    test1 = np.cos(test1)
    test1 = np.sqrt(np.fabs(test1))
    test2 = np.zeros(num)
    for i in range(num): test2[i] = test1[i]    # explicit Python loop to keep the CPU busy
    return np.std(test2)

def MemJob(its):
    for ii in range(its): UseMemory(mem_num)

def ProJob(its):
    for ii in range(its): UseProcessor(pro_num)    # use the 'its' argument, not the global 'iters'


if __name__ == "__main__":

    print('\nParTest\n')

    proc_range = np.arange(1,max_procs+1,step=1)

    test_times = np.zeros([len(proc_range),2,2])                 # test_times[num_procs][0-ser,1-par][0-mem,1-pro]
    tot_times  = np.zeros([len(proc_range),2  ])                 #  tot_times[num_procs][0-ser,1-par]

    print(' Testing %2d numbers of processors between [%d,%d]' % (len(proc_range), 1, max_procs))
    print(' Iterations %d, Memory Length %d, Processor Length %d' % (iters, mem_num, pro_num))

    for it in range(len(proc_range)):
        procs = proc_range[it]
        job_arg = procs*[iters]
        print('\n - %2d, Processes = %3d' % (it, procs))

        # --- Test Serial ---
        print(' - - Serial')
        # Test Memory
        all_start = time.time()
        start = time.time()
        list(map(MemJob, job_arg))    # list() forces the lazy map; same job list as the parallel test
        ser_mem_time = time.time() - start

        # Test Processor
        start = time.time()
        list(map(ProJob, job_arg))    # list() forces the lazy map to actually run
        ser_pro_time = time.time() - start

        ser_time = time.time() - all_start

        # --- Test Parallel : multiprocessing ---
        print(' - - Parallel: multiprocessing')
        pool = mp.Pool(processes=procs)
        # Test Memory
        all_start = time.time()
        start = time.time()
        pool.map(MemJob, job_arg)
        par_mem_time = time.time() - start

        # Test Processor
        start = time.time()
        pool.map(ProJob, job_arg)
        par_pro_time = time.time() - start

        par_time = time.time() - all_start

        pool.close()    # shut down the workers before the next iteration creates a new pool
        pool.join()

        print(' - - Collecting')
        ser_mem_time /= procs
        ser_pro_time /= procs
        par_mem_time /= procs
        par_pro_time /= procs
        ser_time     /= procs
        par_time     /= procs

        test_times[it][0] = [ ser_mem_time, ser_pro_time ]
        test_times[it][1] = [ par_mem_time, par_pro_time ]
        tot_times[it]     = [ ser_time    , par_time     ]



    fig = plt.figure()
    ax  = fig.add_subplot(111)
    ax.set_xlabel('Number of Processes')
    ax.set_ylabel('Time [s]')
    ax.xaxis.grid(True)
    ax.yaxis.grid(True)
    lines = []
    names = []

    curves = [ (test_times[:,0,0], 'Serial Memory'),
               (test_times[:,0,1], 'Serial Processor'),
               (tot_times[:,0],    'Serial'),
               (test_times[:,1,0], 'Parallel Memory'),
               (test_times[:,1,1], 'Parallel Processor'),
               (tot_times[:,1],    'Parallel') ]

    for data, name in curves:
        l1, = ax.plot(proc_range, data, linewidth=line_width)
        lines.append(l1)
        names.append(name)

    plt.legend(lines, names, ncol=2, prop={'size':legend_size}, fancybox=True, shadow=True, bbox_to_anchor=(1.10, 1.10))
    fig.savefig(fig_name,dpi=fig.get_dpi())
    print(' - Saved to', fig_name)
    plt.show(block=True)

Upvotes: 2

Views: 191

Answers (1)

DMH

Reputation: 3975

From the discussion above I think you have the information you need, but I'm adding an answer to collect the facts in case it benefits others (and because I wanted to work it through myself). Due credit to @bamboon, who mentioned some of this first.

First, your MacBook has a CPU with four physical cores, but the chip is designed so that each core's hardware can run two threads. This is called "simultaneous multithreading" (SMT), embodied in this case by Intel's Hyper-Threading feature. So taken all together you have 8 "virtual cores" (4 cores × 2 threads each).

Note that the OS treats all the virtual cores the same, i.e. it does not distinguish between the two SMT threads offered by a physical core, which is why sysctl (hw.ncpu) returns 8 when you query it. Python will do the same thing:

>>> import multiprocessing
>>> multiprocessing.cpu_count()
8
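(If you want to distinguish physical from logical cores in Python, the third-party psutil package can do it; this is an illustrative aside assuming psutil is installed, not something multiprocessing itself provides.)

>>> import psutil
>>> psutil.cpu_count(logical=True)     # logical (SMT) cores
8
>>> psutil.cpu_count(logical=False)    # physical cores
4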

Second, the speedup limit you're encountering is a well-known phenomenon in which parallel performance saturates and does not improve with the addition of more processors working on the problem. This effect is described by Amdahl's Law, a quantitative statement about how much speedup to expect from multiple processors depending on how much code can be parallelized and how much runs serially.
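To make that concrete, here is a small illustrative sketch (mine, not from the original discussion) evaluating Amdahl's Law, S(N) = 1 / ((1 - p) + p/N), where p is the parallelizable fraction of the runtime and N is the number of processors:

def amdahl_speedup(p, n):
    """Ideal speedup from Amdahl's Law for parallel fraction p on n processors."""
    return 1.0 / ((1.0 - p) + p/n)

for p in (0.50, 0.90, 0.95):
    curve = ', '.join('%.2f' % amdahl_speedup(p, n) for n in range(1, 9))
    print('p = %.2f -> %s' % (p, curve))

Even with 90% of the work parallelized, 8 processors give only about a 4.7x speedup, and the curve flattens well before N reaches the processor count, which is qualitatively the saturation your plot shows.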

Typically a number of factors limit relative speedup, including details of the OS and even the computer's architecture (e.g. how SMT works in a hardware core), so that even if you parallelize as much of your code as you can, your performance will not scale indefinitely. Understanding where the serial bottleneck is can require very detailed analysis of your program and its running environment.
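For instance, Python's built-in cProfile module is a common first step for seeing where the time actually goes; a hypothetical invocation against one of the question's job functions might look like this:

import cProfile
import pstats

cProfile.run('ProJob(100)', 'projob.prof')    # profile one serial job (assumes ProJob is defined in this namespace)
pstats.Stats('projob.prof').sort_stats('cumulative').print_stats(10)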

You can find a good example with discussion in this question.

I hope this helps.

Upvotes: 1
