William

Reputation: 4030

What is the best way to run multiple tasks in parallel in Python?

I have a function:

import time

def all_400k():
    for _ in range(400000):
        print('validate')
        print('parsing')
        print('inserting')

if __name__ == '__main__':
    start_time = time.time()
    all_400k()
    print(f'used time:{time.time()-start_time}')

The output is:

used time:9.545064210891724

Because this is the same work repeated 400k times, I want four functions running in parallel, each handling 100k iterations; ideally this would be four times faster.

So I first tried multithreading:

import threading
import time

def first_100k():
    for _ in range(100000):
        print('validate')
        print('parsing')
        print('inserting')

def second_100k():
    for _ in range(100000):
        print('validate')
        print('parsing')
        print('inserting')

def third_100k():
    for _ in range(100000):
        print('validate')
        print('parsing')
        print('inserting')

def fourth_100k():
    for _ in range(100000):
        print('validate')
        print('parsing')
        print('inserting')

if __name__ == '__main__':
    start_time = time.time()
    thread1 = threading.Thread(target=first_100k)
    thread2 = threading.Thread(target=second_100k)
    thread3 = threading.Thread(target=third_100k)
    thread4 = threading.Thread(target=fourth_100k)

    thread1.start()
    thread2.start()
    thread3.start()
    thread4.start()

    thread1.join()
    thread2.join()
    thread3.join()
    thread4.join()
    print(f'used time:{time.time()-start_time}')

To my surprise, the output is:

used time:23.058093309402466

And then I tried asyncio:

import time
import asyncio

async def test_1():
    for _ in range(100000):
        print('validate')
        print('parsing')
        print('inserting')


async def test_2():
    for _ in range(100000):
        print('validate')
        print('parsing')
        print('inserting')


async def test_3():
    for _ in range(100000):
        print('validate')
        print('parsing')
        print('inserting')


async def test_4():
    for _ in range(100000):
        print('validate')
        print('parsing')
        print('inserting')


async def multiple_tasks():
    input_coroutines = [test_1(), test_2(), test_3(), test_4()]
    res = await asyncio.gather(*input_coroutines, return_exceptions=True)
    return res

if __name__ == '__main__':
    start_time = time.time()
    res1, res2, res3, res4 = asyncio.run(multiple_tasks())
    print(f'used time:{time.time()-start_time}')

The output is:

used time:9.295843601226807

In the end I tried ProcessPoolExecutor:

import time
from concurrent.futures import ProcessPoolExecutor
def data_handler(bounds):
    for _ in range(bounds[0], bounds[1]):
        print('validate')
        print('parsing')
        print('inserting')

def run():
    ranges = [(0, 100000), (100000, 200000), (200000, 300000), (300000, 400000)]
    with ProcessPoolExecutor() as executor:
        executor.map(data_handler, ranges)

if __name__ == '__main__':
    start_time = time.time()
    run()
    stop_time = time.time()
    print('used time %s' % (stop_time - start_time))

The output is:

used time 12.726619243621826

So how can I speed up the process? I think I went about this the wrong way. Can anyone help? Best regards!

Upvotes: 3

Views: 1085

Answers (1)

flakes

Reputation: 23624

Okay, so here is what you observed:

No parallelism      9.545064210891724
asyncio             9.295843601226807
multithreading     23.058093309402466
multiprocessing    12.726619243621826

First off, asyncio doesn't actually use threads, and, as you might guess, its performance relies on there being some I/O. Asyncio alternates between tasks in an event loop, switching whenever one of them hits an await. If await is never used, it simply runs each task to completion, one at a time, without switching at all.
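
For example, adding an await inside the loop, even a zero-second sleep, is what gives the event loop a chance to switch between tasks. A minimal sketch (the coroutine names here are mine, not from your code):

import asyncio

async def tick(name):
    for i in range(3):
        print(f'{name}: {i}')
        # Without this await, each coroutine runs to completion
        # before the next one starts; with it, the event loop can
        # switch to the other task on every iteration.
        await asyncio.sleep(0)

async def main():
    await asyncio.gather(tick('task1'), tick('task2'))

asyncio.run(main())

With the sleep in place the two tasks interleave; delete it and they run back to back, which is exactly why your asyncio version took about as long as the serial one.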

With threads, only one thread can hold control of the Python interpreter at a time, due to the Global Interpreter Lock (GIL). What you end up with here is contention from the different threads all trying to do work at once, and that context switching is what's slowing down your app. As with asyncio, you really only get those speedups if the threads can schedule other work while waiting on some I/O.
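
To see the flip side, here is a minimal sketch where the "work" is a simulated I/O wait (time.sleep standing in for a network or disk call). Because sleeping releases the GIL, four threads finish in roughly one second instead of four:

import threading
import time

def fake_io():
    # time.sleep releases the GIL, so the other threads can run
    # while this one waits, just like real network or disk I/O.
    time.sleep(1)

if __name__ == '__main__':
    start = time.time()
    threads = [threading.Thread(target=fake_io) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(f'4 one-second waits took ~{time.time() - start:.1f}s')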

Okay, so now surely the multiprocessing case should have run faster... right? Well, each process does have its own interpreter lock; the holdup, however, is in your print statements. Each process blocks while trying to send its output to the same console pipe. Let me show you with an example.

Say we have a function to be run four times: once serially and once in parallel.

import time
from concurrent.futures import ProcessPoolExecutor


def run(thread):
    print(f"Starting thread: {thread}")
    for i in range(500000):
        print('foobar')
    print(f"Finished thread: {thread}")


def run_singlethreaded():
    start_time = time.time()

    for thread in ["local"] * 4:
        run(thread)

    stop_time = time.time()
    return stop_time - start_time


def run_multiprocessing():
    start_time = time.time()

    with ProcessPoolExecutor(max_workers=4) as ex:
        ex.map(run, ["mp0", "mp1", "mp2", "mp3"])

    stop_time = time.time()
    return stop_time - start_time

if __name__ == '__main__':
    singlethreaded_time = run_singlethreaded()
    multiprocessing_time = run_multiprocessing()
    print(f"Finished singlethreaded in:  {singlethreaded_time}")
    print(f"Finished multiprocessing in: {multiprocessing_time}")

If we run this and print the timings, you might be surprised to see:

Finished singlethreaded in:  10.513998746871948
Finished multiprocessing in: 12.252000570297241

Now if we change the print to something simpler that doesn't cause an I/O bottleneck:

def run(thread):
    print(f"Starting thread: {thread}")
    for i in range(100000000):
        pass
    print(f"Finished thread: {thread}")

You get the parallel speedup you expect:

Finished singlethreaded in:  9.816999435424805
Finished multiprocessing in: 2.503000020980835

The important takeaway here is that before parallelism can help you, you need to understand where you are resource constrained. For I/O-bound applications, threading or asyncio can be helpful. For CPU-bound applications, multiprocessing can be useful. And there are times when neither will really help you (like the print statements here), because the bottleneck exists in a system external to the app.
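
As a concrete illustration of that last point, here is one possible rework of your ProcessPoolExecutor attempt. This is a sketch, not the only approach: it assumes the per-item output can be collected in memory rather than printed line by line. Each worker builds its output locally and hands back one string, so the processes stop contending for the console pipe:

import time
from concurrent.futures import ProcessPoolExecutor

def data_handler(bounds):
    # Collect output in memory instead of printing every line;
    # the workers no longer fight over the shared console pipe.
    lines = []
    for _ in range(bounds[0], bounds[1]):
        lines.append('validate')
        lines.append('parsing')
        lines.append('inserting')
    return '\n'.join(lines)

if __name__ == '__main__':
    start_time = time.time()
    ranges = [(0, 100000), (100000, 200000),
              (200000, 300000), (300000, 400000)]
    with ProcessPoolExecutor(max_workers=4) as executor:
        for chunk in executor.map(data_handler, ranges):
            pass  # or write each chunk to a file / log sink in one call
    print(f'used time:{time.time() - start_time}')

Hope this helps!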

Upvotes: 5
