qin peter

Reputation: 359

Threading cannot improve the script speed

I need to analyze an Apache log file of about 1 GB. I wrote a Python script that gets the result in about 18 seconds. The script is:

#!/usr/bin/python
import sys
filename = sys.argv[1]
name = {}
with open(filename,"r") as data:
    for i in data:
        av = i.split()
        if name.has_key(av[7]):
            name[av[7]] += int(av[4])
        else:
            name[av[7]] = int(av[4])


mm = open("ziyou_end","w")
#print result to ziyou_end
for i in name:
    mm.write("%s   %s\n" %(i,name[i]))

I wanted to improve the speed of the script, so I used threading:

#!/usr/bin/python
import threading
import Queue
import sys
import time
all = {}
def do_work(in_queue, out_queue):
    while True:
        item = in_queue.get()
        # process one log line and update the shared dict
        aitem = item.split()
        if all.has_key(aitem[7]):
            all[aitem[7]] += int(aitem[4])
        else:
            all[aitem[7]] = int(aitem[4])
        in_queue.task_done()

if __name__ == "__main__":
    work = Queue.Queue()
    results = Queue.Queue()
    af = open(sys.argv[1],"r")
    #get file
    af_con = []
    for i in af:
        af_con.append(i);

    # start 4 worker threads
    for i in xrange(4):
        t = threading.Thread(target=do_work, args=(work, results))
        t.daemon = True
        t.start()

    # produce data
    for i in af_con:
        work.put(i)


    work.join()

    result = open ("result_thread","w");
    # get the results
    for i in all:
        result.write(i+str(all[i])+"\n")

    sys.exit()

But it spends 320 seconds to get the result. Who can tell me why?

Using multiprocessing is the same; it also takes a long time to get the result:

#!/usr/bin/env python
#coding:utf-8
from multiprocessing import Pool
import time
import os
import sys
filename = sys.argv[1]
ALL = {}
def process_line(line):
    global ALL
    av = line.split()
    i = av[7]
    if ALL.has_key(i):
        ALL[i] = ALL[i] + int(av[4])
    else:
        ALL[i] = int(av[4])

if __name__ == "__main__":
    pool = Pool(12)
    with open(filename,"r") as source_file:
        # chunk the work into batches of 4 lines at a time
        results = pool.map(process_line, source_file, 1)

I do not know why.

Upvotes: 0

Views: 59

Answers (1)

David Ehrmann

Reputation: 7576

Your task is IO-bound (you spend more time reading the file than processing the data), so threading won't help much here.

As for your threads, CPython's global interpreter lock (GIL) means only one thread runs Python bytecode at a time, so threading only helps when there are multiple tasks with blocking IO (like a web server). Instead of having one worker (a core) work on one stack of work, you split the stack into four but still have the one worker, only now that worker has to split its time between tasks and deal with synchronizing the record keeping.
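If you want to see this for yourself, here's a rough toy sketch (a made-up counting loop, not your log parser): it runs the same CPU-bound work once in the main thread and once split across four threads. On a stock CPython interpreter the threaded run is usually no faster, and often a bit slower, because the threads just take turns holding the GIL.

#!/usr/bin/python
# Toy illustration only: CPU-bound counting, single-threaded vs. 4 threads.
# With no blocking IO, the threads cannot overlap, so 4 threads give no speedup.
import threading
import time

def count(n):
    # pure CPU work, no IO
    total = 0
    for i in xrange(n):
        total += i

N = 10000000

start = time.time()
count(N)
print("one thread:   %.2f s" % (time.time() - start))

start = time.time()
threads = [threading.Thread(target=count, args=(N // 4,)) for _ in xrange(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("four threads: %.2f s" % (time.time() - start))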

Try something more like this. It reduces memory overhead (which can improve CPU performance if you're doing a lot of allocating, reallocating, and freeing), and it eliminates storing all the lines in a work queue.

#!/usr/bin/python
import sys
from collections import defaultdict

filename = sys.argv[1]
name = defaultdict(int)

with open(filename,"r") as s:
    for i in s:
        av = i.split()
        name[av[7]] += int(av[4])

#print result to ziyou_end    
with open("ziyou_end","w") as mm:
    for k, v in name.iteritems():
        mm.write("%s   %s\n" % (k, v))

Upvotes: 3
