Reputation: 11
I have a script (a Django management command) with over 800 lines of code. It imports data from an external web service, manipulates it, and writes it to a Postgres DB.
I use multithreading because fetching data from the web service is not very fast.
There is one thread that fetches the data with a bulk command (64 data sets per request) and writes each data set into a queue.
At the start there is also exactly one worker thread, which manipulates the data and writes it to the DB. In the main (handle) method there is a while loop that checks every 5 seconds how many elements are in the queue and how many worker threads are running. If there are more than 500 elements in the queue and fewer than 5 worker threads, it starts a new worker thread.
Each worker thread takes one item from the queue, manipulates it, writes the data set to the DB and appends one string (up to 14 characters) to a second queue (#2).
Queue #2 is needed so that at the end of the import I have all imported objects, in order to mark them as new and to delete every item from the DB that was not part of the current import.
For DBs with no more than 200,000 data sets everything works fine. But with a DB of, say, 1,000,000 data sets, the memory consumption grows over the course of the script to up to 8 GB of RAM.
Is there a way to watch the memory consumption of the threads and/or the queues? Is there a way to "clean up" memory after each pass of the while loop?
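Something like the following is the kind of check I have in mind for the monitoring loop (a minimal sketch; resource.getrusage is from the standard library and reports the peak RSS of the whole process, on Linux in kilobytes; per-thread figures are not available in CPython):

import resource

# peak resident set size of the whole process so far (Linux: kilobytes)
rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print "RSS: %d MB - queue1: %d - queue2: %d" % (rss_kb / 1024, queue1.qsize(), item_still_exists2.qsize())

Here is the script: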
# -*- coding: utf-8 -*-
import os
import threading
import Queue
import time
from optparse import OptionParser, make_option
from decimal import Decimal
from datetime import datetime

from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.conf import settings

# (the Artikel model and helpers such as getItemBulk and
# timedelta_total_seconds are imported in the abbreviated parts)


def is_someone_alive(thread_list):
    so_alive = False
    for t in thread_list:
        if t.is_alive():
            so_alive = True
    return so_alive


class insert_item(threading.Thread):
    VarLock2 = threading.Lock()

    def __init__(self, queue1, item_still_exist2, name, *args, **options):
        threading.Thread.__init__(self)
        self.options = options
        self.name = name
        self.queue1 = queue1
        self.item_still_exist2 = item_still_exist2

    def run(self):
        while not self.queue1.empty() or getItemBulkThread.isrunning:
            item = self.queue1.get()
            artikelobj, created = Artikel.objects.get_or_create(artikelnr=item['Nr'])
            """
            manipulate data
            """
            self.item_still_exist2.put(artikelobj.artikelnr)
            artikelobj.save()
            self.queue1.task_done()


class getItemBulkThread(threading.Thread):
    isrunning = True
    VarLock = threading.Lock()

    def __init__(self, queue1, name, *args, **options):
        threading.Thread.__init__(self)
        self.options = options
        if self.options['nrStart'] != '':
            self.nrab = self.options['nrStart']
        else:
            self.nrab = ''
        self.name = name
        #self.nrab = '701307'
        self.queue1 = queue1
        self.anz_artikel = 64
        self.max_artikel = 64
        self.skipped = 0
        self.max_skip = 20

    def run(self):
        count_sleep = 0
        while True:
            while self.queue1.qsize() > 5000:
                time.sleep(5)
                count_sleep += 1
            if count_sleep > 0:
                print "~ Artikel import paused %(csleep)sx for 5s because queue size > 5000" % {'csleep': count_sleep}
                count_sleep = 0
            try:
                items = getItemBulk()  # from external service
            except Exception as exc1:
                if '"normal" abort-condition' in str(exc1):
                    getItemBulkThread.VarLock.acquire()
                    getItemBulkThread.isrunning = False
                    getItemBulkThread.VarLock.release()
                    break
                elif self.anz_artikel > 1:
                    self.anz_artikel /= 2
                    continue
                elif self.skipped <= self.max_skip:
                    self.nrab += 1
                    self.skipped += 1
                    time.sleep(5)
                    continue
                elif self.skipped > self.max_skip:
                    raise Exception("[EXCEPTION] error in thread: too many items skipped")
                else:
                    getItemBulkThread.VarLock.acquire()
                    getItemBulkThread.isrunning = False
                    getItemBulkThread.VarLock.release()
                    raise
            last_item = len(items) - 1
            self.nrab = items[last_item]['Nr']
            for artikel in items:
                artikel['katItem'] = False
                self.queue1.put(artikel)
            if self.anz_artikel < self.max_artikel:
                self.anz_artikel *= 2
            self.skipped = 0


class Command(BaseCommand):
    """
    Django management command
    """
    help = u'Import'

    def create_parser(self, prog_name, subcommand):
        """
        Create and return the ``OptionParser`` which will be used to
        parse the arguments to this command.
        """
        return OptionParser(prog=prog_name, usage=self.usage(subcommand),
                            version=self.get_version(),
                            option_list=self.option_list,
                            conflict_handler="resolve")

    def handle(self, *args, **options):
        startzeit = datetime.now()
        anzahl_Artikel_vorher = Artikel.objects.all().count()  # Artikel is a model
        self.options = options
        items_vorher = []
        queue1 = Queue.Queue()
        item_still_exists2 = Queue.Queue()
        running_threads = []

        thread = getItemBulkThread(queue1, name="Artikel", *args, **options)
        running_threads.append(thread)
        thread.daemon = True
        thread.start()

        anz_worker_threads = 1
        anz_max_worker_threads = 5
        insert_threads = [insert_item(queue1, item_still_exists2, name="Worker-%(anz)s" % {'anz': i + 1}, *args, **options)
                          for i in range(anz_worker_threads)]
        for thread in insert_threads:
            running_threads.append(thread)
            thread.setDaemon(True)
            thread.start()

        add_seconds = 5
        element_grenze = 500
        lastelemente = 0
        asc_elemente = 0
        anz_abgearbeitet = 0

        while getItemBulkThread.isrunning or not queue1.empty():
            time.sleep(add_seconds)
            elemente = queue1.qsize()
            akt_zeit = datetime.now()
            diff_zeit = akt_zeit - startzeit
            diff = elemente - lastelemente
            anz_abgearbeitet = item_still_exists2.qsize()
            art_speed = (anz_abgearbeitet / timedelta_total_seconds(diff_zeit)) * 60
            ersetz_var = {'anz': elemente, 'zeit': diff_zeit, 'tstamp': akt_zeit.strftime('%Y.%m.%d-%H:%M:%S'),
                          'anzw': anz_worker_threads, 'diff': diff, 'anza': anz_abgearbeitet, 'art_speed': art_speed}
            print("%(zeit)s elapsed - %(tstamp)s - %(anz)s elements in queue, change: %(diff)s - workers: %(anzw)s - Artikel imported: %(anza)s - speed: %(art_speed)02d Art/min" % ersetz_var)
            if diff > 0:
                asc_elemente += 1
            else:
                asc_elemente = 0
            if asc_elemente > 2 and anz_worker_threads < anz_max_worker_threads and elemente > element_grenze:
                ersetz_var = {'maxw': anz_max_worker_threads, 'nr': anz_worker_threads + 1, 'element_grenze': element_grenze}
                print "~~ Queue grew twice in a row, max worker count %(maxw)s not yet reached and more than %(element_grenze)s elements in the queue, so starting a new worker (no. %(nr)s)" % ersetz_var
                anz_worker_threads += 1
                thread = insert_item(queue1, item_still_exists2, name="Worker-%(anz)s" % {'anz': anz_worker_threads}, *args, **options)
                running_threads.append(thread)
                thread.setDaemon(True)
                thread.start()
                asc_elemente = 0
            lastelemente = elemente

        queue1.join()

        items_nachher = []
        while not item_still_exists2.empty():
            item = item_still_exists2.get()
            if item in items_vorher:
                items_nachher.append(item)
                items_vorher.remove(item)
            item_still_exists2.task_done()
        item_still_exists2.join()

        if len(items_vorher) > 0:
            Artikel.objects.filter(artikelnr__in=items_vorher).delete()
        anzahl_Artikel_nachher = Artikel.objects.all().count()
        anzahl_Artikel_diff = anzahl_Artikel_nachher - anzahl_Artikel_vorher
        endzeit = datetime.now()
        dauer = endzeit - startzeit
I've abbreviated the code in some places :)
Upvotes: 1
Views: 829
Reputation: 25197
A possible cause of the excessive memory consumption is that you don't set a maximum size for the input queue. See the maxsize parameter.
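For illustration, with a bounded queue the fetch thread blocks in put() as soon as the limit is reached, so it can never run far ahead of the workers, and the manual qsize() > 5000 polling in the fetch loop becomes unnecessary (a minimal sketch, reusing the 5000 threshold from the posted code):

import Queue

queue1 = Queue.Queue(maxsize=5000)  # put() blocks while 5000 items are waiting

# in the fetch thread, instead of polling qsize() and sleeping:
items = getItemBulk()  # as in the posted code
for artikel in items:
    queue1.put(artikel)  # blocks until a worker has made room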
On a related note, you write:
At the start there is also exactly one worker thread, which manipulates the data and writes it to the DB. In the main (handle) method there is a while loop that checks every 5 seconds how many elements are in the queue and how many worker threads are running. If there are more than 500 elements in the queue and fewer than 5 worker threads, it starts a new worker thread.
Creating a new thread does not necessarily increase throughput. You should instead run some tests to determine the optimal number of threads; it may turn out to be 1.
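One simple way to run such tests is to make the worker count a command-line option instead of spawning threads dynamically, then compare total run times for a fixed data set. A sketch using the old-style option_list mechanism the posted command already relies on (the option name --workers is illustrative):

from optparse import make_option
from django.core.management.base import BaseCommand

class Command(BaseCommand):
    option_list = BaseCommand.option_list + (
        make_option('--workers', dest='workers', type='int', default=1,
                    help='number of worker threads to start'),
    )

    def handle(self, *args, **options):
        anz_worker_threads = options['workers']
        # start exactly this many insert_item threads up front,
        # run the import, and time the whole run for 1, 2, 3, ... workers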
Upvotes: 1