Reputation:
As you surely know, I can use multithreading to download files from the Internet faster. But if I send too many requests to the same website, I could be blacklisted.
So could you help me implement something like: "I've got a list of URLs. I want to download all of these files, but if 10 downloads are already running, wait for a free slot."
I'd appreciate any help. Thanks.
binoua
This is the code I'm using (doesn't work).
import threading
import urllib2
from Queue import Queue

# OstDownloadException is defined elsewhere in the project

class PDBDownloader(threading.Thread):

    prefix = 'http://www.rcsb.org/pdb/files/'

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.pdbid = None
        self.urlstr = ''
        self.content = ''

    def run(self):
        while True:
            self.pdbid = self.queue.get()
            self.urlstr = self.prefix + self.pdbid + '.pdb'
            print 'downloading', self.pdbid
            self.download()
            filename = '%s.pdb' % (self.pdbid,)
            f = open(filename, 'wt')
            f.write(self.content)
            f.close()
            self.queue.task_done()

    def download(self):
        try:
            f = urllib2.urlopen(self.urlstr)
        except urllib2.HTTPError, e:
            msg = 'HTTPError while downloading file %s at %s. '\
                  'Details: %s.' % (self.pdbid, self.urlstr, str(e))
            raise OstDownloadException, msg
        except urllib2.URLError, e:
            msg = 'URLError while downloading file %s at %s. '\
                  'RCSB server unavailable.' % (self.pdbid, self.urlstr)
            raise OstDownloadException, msg
        except Exception, e:
            raise OstDownloadException, str(e)
        else:
            self.content = f.read()


if __name__ == '__main__':
    queue = Queue()
    pdblist = ['1BTA', '3EAM', '1EGJ', '2BV9', '2X6A']
    for i in xrange(len(pdblist)):
        pdb = PDBDownloader(queue)
        pdb.setDaemon(True)
        pdb.start()
    while pdblist:
        pdbid = pdblist.pop()
        queue.put(pdbid)
    queue.join()
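For reference, one common standard-library pattern for the "at most 10 downloads at once" behaviour the question asks for is to start a fixed number of worker threads that all pull URLs from one shared Queue. The sketch below is illustrative, not part of the original code: the pool size of 10, the worker function, and the reuse of the RCSB URLs are assumptions made for the example.

import threading
import urllib2
from Queue import Queue

NUM_WORKERS = 10  # at most 10 downloads run at any one time

def worker(queue):
    # illustrative worker: each thread handles one URL at a time
    while True:
        url = queue.get()              # blocks until a URL is available
        try:
            data = urllib2.urlopen(url).read()
            print 'downloaded %s (%d bytes)' % (url, len(data))
        except Exception, e:
            print 'failed %s: %s' % (url, e)
        finally:
            queue.task_done()          # frees the slot for the next URL

if __name__ == '__main__':
    queue = Queue()
    for _ in xrange(NUM_WORKERS):
        t = threading.Thread(target=worker, args=(queue,))
        t.setDaemon(True)              # workers die when the main thread exits
        t.start()
    for pdbid in ['1BTA', '3EAM', '1EGJ', '2BV9', '2X6A']:
        queue.put('http://www.rcsb.org/pdb/files/%s.pdb' % pdbid)
    queue.join()                       # waits until every queued URL is done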
Upvotes: 1
Views: 597
Reputation: 222862
Using threads doesn't "download files from the Internet faster". You have only one network card and one Internet connection, so that's simply not true.
The threads are only being used to wait, and you can't wait faster.
You can use a single thread and be just as fast, or even faster: just don't wait for the response to one file before starting the next. In other words, use asynchronous, non-blocking network programming.
Here's a complete script that uses twisted.internet.task.coiterate to start multiple downloads at the same time, without using any kind of threading, and respecting the pool size (I'm using 2 simultaneous downloads for the demonstration, but you can change the size):
from twisted.internet import defer, task, reactor
from twisted.web import client
from twisted.python import log

@defer.inlineCallbacks
def deferMap(job, dataSource, size=1):
    successes = []
    failures = []

    def _cbGather(result, dataUnit, succeeded):
        """This will be called when any download finishes"""
        if succeeded:
            # you could save the file to disk here
            successes.append((dataUnit, result))
        else:
            failures.append((dataUnit, result))

    # @apply calls work() immediately, so `work` is a single generator
    # shared by every coiterate() call below
    @apply
    def work():
        for dataUnit in dataSource:
            d = job(dataUnit).addCallbacks(_cbGather, _cbGather,
                    callbackArgs=(dataUnit, True), errbackArgs=(dataUnit, False))
            yield d

    # each coiterate() pulls jobs from the shared generator,
    # so at most `size` downloads are in flight at any time
    yield defer.DeferredList([task.coiterate(work) for i in xrange(size)])
    defer.returnValue((successes, failures))


def printResults(result):
    successes, failures = result
    print "*** Got %d pages total:" % (len(successes),)
    for url, page in successes:
        print ' * %s -> %d bytes' % (url, len(page))
    if failures:
        print "*** %d pages failed download:" % (len(failures),)
        for url, failure in failures:
            print ' * %s -> %s' % (url, failure.getErrorMessage())


if __name__ == '__main__':
    import sys
    log.startLogging(sys.stdout)
    urls = ['http://twistedmatrix.com',
            'XXX',
            'http://debian.org',
            'http://python.org',
            'http://python.org/foo',
            'https://launchpad.net',
            'noway.com',
            'somedata',
            ]
    pool = deferMap(client.getPage, urls, size=2)  # download 2 at once
    pool.addCallback(printResults)
    pool.addErrback(log.err).addCallback(lambda ign: reactor.stop())
    reactor.run()
Note that I included some bad URLs on purpose, so we can see some failures in the result:
...
2010-06-29 08:18:04-0300 [-] *** Got 4 pages total:
2010-06-29 08:18:04-0300 [-] * http://twistedmatrix.com -> 16992 bytes
2010-06-29 08:18:04-0300 [-] * http://python.org -> 17207 bytes
2010-06-29 08:18:04-0300 [-] * http://debian.org -> 13820 bytes
2010-06-29 08:18:04-0300 [-] * https://launchpad.net -> 18511 bytes
2010-06-29 08:18:04-0300 [-] *** 4 pages failed download:
2010-06-29 08:18:04-0300 [-] * XXX -> Connection was refused by other side: 111: Connection refused.
2010-06-29 08:18:04-0300 [-] * http://python.org/foo -> 404 Not Found
2010-06-29 08:18:04-0300 [-] * noway.com -> Connection was refused by other side: 111: Connection refused.
2010-06-29 08:18:04-0300 [-] * somedata -> Connection was refused by other side: 111: Connection refused.
...
Upvotes: 4
Reputation: 304185
Use a pool of threads with a shared list of URLs. Each thread pops a URL from the list and downloads it until none are left. pop() on a list is thread-safe, so each worker's loop can simply be the snippet below (a fuller sketch follows it):
while True:
    try:
        url = url_list.pop()
        # download URL here
    except IndexError:
        break
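Expanding that loop into a self-contained sketch: the fetch helper, the pool size of 3, and the example URLs below are illustrative additions, not part of the answer.

import threading
import urllib2

def fetch(url):
    # illustrative helper: download one URL and report its size
    data = urllib2.urlopen(url).read()
    print '%s -> %d bytes' % (url, len(data))

def worker(url_list):
    while True:
        try:
            url = url_list.pop()   # pop() is atomic, so no lock is needed
        except IndexError:
            break                  # list is empty, this worker is done
        try:
            fetch(url)
        except Exception, e:
            print '%s failed: %s' % (url, e)

if __name__ == '__main__':
    url_list = ['http://www.rcsb.org/pdb/files/%s.pdb' % pdbid
                for pdbid in ['1BTA', '3EAM', '1EGJ', '2BV9', '2X6A']]
    threads = [threading.Thread(target=worker, args=(url_list,))
               for _ in xrange(3)]  # a pool of 3 worker threads
    for t in threads:
        t.start()
    for t in threads:
        t.join()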
Upvotes: 0