Reputation: 2702
How would I make the below script download multiple links at once instead of one at a time with urllib2?
python:
from BeautifulSoup import BeautifulSoup
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re
print ("downloading and parsing Bibles...")
root = html.parse(open('links.html'))
for link in root.findall('//a'):
    url = link.get('href')
    name = urlparse.urlparse(url).path.split('/')[-1]
    dirname = urlparse.urlparse(url).path.split('.')[-1]
    f = urllib2.urlopen(url)
    s = f.read()
    if not os.path.isdir(dirname):
        os.mkdir(dirname)
    soup = BeautifulSoup(s)
    articleTag = soup.html.body.article
    converted = str(articleTag)
    full_path = os.path.join(dirname, name)
    open(full_path, 'w').write(converted)
    print(name)
print("DOWNLOADS COMPLETE!")
links.html
<a href="http://www.youversion.com/bible/gen.1.nmv-fas">http://www.youversion.com/bible/gen.1.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.2.nmv-fas">http://www.youversion.com/bible/gen.2.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.3.nmv-fas">http://www.youversion.com/bible/gen.3.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.4.nmv-fas">http://www.youversion.com/bible/gen.4.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.5.nmv-fas">http://www.youversion.com/bible/gen.5.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.6.nmv-fas">http://www.youversion.com/bible/gen.6.nmv-fas</a>
Upvotes: 2
Views: 2244
Reputation: 1293
Blainer, try threading.
Here's a good practical example:
http://www.ibm.com/developerworks/aix/library/au-threadingpython/
Then reference the Python standard library as well:
http://docs.python.org/library/threading.html
The practical example actually includes a sample of threaded concurrent downloads with urllib2. I went ahead and took you a few steps further into the process; you will have to work on the part that says FIX THIS to further parse your HTML out (one possible way to finish it is sketched after the code).
#!/usr/bin/env python
import Queue
import threading
import urllib2
import time
import htmllib, formatter

class LinksExtractor(htmllib.HTMLParser):
    # derived HTML parser that collects hyperlinks
    def __init__(self, formatter):
        # call the base class constructor
        htmllib.HTMLParser.__init__(self, formatter)
        # create an empty list for storing hyperlinks
        self.links = []

    def start_a(self, attrs):
        # override the handler for <a ...> tags
        for attr in attrs:
            if attr[0] == "href":
                # ignore all non-href attributes
                self.links.append(attr[1])  # save the link in the list

    def get_links(self):
        # return the list of extracted links
        return self.links

format = formatter.NullFormatter()
htmlparser = LinksExtractor(format)

data = open("links.html")
htmlparser.feed(data.read())
htmlparser.close()

hosts = htmlparser.links

queue = Queue.Queue()

class ThreadUrl(threading.Thread):
    """Threaded URL grab"""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            # grab a host from the queue
            host = self.queue.get()

            ####################################
            ############FIX THIS PART###########
            #VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV#
            url = urllib2.urlopen(host)
            morehtml = url.read()  # you're on your own with this

            # signal to the queue that the job is done
            self.queue.task_done()

start = time.time()

def main():
    # spawn a pool of threads and pass them the queue instance
    for i in range(5):
        t = ThreadUrl(queue)
        t.setDaemon(True)
        t.start()

    # populate the queue with data
    for host in hosts:
        queue.put(host)

    # wait on the queue until everything has been processed
    queue.join()

main()
print "Elapsed Time: %s" % (time.time() - start)
Upvotes: 1