Reputation: 2702
How would I make the below script download multiple links at once instead of one at a time with urllib2?
python:
from BeautifulSoup import BeautifulSoup
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re
print ("downloading and parsing Bibles...")
root = html.parse(open('links.html'))
for link in root.findall('//a'):
    url = link.get('href')
    name = urlparse.urlparse(url).path.split('/')[-1]
    dirname = urlparse.urlparse(url).path.split('.')[-1]
    f = urllib2.urlopen(url)
    s = f.read()
    if not os.path.isdir(dirname):
        os.mkdir(dirname)
    soup = BeautifulSoup(s)
    articleTag = soup.html.body.article
    converted = str(articleTag)
    full_path = os.path.join(dirname, name)
    open(full_path, 'w').write(converted)
    print(name)
print("DOWNLOADS COMPLETE!")
links.html
<a href="http://www.youversion.com/bible/gen.1.nmv-fas">http://www.youversion.com/bible/gen.1.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.2.nmv-fas">http://www.youversion.com/bible/gen.2.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.3.nmv-fas">http://www.youversion.com/bible/gen.3.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.4.nmv-fas">http://www.youversion.com/bible/gen.4.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.5.nmv-fas">http://www.youversion.com/bible/gen.5.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.6.nmv-fas">http://www.youversion.com/bible/gen.6.nmv-fas</a>
Upvotes: 2
Views: 2244
Reputation: 1293
Blainer, try threading.
Here's a good practical example:
http://www.ibm.com/developerworks/aix/library/au-threadingpython/
Then reference the Python standard library as well:
http://docs.python.org/library/threading.html
The practical example actually includes a sample of threaded concurrent downloads with urllib2. I went ahead and took you a few steps further into the process; you will have to work on the part that says FIX THIS to further parse your HTML out (one possible way to finish it is sketched after the code).
#!/usr/bin/env python
import Queue
import threading
import urllib2
import time
import htmllib, formatter

class LinksExtractor(htmllib.HTMLParser):
    # derived HTML parser that collects hyperlinks
    def __init__(self, formatter):
        # call the base class constructor
        htmllib.HTMLParser.__init__(self, formatter)
        # create an empty list for storing hyperlinks
        self.links = []

    def start_a(self, attrs):
        # override the handler for <a ...> tags
        for attr in attrs:
            if attr[0] == "href":
                # ignore all non-href attributes
                self.links.append(attr[1])  # save the link in the list

    def get_links(self):
        # return the list of extracted links
        return self.links

format = formatter.NullFormatter()
htmlparser = LinksExtractor(format)

data = open("links.html")
htmlparser.feed(data.read())
htmlparser.close()

hosts = htmlparser.links

queue = Queue.Queue()

class ThreadUrl(threading.Thread):
    """Threaded URL grab"""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            # grab a host from the queue
            host = self.queue.get()

            ####################################
            ############FIX THIS PART###########
            #VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV#
            url = urllib2.urlopen(host)
            morehtml = url.read()  # you're on your own with this

            # signal to the queue that the job is done
            self.queue.task_done()

start = time.time()

def main():
    # spawn a pool of threads and pass them the queue instance
    for i in range(5):
        t = ThreadUrl(queue)
        t.setDaemon(True)
        t.start()

    # populate the queue with data
    for host in hosts:
        queue.put(host)

    # wait on the queue until everything has been processed
    queue.join()

main()
print "Elapsed Time: %s" % (time.time() - start)
Upvotes: 1