blackmamba

Reputation: 2002

Download files from URLs in parallel in Python

I have some links in a database which I want to download in parallel. I tried doing it serially, but it took too much time. I have around 1877 links.

I tried this code for running the downloads in parallel, but it throws an error: failed: 'tuple' object has no attribute 'read'

#!/usr/bin/env python

import urllib
from stream import ThreadPool

URLs = [
  'http://www.cnn.com/',
  'http://www.bbc.co.uk/',
  'http://www.economist.com/',
  'http://nonexistant.website.at.baddomain/',
  'http://slashdot.org/',
  'http://reddit.com/',
  'http://news.ycombinator.com/'
 ]

def retrieve(urls):
    for url in urls:
        print url, ' '
        res = urllib.urlretrieve(url).read()
        yield url, res

if __name__ == '__main__':
    retrieved = URLs >> ThreadPool(retrieve, poolsize=7)
    for url, content in retrieved:
        print '%r is %d bytes' % (url, len(content))
    for url, exception in retrieved.failure:
        print '%r failed: %s' % (url, exception)

I tried this as well:

import urllib
import tldextract
from multiprocessing.pool import ThreadPool

URLs = [
  'http://www.cnn.com/',
  'http://www.bbc.co.uk/',
  'http://www.economist.com/',
  'http://nonexistant.website.at.baddomain/',
  'http://slashdot.org/',
  'http://reddit.com/',
  'http://news.ycombinator.com/'
 ]


def dwld(url):
    print url
    res = urllib.urlopen(url).read()
    filename = tldextract.extract(url)
    with open(filename.domain, 'wb') as fh:
        fh.write(res)
    return url

pool = ThreadPool(processes = 4)
pool.map(dwld, URLs)

Gives me:

Traceback (most recent call last):
  File "dwld_thread.py", line 26, in <module>
    pool.map(dwld, URLs)
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 148, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 422, in get
    raise self._value
IOError: [Errno socket error] [Errno 8] nodename nor servname provided, or not known

Upvotes: 0

Views: 2842

Answers (4)

tc_qa

Reputation: 37

What about using multiprocessing?

Sample code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-


import sys
import urllib
from multiprocessing import Pool

import os

POOL = 8
PDFS_DOWNLOAD_DIR = 'pdfs'
PDF_LINKS = sys.argv[1]


class DownloadFiles(object):
    def __init__(self):
        self.pdf_links = self.read_links_from_file()
        self.create_download_dir()

    def create_download_dir(self):
        try:
            if not os.path.exists(PDFS_DOWNLOAD_DIR):
                os.makedirs(PDFS_DOWNLOAD_DIR)
        except IOError as e:
            exit()

    def read_links_from_file(self):
        try:
            with open(PDF_LINKS, 'r') as f:
                return list(set([x.strip() for x in f]))
        except (IndexError, IOError) as e:
            exit()

    def get_file(self, link):

        filename = link.split('/')[-2]

        print('Downloading file --> "{filename}"'.format(
            filename=filename
        ))

        urllib.urlretrieve(link, filename='{pdfs_data}/{filename}'.format(
            pdfs_data=PDFS_DOWNLOAD_DIR,
            filename=filename
        ))

    def download(self):

        pool = Pool(POOL)
        pool.map(self.get_file, self.pdf_links)

        pool.close()
        pool.join()

        print('\nSuccessfully downloaded files from given source!\n')


d = DownloadFiles()
d.download()
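
A quick usage note, assuming (hypothetically) the script above is saved as download_pdfs.py and links.txt contains one PDF URL per line; the path to the links file is passed in as sys.argv[1]:

python download_pdfs.py links.txt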

Upvotes: 0

Torxed

Reputation: 23500

from threading import *
from time import sleep
# if Python2:
import urllib
# if Python3:
# import urllib.request

URLs = [
  'http://www.cnn.com/',
  'http://www.bbc.co.uk/',
  'http://www.economist.com/',
  'http://nonexistant.website.at.baddomain/',
  'http://slashdot.org/',
  'http://reddit.com/',
  'http://news.ycombinator.com/'
 ]

class worker(Thread):
    def __init__(self, link):
        Thread.__init__(self)
        self.link = link
        self.start()
    def run(self):
        # if Python2:
        res = urllib.urlopen(self.link).read() # as mentioned by @DhruvPathak
        # if Python3:
        # res = urllib.request.urlopen(self.link).read()
        # '/' can't appear in a filename, so flatten the link into a usable name
        filename = self.link.replace('/', '_')
        with open(filename, 'wb') as fh:
            fh.write(res) # store fetched data in a file named after the link

for url in URLs:
    while len(enumerate()) > 500: # threading.enumerate() lists all live threads
        sleep(0.25)
    worker(url)

while len(enumerate()) > 1:
    sleep(0.25) # wait for all threads to finish

Upvotes: 0

abarnert

Reputation: 366103

I have no idea what that stream.ThreadPool is that you're using, or what its API is… but the problem is obvious:

res = urllib.urlretrieve(url).read()

If you look at the doc for urlretrieve:

Return a tuple (filename, headers) where filename is the local file name under which the object can be found…

You obviously can't call read on that. If you want to download to a local file, using this legacy API, and then read that file, you can:

filename, headers = urllib.urlretrieve(url)
with open(filename) as f:
    res = f.read()

But why? Just use urllib2.urlopen, which "returns a file-like object with two additional methods", so you can just call read on it, and you won't be creating a temporary file, and you're not using an old function that wasn't quite designed right that nobody has maintained in years.
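
For instance, a minimal sketch of that replacement, applied to your retrieve() generator (Python 2's urllib2):

import urllib2

def retrieve(urls):
    for url in urls:
        res = urllib2.urlopen(url).read()  # file-like object, so read() just works
        yield url, res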


But Python has a nice ThreadPoolExecutor built into the standard library. And if you look at the very first example they show you, it's exactly what you're trying to do.

Unfortunately, you're using Python 2.x, which doesn't have the concurrent.futures module. Fortunately, there is a backport on PyPI that works with 2.5+.
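
The backport is published on PyPI under the name futures, so (assuming you have pip available) installing it should just be:

pip install futures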

Python also has multiprocessing.dummy.Pool (also available under the undocumented, but probably more readable, name multiprocessing.ThreadPool). But if you're willing to go outside the stdlib for some module that you apparently aren't sure how to use and that I've never heard of, I'm guessing you won't have any problem using futures. So:

import futures
import urllib2

URLs = [
  'http://www.cnn.com/',
  'http://www.bbc.co.uk/',
  'http://www.economist.com/',
  'http://nonexistant.website.at.baddomain/',
  'http://slashdot.org/',
  'http://reddit.com/',
  'http://news.ycombinator.com/'
 ]

def load_url(url):
    return urllib2.urlopen(url).read()

if __name__ == '__main__':
    with futures.ThreadPoolExecutor(max_workers=7) as executor:
        fmap = dict((executor.submit(load_url, url), url) for url in URLs)
        for f in futures.as_completed(fmap):
            url = fmap[f]
            try:
                content = f.result()
            except Exception as exception:
                print '%r failed: %s' % (url, exception)
            else:
                print '%r is %d bytes' % (url, len(content))
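
For comparison, here's a rough sketch of the same downloads with the multiprocessing.dummy.Pool mentioned above (reusing the URLs list from the example); unlike the futures version, a plain pool.map would let the first failing URL kill the whole map, so the error handling moves into the worker function:

from multiprocessing.dummy import Pool  # thread-backed Pool, same API as multiprocessing.Pool
import urllib2

def load_url(url):
    try:
        return url, urllib2.urlopen(url).read()
    except Exception as exception:
        return url, exception

pool = Pool(7)
for url, result in pool.map(load_url, URLs):
    if isinstance(result, Exception):
        print '%r failed: %s' % (url, result)
    else:
        print '%r is %d bytes' % (url, len(result))
pool.close()
pool.join()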

Upvotes: 3

DhruvPathak

Reputation: 43265

urllib.urlretrieve(url).read() should be urllib.urlopen(url).read()
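
Applied to the retrieve() generator in the question, that one-line change looks like:

def retrieve(urls):
    for url in urls:
        print url, ' '
        res = urllib.urlopen(url).read()  # urlopen returns a file-like object you can read()
        yield url, res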

Upvotes: 0
