Reputation: 4271
I am writing a scraper that downloads all the image files from an HTML page and saves them to a specific folder. All the images are part of the HTML page.
Upvotes: 48
Views: 106772
Reputation: 11
import urllib.request as req

with req.urlopen(image_link) as d, open(image_location, "wb") as image_object:
    data = d.read()
    image_object.write(data)
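A minimal usage sketch; both names below are placeholders for your own URL and path:
image_link = "https://example.com/picture.jpg"   # hypothetical image URL
image_location = "images/picture.jpg"            # hypothetical target path; the folder must already exist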
Upvotes: 1
Reputation: 7632
Remove a few lines of code and you are left with just the img-tag extraction (see the sketch after the usage example below).
Uses Python 3 with Requests, BeautifulSoup, and a few standard-library modules.
import os, sys
import requests
from urllib import parse
from bs4 import BeautifulSoup
import re

def savePageImages(url, imagespath='images'):
    def soupfindnSave(pagefolder, tag2find='img', inner='src'):
        if not os.path.exists(pagefolder):  # create only once
            os.mkdir(pagefolder)
        for res in soup.findAll(tag2find):
            if res.has_attr(inner):  # the inner attribute (the file reference) must exist
                try:
                    filename, ext = os.path.splitext(os.path.basename(res[inner]))  # get name and extension
                    filename = re.sub(r'\W+', '', filename) + ext  # clean special chars from name
                    fileurl = parse.urljoin(url, res.get(inner))
                    filepath = os.path.join(pagefolder, filename)
                    if not os.path.isfile(filepath):  # not downloaded yet
                        with open(filepath, 'wb') as file:
                            filebin = session.get(fileurl)
                            file.write(filebin.content)
                except Exception as exc:
                    print(exc, file=sys.stderr)
    session = requests.Session()
    # ... whatever other requests config you need here
    response = session.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    soupfindnSave(imagespath, 'img', 'src')
Use it like this to save the google.com page images in a folder named google_images:
savePageImages('https://www.google.com', 'google_images')
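As mentioned above, stripping out the saving logic leaves only the img-tag extraction. A minimal sketch of that reduced version (url is the page to scan):

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def pageImageUrls(url):
    # Collect the absolute URL of every img tag's src attribute on the page.
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    return [urljoin(url, img["src"]) for img in soup.findAll("img") if img.has_attr("src")]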
Upvotes: 1
Reputation: 14121
Here is some code to download all the images from the supplied URL, and save them in the specified output folder. You can modify it to your own needs.
"""
dumpimages.py
Downloads all the images on the supplied URL, and saves them to the
specified output file ("/test/" by default)
Usage:
python dumpimages.py http://example.com/ [output]
"""
from bs4 import BeautifulSoup as bs
from urllib.request import (
urlopen, urlparse, urlunparse, urlretrieve)
import os
import sys
def main(url, out_folder="/test/"):
"""Downloads all the images at 'url' to /test/"""
soup = bs(urlopen(url))
parsed = list(urlparse(url))
for image in soup.findAll("img"):
print("Image: %(src)s" % image)
filename = image["src"].split("/")[-1]
parsed[2] = image["src"]
outpath = os.path.join(out_folder, filename)
if image["src"].lower().startswith("http"):
urlretrieve(image["src"], outpath)
else:
urlretrieve(urlunparse(parsed), outpath)
def _usage():
print("usage: python dumpimages.py http://example.com [outpath]")
if __name__ == "__main__":
url = sys.argv[-1]
out_folder = "/test/"
if not url.lower().startswith("http"):
out_folder = sys.argv[-1]
url = sys.argv[-2]
if not url.lower().startswith("http"):
_usage()
sys.exit(-1)
main(url, out_folder)
Edit: You can specify the output folder now.
Upvotes: 91
Reputation: 7130
If the request needs authorization, refer to this one:
import requests

r_img = requests.get(img_url, auth=(username, password))
with open('000000.jpg', 'wb') as f:
    f.write(r_img.content)
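If many images sit behind the same credentials, you can set the auth once on a Session instead of passing it per request. A sketch; username, password, and image_urls are placeholders:

import requests

session = requests.Session()
session.auth = (username, password)       # sent with every request on this session
for i, img_url in enumerate(image_urls):  # image_urls is a hypothetical list of links
    resp = session.get(img_url)
    with open('%06d.jpg' % i, 'wb') as f:
        f.write(resp.content)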
Upvotes: 1
Reputation: 7743
Ryan's solution is good, but fails if the image source URLs are absolute URLs or anything that doesn't give a good result when simply concatenated to the main page URL. urljoin recognizes absolute vs. relative URLs, so replace the loop in the middle with:
from urllib.parse import urljoin

for image in soup.findAll("img"):
    print("Image: %(src)s" % image)
    image_url = urljoin(url, image['src'])
    filename = image["src"].split("/")[-1]
    outpath = os.path.join(out_folder, filename)
    urlretrieve(image_url, outpath)
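To see why this works: urljoin leaves absolute URLs alone and resolves relative ones against the base page URL.

from urllib.parse import urljoin

urljoin("https://example.com/page/", "https://cdn.example.com/a.png")  # -> 'https://cdn.example.com/a.png'
urljoin("https://example.com/page/", "images/a.png")                   # -> 'https://example.com/page/images/a.png'
urljoin("https://example.com/page/", "/images/a.png")                  # -> 'https://example.com/images/a.png'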
Upvotes: 14
Reputation: 2706
And this is a function for downloading a single image:
import urllib.request

def download_photo(self, img_url, filename):
    # DOWNLOADED_IMAGE_PATH is assumed to be a constant defined elsewhere.
    file_path = "%s%s" % (DOWNLOADED_IMAGE_PATH, filename)
    downloaded_image = open(file_path, "wb")
    image_on_web = urllib.request.urlopen(img_url)
    while True:
        buf = image_on_web.read(65536)  # read in 64 KB chunks
        if len(buf) == 0:
            break
        downloaded_image.write(buf)
    downloaded_image.close()
    image_on_web.close()
    return file_path
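For comparison, the same chunked download written with requests; a sketch, and the function name is made up:

import requests

def download_photo_requests(img_url, file_path):
    # Stream the response body in 64 KB chunks instead of loading it whole.
    with requests.get(img_url, stream=True) as resp:
        resp.raise_for_status()
        with open(file_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=65536):
                f.write(chunk)
    return file_path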
Upvotes: 9
Reputation: 127467
Use htmllib to extract all img tags (override do_img), then use urllib2 to download all the images.
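Note that htmllib and urllib2 are Python 2 modules; in Python 3 the same idea uses html.parser and urllib.request. A minimal sketch (page_url is a placeholder):

from html.parser import HTMLParser
from urllib.request import urlopen, urlretrieve
from urllib.parse import urljoin

class ImgParser(HTMLParser):
    # Collects the src of every img tag (the stand-in for overriding do_img).
    def __init__(self):
        super().__init__()
        self.srcs = []
    def handle_starttag(self, tag, attrs):
        if tag == "img":
            src = dict(attrs).get("src")
            if src:
                self.srcs.append(src)

page_url = "https://example.com/"  # placeholder
parser = ImgParser()
parser.feed(urlopen(page_url).read().decode("utf-8", "replace"))
for n, src in enumerate(parser.srcs):
    urlretrieve(urljoin(page_url, src), "img_%d" % n)  # naive filenames, for the sketch only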
Upvotes: 3
Reputation: 2622
You have to download the page and parse the HTML document, find your images with a regex, and download them. You can use urllib2 for downloading and Beautiful Soup for parsing the HTML file.
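In Python 3 terms (urllib.request instead of urllib2), a minimal sketch of the regex route, which is fragile against real-world HTML:

import re
from urllib.request import urlopen
from urllib.parse import urljoin

page_url = "https://example.com/"  # placeholder
html = urlopen(page_url).read().decode("utf-8", "replace")
# Naive pattern: grab the src attribute of each img tag.
srcs = re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', html)
image_urls = [urljoin(page_url, s) for s in srcs]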
Upvotes: 9