danesh karimi
danesh karimi

Reputation: 35

python how download all file image,css, js from web url?

How can I list all file of a URL website in Python and download them in folder?

import uuid
rnd_str = uuid.uuid4().hex  
main_name="download_"+rnd_str
main_folder=main_name+"/"
dir = main_folder
if os.path.exists(dir):
    shutil.rmtree(dir)
os.mkdir(main_folder)

site = 'http://digipaz.com/'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')

urls = [img['src'] for img in img_tags]
for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png|css))$', url)
    with open( '/home/danesh20016/public_html/ts/'+main_folder+filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative 
            # if it is provide the base url which also happens 
            # to be the site variable atm. 
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)

i can download image but not file js,css

Upvotes: 0

Views: 2394

Answers (1)

sirmagid
sirmagid

Reputation: 1130

from bs4 import BeautifulSoup
from pip._vendor import requests
import os
import shutil
import re
from bs4 import BeautifulSoup
import uuid

rnd_str = uuid.uuid4().hex
main_name="download_"+rnd_str
main_folder= main_name+"/"
dir = main_folder

if os.path.exists(dir):
    shutil.rmtree(dir)
os.mkdir(main_folder)


site = 'http://digipaz.com/'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
#find all jpg,png,gif
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
#print (urls)
for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
   #with open( '/home/danesh20016/public_html/ts/'+main_folder+filename.group(1), 'wb') as f:
    print (url)
    with open( '/home/danesh20016/public_html/ts/'+main_folder+filename.group(1), 'wb') as f:
    #with open(main_folder+filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)

 #find all css  
 for link in soup.findAll('link', href=True):
    #print ("Found the URL:", link['href'])
    if re.search(".css", link['href']):
       print (link['href'])
       with open('/home/danesh20016/public_html/ts/'+main_folder+ filename.group(1), 'wb') as f:
           # with open(main_folder+filename.group(1), 'wb') as f:
           if 'http' not in url:
               # sometimes an image source can be relative
               # if it is provide the base url which also happens
               # to be the site variable atm.
               url = '{}{}'.format(site, link['href'])
           response = requests.get(url)
           f.write(response.content)
#find all js
link_js = [sc["src"] for sc in soup.find_all("script",src=True)]
for link in link_js:
    print ("Found the URL:", link)
    with open('/home/danesh20016/public_html/ts/'+main_folder+ filename.group(1), 'wb') as f:
        # with open(main_folder+filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, link)
        response = requests.get(url)
        f.write(response.content)

Upvotes: 2

Related Questions