Reputation: 55
Recently, I have been trying to learn how to web scrape in order to download all the images from my school's staff directory. However, the page does not store the images under img tags; instead they are ALL set as CSS background images, like this: background-image: url("/common/pages/GalleryPhoto.aspx?photoId=323070&width=180&height=180");
Is there any way to get past this? I've never seen .aspx before.
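As background, GalleryPhoto.aspx is not a special image format; it is a server-side ASP.NET handler that returns the image bytes, and the URL inside url(...) is relative to the site root. A minimal sketch of fetching a single photo directly, assuming the handler serves the image without authentication:

import requests

# full address = site root + the relative URL from the background-image value above
img_url = 'https://jhs.lsc.k12.in.us/common/pages/GalleryPhoto.aspx?photoId=323070&width=180&height=180'
res = requests.get(img_url)
res.raise_for_status()
with open('photo_323070.jpg', 'wb') as f:
    f.write(res.content)

The harder part is collecting those relative URLs in the first place, which is what the answer below addresses.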
Here is the code that I am currently using to download images from websites.
import os, requests, bs4, webbrowser, random

url = 'https://jhs.lsc.k12.in.us/staff_directory'
res = requests.get(url)
try:
    res.raise_for_status()
except Exception as exc:
    print('Sorry an error occurred:', exc)
soup = bs4.BeautifulSoup(res.text, 'html.parser')
element = soup.select('background-image')
for i in range(len(element)):
    url = element[i].get('img')
    name = random.randrange(1, 25)
    file = open(str(name) + '.jpg', 'wb')
    res = requests.get(url)
    for chunk in res.iter_content(10000):
        file.write(chunk)
    file.close()
print('done')
Upvotes: 0
Views: 646
Reputation: 44203
I will get you well on your way. You cannot easily do this just with requests, because the site's content is loaded after the main page by JavaScript executing AJAX calls that update the DOM. You really need a tool such as [Selenium](https://www.selenium.dev/downloads/) that drives a web browser such as Chrome (or your choice of browser), where you can wait for the elements you are looking for to appear. Even then it is not that straightforward. First, the photos are spread across multiple pages. The code below only processes the first page; you would then need to drive the browser to click on areas of the page to force the next page of photos to load.
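For illustration, "waiting for the elements to appear" can also be done with an explicit wait rather than the implicit wait the full code below uses; a minimal sketch, where the relative XPath is my assumption about the page's markup (simpler than the absolute XPath used below):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://jhs.lsc.k12.in.us/staff_directory')
# block for up to 10 seconds until at least one <span> carrying a
# background-image style is present in the DOM, then return them all
spans = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, '//span[contains(@style, "background-image")]')))
driver.quit()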
Once the page content has loaded, I use Selenium's ability to find elements by XPath to retrieve all the <span> elements whose style attribute contains background-image: url("some-value"). From each of these elements I retrieve the style attribute and look within it for the background-image URL specification, which could be an empty string. I then use a thread pool to retrieve all the URLs concurrently through a single requests Session object, which is more efficient than opening a new connection per request. BeautifulSoup is not used at all. Finally, I do not use a randrange call to generate the file names, since there could be a collision and you wouldn't want to attempt to store two images under the same name (and certainly not concurrently).
from selenium import webdriver
import requests
from concurrent.futures import ThreadPoolExecutor
import re

url = 'https://jhs.lsc.k12.in.us/staff_directory'

def download_image(s, i, this_url):
    res = s.get('https://jhs.lsc.k12.in.us' + this_url)
    name = f'out_{i}.jpg'  # don't use randrange because of possibility of duplicates
    with open(name, 'wb') as file:
        for chunk in res.iter_content(10000):
            file.write(chunk)

def main():
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(options=options)
    try:
        driver.implicitly_wait(3)
        driver.get(url)
        elements = driver.find_elements_by_xpath('/html/body/form/div[4]/div[2]/div[5]/div[2]/div[2]/div[2]/div/div[2]/div/div[3]/div[2]/div/div/div[2]/div[2]/div[1]/ul/li/div[1]/div[1]/span')
        cnt = len(elements)
        if cnt == 0:
            print('No images found.')
            return
        with requests.Session() as s:
            MAX_THREADS = 100  # some suitable maximum for your configuration
            N_WORKERS = min(cnt, MAX_THREADS)
            with ThreadPoolExecutor(max_workers=N_WORKERS) as executor:
                futures = []
                for i in range(cnt):
                    style = elements[i].get_attribute('style')
                    m = re.search(r'background-image: url\("(.*?)"\)', style)
                    if m:
                        this_url = m[1]
                        if this_url != '':
                            futures.append(executor.submit(download_image, s, i, this_url))
                # wait for each task to complete:
                for future in futures:
                    try:
                        future.result()
                    except Exception as e:
                        print('task completed with exception:', e)
        print('done')
    except Exception as e:
        print(e)
    finally:
        driver.quit()

main()
Version That Doesn't Use a Thread Pool
from selenium import webdriver
import requests
import re

url = 'https://jhs.lsc.k12.in.us/staff_directory'

def download_image(s, i, this_url):
    res = s.get('https://jhs.lsc.k12.in.us' + this_url)
    name = f'out_{i}.jpg'  # don't use randrange because of possibility of duplicates
    with open(name, 'wb') as file:
        for chunk in res.iter_content(10000):
            file.write(chunk)

def main():
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(options=options)
    try:
        driver.implicitly_wait(3)
        driver.get(url)
        elements = driver.find_elements_by_xpath('/html/body/form/div[4]/div[2]/div[5]/div[2]/div[2]/div[2]/div/div[2]/div/div[3]/div[2]/div/div/div[2]/div[2]/div[1]/ul/li/div[1]/div[1]/span')
        cnt = len(elements)
        if cnt == 0:
            print('No images found.')
            return
        with requests.Session() as s:
            for i in range(cnt):
                style = elements[i].get_attribute('style')
                m = re.search(r'background-image: url\("(.*?)"\)', style)
                if m:
                    this_url = m[1]
                    if this_url != '':
                        try:
                            download_image(s, i, this_url)
                        except Exception as e:
                            print('call completed with exception:', e)
        print('done')
    except Exception as e:
        print(e)
    finally:
        driver.quit()

main()
Update to Download 6 Pages of Images (or All 120 Pages)
from selenium import webdriver
import requests
from concurrent.futures import ThreadPoolExecutor
import re

def download_image(s, file_no, this_url):
    res = s.get('https://jhs.lsc.k12.in.us' + this_url)
    name = f'out_{file_no}.jpg'  # don't use randrange because of possibility of duplicates
    with open(name, 'wb') as file:
        for chunk in res.iter_content(50000):  # use a larger chunk size
            file.write(chunk)

file_no = 0

def download_page_images(s, executor, elements):
    global file_no
    futures = []
    for element in elements:
        style = element.get_attribute('style')
        m = re.search(r'background-image: url\("(.*?)"\)', style)
        if m:
            this_url = m[1]
            if this_url != '':
                file_no += 1
                futures.append(executor.submit(download_image, s, file_no, this_url))
    # wait for each task to complete:
    for future in futures:
        try:
            future.result()
        except Exception as e:
            print('task completed with exception:', e)

def main():
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(options=options)
    try:
        driver.implicitly_wait(3)
        # first page:
        driver.get('https://jhs.lsc.k12.in.us/staff_directory')
        elements = driver.find_elements_by_xpath('/html/body/form/div[4]/div[2]/div[5]/div[2]/div[2]/div[2]/div/div[2]/div/div[3]/div[2]/div/div/div[2]/div[2]/div[1]/ul/li/div[1]/div[1]/span')
        cnt = len(elements)
        if cnt == 0:
            print('No images found.')
            return
        with requests.Session() as s:
            MAX_THREADS = 100  # some suitable maximum for your configuration
            N_WORKERS = min(cnt, MAX_THREADS)
            with ThreadPoolExecutor(max_workers=N_WORKERS) as executor:
                download_page_images(s, executor, elements)  # first page elements
                #while True:  # uncomment to download all the remaining pages, or ...
                for _ in range(5):  # ... download up to just 5 more pages
                    # look for the right arrow (more):
                    elements = driver.find_elements_by_class_name('fa-arrow-right')
                    if not elements:  # this was the last page
                        break
                    elements[0].click()  # there is really only one right arrow
                    # look for the next set of pictures on the next page:
                    elements = driver.find_elements_by_xpath('/html/body/form/div[4]/div[2]/div[5]/div[2]/div[2]/div[2]/div/div[2]/div/div[3]/div[2]/div/div/div[2]/div[2]/div[1]/ul/li/div[1]/div[1]/span')
                    assert elements
                    download_page_images(s, executor, elements)
        print('done')
    except Exception as e:
        print(e)
    finally:
        driver.quit()

main()
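One caveat: the find_elements_by_xpath and find_elements_by_class_name helpers used above were deprecated in Selenium 4 and removed in later 4.x releases. With a recent Selenium, the same lookups would be written with a By locator instead:

from selenium.webdriver.common.by import By

# Selenium 4.x equivalents of the element lookups used above;
# driver is the webdriver.Chrome instance and the XPath/class name
# are the same ones the code above uses
elements = driver.find_elements(By.XPATH, '/html/body/form/div[4]/div[2]/div[5]/div[2]/div[2]/div[2]/div/div[2]/div/div[3]/div[2]/div/div/div[2]/div[2]/div[1]/ul/li/div[1]/div[1]/span')
arrows = driver.find_elements(By.CLASS_NAME, 'fa-arrow-right')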
Upvotes: 2
Reputation: 11
A simple way to get the images without any workaround is to open the website in Firefox and go to Tools > Page Info > Media.
Under the Media section you will see the full path of every media item, including the photos loaded by script.
Copy the full address of an image and paste it into the browser to save it to your preferred location.
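If you end up copying many addresses this way, pasting each one into the browser gets tedious; a minimal sketch of downloading a list of copied addresses with requests instead, assuming the URLs are reachable without logging in:

import requests

# hypothetical list of addresses copied from Page Info > Media
urls = [
    'https://jhs.lsc.k12.in.us/common/pages/GalleryPhoto.aspx?photoId=323070&width=180&height=180',
    # ... more copied addresses
]
for i, url in enumerate(urls):
    res = requests.get(url)
    res.raise_for_status()
    with open(f'photo_{i}.jpg', 'wb') as f:
        f.write(res.content)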
Upvotes: 0