Reputation: 1193
I'm using the code from this GitHub Repo to scrape some images from Google Image Search results. However, there is one try-except
block where my code raises this error:
selenium.common.exceptions.TimeoutException
D:\Downloads\google_images_downloader-master\google_images_downloader-master>[19680:952:0326/213123.951:ERROR:ssl_client_socket_impl.cc(941)] handshake failed; returned -1, SSL error code 1, net_error -100
The following is the code block:
# Scroll the results page in bursts, then click "Show more results" so
# Google lazily loads the next batch of thumbnails.
number_of_scrolls = int((num_requested / 400) + 10)
for _ in range(number_of_scrolls):
    for __ in range(10):
        driver.execute_script("window.scrollBy(0, 1000000)")
        time.sleep(0.2)
    time.sleep(0.5)
    try:
        waiter = WebDriverWait(driver, 10)
        more_button = waiter.until(EC.element_to_be_clickable((By.XPATH, "//input[@value='Show more results']")))
        more_button.click()
    except Exception as e:
        print("Less images found:", e)
        break
P.S. I'm using Chrome as my webdriver
Upvotes: 0
Views: 413
Reputation: 4177
Are you sure you are facing this issue? I have copied the same code from the link you provided and it's working fine for me. I have only made a couple of changes.
I was not able to search for Więcej wyników on https://www.google.com/search?q=Poecile%20montanus%20bird&source=lnms&tbm=isch
— I got SyntaxError: Non-ASCII character '\xc4' for driver.find_element_by_xpath("//input[@value='Więcej wyników']").click()
because of the non-ASCII characters. To avoid this error I changed the XPath to driver.find_element_by_xpath("//input[@value='W']").click()
and because of this we are getting ('Less images found:', NoSuchElementException())
. Please check the screenshot for more details.
from selenium import webdriver
import os
import json
import urllib3
import time
import shutil
# Search queries, one per bird species; uncomment an entry to scrape it.
searched_test_array = [
# "Parus major bird",
"Poecile montanus bird",
# "Carduelis flammea bird",
# "Parus cristatus bird",
# "Carduelis spinus bird",
# "Turdus iliacus bird",
# "Dryocopus martius bird",
# "Dendrocopos major bird",
# "Picus canus bird",
# "Picus viridis bird",
# "Dendrocopos medius bird",
# "Dendrocopos minor bird", #NOK
# "Carduelis chloris bird",
# "Pyrrhula pyrrhula bird",
# "Columba livia bird",
"Coccothraustes coccothraustes bird",
# "Carduelis cannabina bird",
# "Passer montanus bird",
# "Larus canus bird",
# "Larus argentatus bird",
"Parus caeruleus bird",
# "Regulus regulus bird",
# "Buteo buteo bird",
# "Certhia familiaris bird",
# "Certhia brachydactyla bird",
# "Emberiza calandra bird bird",
# "Corvus frugilegus bird",
# "Accipiter gentilis bird",
# "Bombycilla garrulus bird",
# "Fringilla montifringilla bird",
# "Corvus monedula bird",
# "Turdus merula bird",
# "Sitta europaea bird",
# "Accipiter nisus bird",
# "Corvus corax bird",
# "Turdus pilaris bird",
# "Emberiza schoeniclus bird",
# "Aegithalos caudatus bird",
# "Erithacus rubicola bird",
# "Carduelis flavirostris bird",
# "Streptopelia decaocto bird",
# "Parus palustris bird",
# "Parus ater bird",
# "Pica pica bird",
# "Lanius excubitor bird",
# "Troglodytes troglodytes bird",
"Carduelis carduelis bird",
# "Sturnus vulgaris bird",
# "Garrulus glandarius bird",
# "Emberiza citrinella bird",
# "Corvus corone bird",
# "Passer domesticus bird",
# "Panurus biarmicus bird",
# "Fringilla coelebs bird",
# "Larus ridibundus bird"
]
# Maximum number of images to download per search term.
num_requested = 1000
# Add the current working directory to PATH so the webdriver binary can be found.
os.environ["PATH"] += os.pathsep + os.getcwd()
# Images are saved beneath ./Downloads, one sub-directory per search term.
download_path = os.getcwd() + "/Downloads"
def main():
    """Scrape Google Images for every query in searched_test_array.

    For each search term: open Chrome, scroll the results page to force more
    thumbnails to load, click the "more results" button, then download every
    image URL found in the rg_meta JSON blobs into a per-term directory under
    download_path. A term stops once num_requested images have been saved.
    """
    print("Scrapping started")
    # Create the download directory if it does not exist yet.
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # One shared connection pool is enough for all downloads; the original
    # built a new PoolManager per image inside the loop.
    http = urllib3.PoolManager()
    # Iterate over the search terms.
    for searchtext in searched_test_array:
        # Each search term gets its own sub-directory (spaces -> underscores).
        searchedTextDir = os.path.join(download_path, searchtext.replace(" ", "_"))
        if not os.path.exists(searchedTextDir):
            os.makedirs(searchedTextDir)
        # Build the Google Images search URL for this term.
        url = "https://www.google.com/search?q=" + searchtext + "&source=lnms&tbm=isch"
        # Start Chrome.
        driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
        driver.get(url)
        extensions = {"jpg", "jpeg", "png", "gif"}
        img_count = 0
        downloaded_img_count = 0
        # Rough heuristic: each scroll batch makes Google load ~400 thumbnails.
        number_of_scrolls = int((num_requested / 400) + 10)
        for _ in range(number_of_scrolls):
            for __ in range(10):
                # Scroll hard so Google's JSON lazily loads more images.
                driver.execute_script("window.scrollBy(0, 1000000)")
                time.sleep(0.2)
            # Give the next batch of images time to load.
            time.sleep(0.5)
            try:
                # "More results" button. For the English UI the XPath is
                # //input[@value='Show more results']; the value is truncated
                # to 'W' here to avoid a non-ASCII literal (Polish UI).
                driver.find_element_by_xpath("//input[@value='W']").click()
            except Exception as e:
                print("Less images found:", e)
                break
        # Every thumbnail's metadata lives in a div.rg_meta JSON blob.
        imges = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
        print("Total images:", len(imges), "\n")
        # Iterate over the found metadata blobs.
        for img in imges:
            img_count += 1
            # Parse the metadata once (the original parsed it twice):
            # "ou" = original image URL, "ity" = image type/extension.
            meta = json.loads(img.get_attribute('innerHTML'))
            img_url = meta["ou"]
            img_type = meta["ity"]
            print("Downloading image", img_count, ": ", img_url, img_type)
            try:
                # Fall back to jpg for unknown or missing extensions.
                if img_type not in extensions:
                    img_type = "jpg"
                # Fetch the image. Don't forget about the timeout!
                response = http.request('GET', img_url, timeout=2)
                # "with" guarantees the handle is closed; the original wrote
                # "f.close" without parentheses, so it never actually closed.
                with open(searchedTextDir + "/" + str(downloaded_img_count) + "." + img_type, "wb") as f:
                    f.write(response.data)
                downloaded_img_count += 1
            except Exception as e:
                print("Download failed:", e)
            # Leftover Python-2 "finally: print" no-op removed.
            if downloaded_img_count >= num_requested:
                break
        print("Total downloaded: ", downloaded_img_count, "/", img_count)
        driver.quit()
        time.sleep(0.5)
    print("Scrapping done")

if __name__ == "__main__":
    main()
To click on the Show more results button,
# Alternative scroll loop: click the "Show more results" button via its CSS
# class (mye4qd) instead of its label, which sidesteps the non-ASCII
# @value problem entirely.
for _ in range(number_of_scrolls):
    for __ in range(10):
        # Keep scrolling so the JSON-backed thumbnails load.
        driver.execute_script("window.scrollBy(0, 1000000)")
        time.sleep(0.2)
    # Let the next batch of images load.
    time.sleep(0.5)
    driver.find_element_by_xpath("//input[@class='mye4qd']").click()
    time.sleep(5)
Upvotes: 1
Reputation: 31
Rather than catching the overly broad Exception class with:
except Exception as e:
You should use:
except TimeoutException:
And also you'll have to import that exception from Selenium:
from selenium.common.exceptions import TimeoutException
Upvotes: 0