Reputation: 109
I'm trying to scrape Google Images to download some pictures (around 100, maybe) for each name in a list (liste_name). But my code only returns 20 images, and I can't figure out why.
Here is my code:
import os
import requests
from bs4 import BeautifulSoup
# Download every image Google Images returns for each search term and store
# it under ./Dataset/Trainset/<term_with_underscores>/.
liste_name = ['blood orange', 'apple golden']

# Upper bound on download attempts per image, so a host with a permanently
# broken certificate cannot stall the script forever.
max_attempts = 3

for name in liste_name:
    # Folder name / file prefix: the words of the search term joined by "_".
    # Works for any number of words, not only two.
    full_name = "_".join(name.split())
    path = "./Dataset/Trainset/" + full_name + "/"

    # Let requests build the query string: it percent-encodes the spaces in
    # the search term and puts "&" between every pair of parameters.
    params = {
        "site": "webhp",
        "tbm": "isch",
        "source": "hp",
        "q": name,
        "oq": name,
        "biw": 1280,
        "bih": 579,
        "num": 100,
    }
    response = requests.get("https://www.google.ch/search", params=params, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    list_res_image = soup.find_all("img")

    os.makedirs(path, exist_ok=True)

    for index, lien in enumerate(list_res_image):
        link = lien.get('src')
        # Skip <img> tags that cannot be fetched directly: no src attribute,
        # a relative path, or an inline "data:" URI.
        if not link or not link.startswith(("http://", "https://")):
            continue

        img = None
        for _ in range(max_attempts):
            try:
                img = requests.get(link, timeout=30).content
                break
            except requests.exceptions.SSLError:
                # Transient TLS failure: try again, up to max_attempts times.
                continue
        if img is None:
            # Every attempt failed; move on to the next image.
            continue

        with open(path + full_name + str(index) + ".png", "wb") as f:
            f.write(img)
Upvotes: 0
Views: 660
Reputation: 99
You can use selenium
or playwright
to get absolutely all images. Also, you can use the "ijn" URL parameter,
which defines the page number to fetch: 0 is the first page, 1 is the second, and so on. The value must be greater than or equal to 0.
However, we can also do this with BeautifulSoup
, using regular expressions, from inline JSON.
Instead of hard-coding the full URL for one specific search, you can pass the query parameters as a dict,
which is easy to change for subsequent searches:
# Query-string parameters copied from a real Google search URL and laid out
# as a dict so they are easier to read and edit.
params = {
    # search query
    "q": "blood orange",
    # image results
    "tbm": "isch",
    # language of the search
    "hl": "en",
    # country where the search comes from
    "gl": "us",
}
With the help of regular expressions, we gradually filter the inline JSON down to the image results:
# https://regex101.com/r/kyLU8S/1
# Concatenate the argument of every AF_initDataCallback(...) call found in
# the stringified script tags. The result is already a plain str, so no JSON
# round trip is needed before searching it.
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

# https://regex101.com/r/GbVLOq/1
# Keep only the part between "b-GRID_STATE0" and the trailing sideChannel:{}}.
matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data)

# https://regex101.com/r/LzhCYM/1
# One pattern for the thumbnail entries, used both to extract and to remove them.
thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'

# findall already returns a list of URLs (an empty list when nothing matches).
matched_google_images_thumbnails = re.findall(thumbnail_pattern, str(matched_google_image_data))

# The URLs were matched inside str(<list>), whose repr escaping adds a second
# layer of backslashes on top of the escapes presumably present in the page
# data - hence two unicode-escape decoding passes.
thumbnails = [
    bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
    for thumbnail in matched_google_images_thumbnails
]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, "", str(matched_google_image_data))

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)
Full code and example in the online IDE
import requests, re, json, lxml
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with every search request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}

queries = ['blood orange', 'apple golden']

# One pattern for the thumbnail entries, used both to extract and to remove them.
# https://regex101.com/r/Jt5BJW/1
thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'


def _decode_escapes(raw_url):
    """Return *raw_url* with two layers of backslash escapes decoded.

    The URLs are matched inside ``str(<list>)``, whose repr escaping adds a
    second layer of backslashes, so ``unicode-escape`` is applied twice.
    """
    once = bytes(raw_url, "ascii").decode("unicode-escape")
    return bytes(once, "ascii").decode("unicode-escape")


# Collected results for all queries, one dict per image.
google_images = []

for query in queries:
    print(f'Extracting images for query: {query}')

    params = {
        "q": query,     # search query
        "tbm": "isch",  # image results
        "hl": "en",     # language of the search
        "gl": "us",     # country where search comes from
    }

    html = requests.get("https://google.com/search", params=params, headers=headers, timeout=30)
    if not html.ok:
        # An error page contains no image data; report it and try the next query.
        print(f'Request for query {query!r} failed with status {html.status_code}')
        continue

    soup = BeautifulSoup(html.text, "lxml")
    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    # Concatenate the argument of every AF_initDataCallback(...) call; the
    # result is already a plain str, so it can be searched directly.
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://regex101.com/r/VPz7f2/1
    matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data)
    image_data_text = str(matched_google_image_data)

    # findall already returns a list of URLs (an empty list when nothing matches).
    thumbnails = [_decode_escapes(url) for url in re.findall(thumbnail_pattern, image_data_text)]

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, "", image_data_text)

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)
    full_res_images = [_decode_escapes(url) for url in matched_google_full_resolution_images]

    # Pair each result card with its thumbnail and full-resolution URL, in page order.
    for metadata, thumbnail, original in zip(soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images):
        link_tag = metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")
        source_tag = metadata.select_one(".fxgdke")
        if link_tag is None or source_tag is None:
            # Card without the expected markup: skip it instead of crashing.
            continue
        google_images.append({
            "title": link_tag.get("title"),
            "link": link_tag.get("href"),
            "source": source_tag.text,
            "thumbnail": thumbnail,
            "original": original,
        })

print(json.dumps(google_images, indent=2, ensure_ascii=False))
Example output
[
{
"title": "Glazed Blood Orange and Lavender Loaf | Olive & Mango",
"link": "https://www.oliveandmango.com/glazed-blood-orange-and-lavender-loaf",
"source": "oliveandmango.com",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTT_92Ydz2t-niZ8bF7tExYqVSYzeLldzXQjg&usqp=CAU",
"original": "https://d33wubrfki0l68.cloudfront.net/09a0f8357a7f0d667b7b20537b74886649cc35cc/9bb85/images/uploads/2019_02_09_glazed_blood_orange_and_lavender_loaf_3.jpg"
},
{
"title": "Blood Orange Gin & Tonic – A Couple Cooks",
"link": "https://www.acouplecooks.com/blood-orange-cocktail/",
"source": "acouplecooks.com",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcScgVolM0b-ilf63TlcTSJTSpkV_3HX9iQh5Q&usqp=CAU",
"original": "https://www.acouplecooks.com/wp-content/uploads/2021/01/Blood-Orange-Cocktail-001.jpg"
},
{
"title": "Fresh Golden Delicious Apples - Shop Fruit at H-E-B",
"link": "https://www.heb.com/product-detail/fresh-golden-delicious-apples/377503",
"source": "heb.com",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT1y_FmZ56YcN6NeVyzT-TKHh54HgtByvSSFpcxVIBRjYB-l9HDaE_rMDrmKlI6IcvfTZs&usqp=CAU",
"original": "https://images.heb.com/is/image/HEBGrocery/000377503"
},
{
"title": "Golden Delicious - Wikipedia",
"link": "https://en.wikipedia.org/wiki/Golden_Delicious",
"source": "en.wikipedia.org",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSdSLrBTzdhPzJp-AbZftn8iTm-6OR_PFLSmqJqiZyfjsPGMB6lryZdb8tF3rYiwxmTJC0&usqp=CAU",
"original": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/Golden_Delicious_apples.jpg/1200px-Golden_Delicious_apples.jpg"
},
# ...
]
Alternatively, you can use the Google Images API from SerpApi. It's a paid API with a free plan. The difference is that it bypasses blocks (including CAPTCHA) from Google, so there's no need to create and maintain the parser yourself.
Example to integrate:
from serpapi import GoogleSearch
import os, json
# Collect the "original" URL of every image result, across all result pages
# of every query, without duplicates and in the order first seen.
image_results = []
seen_originals = set()  # same URLs as image_results, for O(1) duplicate checks

queries = ['blood orange', 'apple golden']

for query in queries:
    print(f'extracting images for query: {query}')

    # search query parameters
    params = {
        "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
        "q": query,                       # search query
        "tbm": "isch",                    # image results
        "num": "100",                     # number of images per page
        "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
        "api_key": os.getenv("API_KEY")   # your serpapi api key
        # other query parameters: hl (lang), gl (country), etc
    }

    search = GoogleSearch(params)         # where data extraction happens

    while True:
        results = search.get_dict()       # JSON -> Python dictionary

        # checks for "Google hasn't returned any results for this query."
        if "error" in results:
            print(results["error"])
            break

        page_images = results.get("images_results")
        if not page_images:
            # No error but also no images on this page: nothing left to fetch.
            break

        for image in page_images:
            original = image["original"]
            if original not in seen_originals:
                seen_originals.add(original)
                image_results.append(original)

        # update to the next page
        params["ijn"] += 1

print(json.dumps(image_results, indent=2))
Output:
[
"https://www.researchgate.net/publication/340952507/figure/fig1/AS:885003558846464@1588012699713/Apple-varieties-Red-Delicious-Granny-Smith-Golden-Delicious-respectively-Sekil-1.jpg",
"https://goodfruitguide.co.uk/wp-content/uploads/Apple-Golden-Delicious-ZA-DSC_0021-cr-sq-300x300.jpg",
"http://newenglandapples.files.wordpress.com/2011/12/img_6239.jpg",
"https://i5.peapod.com/c/IY/IY47G.png",
"https://cdn.shopify.com/s/files/1/1251/5173/products/goldendelicious_1024x1024.jpeg?v=1572074514",
"https://www.gannett-cdn.com/-mm-/a5076e7a43a0cec6129489319d0fb728e2cd1814/c=0-264-5184-3193/local/-/media/2018/01/03/Phoenix/Phoenix/636505888078540454-opal-apples-8.JPG?width=660&height=373&fit=crop&format=pjpg&auto=webp",
"https://cdn.shopify.com/s/files/1/0250/1384/6115/products/golden-reinette-apple-tree_800x.JPG?v=1565650598",
"https://blogchef.net/wp-content/uploads/2022/04/golden-delicious-juicy-ripe-fresh-yellow-apples-brown-wooden-background-side-view-scaled.jpg",
# ...
]
There's a Scrape and download Google Images with Python blog post if you need a little bit more code explanation.
Disclaimer, I work for SerpApi.
Upvotes: 0
Reputation: 10314
Google limits you to a maximum of 20 images.
see here for more details
Upvotes: 1