agreüs
agreüs

Reputation: 109

How to get 100 images from Google Images

I'm trying to scrape Google to get some pictures (maybe around 100) for each name in a list (liste_name), but my code returns only 20 and I can't figure out why.

Here is my code:

import os
import requests
from bs4 import BeautifulSoup

liste_name = ['blood orange', 'apple golden']

# Upper bound on download attempts per image, so a persistent SSL failure
# cannot hang the script forever.
MAX_RETRIES = 3

for name in liste_name:
    # Folder / file prefix: the words of the name joined with "_".
    # Computed for every name, so single-word names get a path as well.
    full_name = "_".join(name.split())
    path = "./Dataset/Trainset/" + full_name + "/"

    # Let requests build and URL-encode the query string: spaces in the name
    # are escaped correctly and every parameter is separated by "&".
    params = {
        "site": "webhp",
        "tbm": "isch",
        "source": "hp",
        "q": name,
        "oq": name,
        "biw": 1280,
        "bih": 579,
        "num": 100,
    }
    response = requests.get("https://www.google.ch/search", params=params)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Only the <img> tags present in the returned static HTML are found here.
    list_res_image = soup.find_all("img")

    os.makedirs(path, exist_ok=True)
    for index, lien in enumerate(list_res_image):
        link = lien.get('src')
        # Skip tags without a src, and src values that are not absolute
        # http(s) URLs (requests cannot fetch those).
        if not link or not link.startswith("http"):
            continue

        img = None
        for _ in range(MAX_RETRIES):
            try:
                img = requests.get(link, timeout=30).content
                break
            except requests.exceptions.SSLError:
                continue
        if img is None:
            # Every attempt failed with an SSL error: give up on this image.
            continue

        with open(path + full_name + str(index) + ".png", "wb") as f:
            f.write(img)

Upvotes: 0

Views: 660

Answers (2)

Denis Skopa
Denis Skopa

Reputation: 99

You can use Selenium or Playwright to get absolutely all images. You can also use the "ijn" URL parameter, which defines the page number to fetch: 0 is the first page, 1 is the second, and so on. The value must be greater than or equal to 0.

However, we can also do this with BeautifulSoup, using regular expressions, from inline JSON.

So that you don't have to hard-code a specific search URL, you can put the query parameters in a dictionary, which is easy to change for subsequent searches:

# These URL parameters are taken from an actual Google search URL
# and rewritten in a more readable form.
params = dict(
    q="blood orange",  # search query
    tbm="isch",        # image results
    hl="en",           # language of the search
    gl="us",           # country where the search comes from
)

Using regular expressions, we gradually narrow down the inline JSON to find the image results:

# https://regex101.com/r/kyLU8S/1
# Concatenate the argument text of every AF_initDataCallback(...) call found
# in the page's <script> tags.
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

# https://regex101.com/r/GbVLOq/1
# Keep only the part between "b-GRID_STATE0" and the trailing sideChannel: {}}.
matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data)

# str() of a list reprs each element, which doubles any backslashes;
# presumably that is why unicode-escape decoding is applied twice below.
image_data_text = str(matched_google_image_data)

# https://regex101.com/r/LzhCYM/1
# Matches entries of the form ["https://encrypted-tbn0.gstatic.com/images?...",<int>,<int>].
thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'

# findall already returns a list: an empty result stays an empty list
# (no spurious '' entry) and URLs are never split on ", ".
matched_google_images_thumbnails = re.findall(thumbnail_pattern, image_data_text)

thumbnails = [
    bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
    for thumbnail in matched_google_images_thumbnails
]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, "", image_data_text)

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

Full code and example in the online IDE

import requests, re, json, lxml
from bs4 import BeautifulSoup

# A browser-like User-Agent is sent with every search request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}

queries = ['blood orange', 'apple golden']
google_images = []

# https://regex101.com/r/Jt5BJW/1
# Matches entries of the form ["https://encrypted-tbn0.gstatic.com/images?...",<int>,<int>].
thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'

for query in queries:
    print(f'Extracting images for query: {query}')

    params = {
        "q": query,              # search query
        "tbm": "isch",           # image results
        "hl": "en",              # language of the search
        "gl": "us",              # country where search comes from
    }

    html = requests.get("https://google.com/search", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, "lxml")

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    # Concatenate the argument text of every AF_initDataCallback(...) call.
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://regex101.com/r/VPz7f2/1
    # Keep only the part between "b-GRID_STATE0" and the trailing sideChannel: {}}.
    matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data)

    # str() of a list reprs each element, which doubles any backslashes;
    # presumably that is why unicode-escape decoding is applied twice below.
    image_data_text = str(matched_google_image_data)

    # findall already returns a list: an empty result stays an empty list
    # (no spurious '' entry) and URLs are never split on ", ".
    matched_google_images_thumbnails = re.findall(thumbnail_pattern, image_data_text)

    thumbnails = [
        bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
        for thumbnail in matched_google_images_thumbnails
    ]

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, "", image_data_text)

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

    full_res_images = [
        bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
        for img in matched_google_full_resolution_images
    ]

    # The three sequences are paired purely by position; zip stops at the shortest one.
    for metadata, thumbnail, original in zip(soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images):
        link_tag = metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")
        source_tag = metadata.select_one(".fxgdke")
        if link_tag is None or source_tag is None:
            # Result card without the expected markup: skip it instead of crashing.
            continue
        google_images.append({
            "title": link_tag["title"],
            "link": link_tag["href"],
            "source": source_tag.text,
            "thumbnail": thumbnail,
            "original": original
        })

print(json.dumps(google_images, indent=2, ensure_ascii=False))

Example output

[
  
   {
    "title": "Glazed Blood Orange and Lavender Loaf | Olive & Mango",
    "link": "https://www.oliveandmango.com/glazed-blood-orange-and-lavender-loaf",
    "source": "oliveandmango.com",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTT_92Ydz2t-niZ8bF7tExYqVSYzeLldzXQjg&usqp=CAU",
    "original": "https://d33wubrfki0l68.cloudfront.net/09a0f8357a7f0d667b7b20537b74886649cc35cc/9bb85/images/uploads/2019_02_09_glazed_blood_orange_and_lavender_loaf_3.jpg"
  },
  {
    "title": "Blood Orange Gin & Tonic – A Couple Cooks",
    "link": "https://www.acouplecooks.com/blood-orange-cocktail/",
    "source": "acouplecooks.com",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcScgVolM0b-ilf63TlcTSJTSpkV_3HX9iQh5Q&usqp=CAU",
    "original": "https://www.acouplecooks.com/wp-content/uploads/2021/01/Blood-Orange-Cocktail-001.jpg"
  },
  {
    "title": "Fresh Golden Delicious Apples - Shop Fruit at H-E-B",
    "link": "https://www.heb.com/product-detail/fresh-golden-delicious-apples/377503",
    "source": "heb.com",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT1y_FmZ56YcN6NeVyzT-TKHh54HgtByvSSFpcxVIBRjYB-l9HDaE_rMDrmKlI6IcvfTZs&usqp=CAU",
    "original": "https://images.heb.com/is/image/HEBGrocery/000377503"
  },
  {
    "title": "Golden Delicious - Wikipedia",
    "link": "https://en.wikipedia.org/wiki/Golden_Delicious",
    "source": "en.wikipedia.org",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSdSLrBTzdhPzJp-AbZftn8iTm-6OR_PFLSmqJqiZyfjsPGMB6lryZdb8tF3rYiwxmTJC0&usqp=CAU",
    "original": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/Golden_Delicious_apples.jpg/1200px-Golden_Delicious_apples.jpg"
  },
  # ...
]

You can also use the Google Images API from SerpApi. It's a paid API with a free plan. The difference is that it bypasses blocks (including CAPTCHA) from Google, so there is no need to create and maintain the parser yourself.

Example to integrate:

from serpapi import GoogleSearch
import os, json

image_results = []
# Set of URLs already collected: constant-time duplicate check, while
# image_results keeps the original discovery order.
seen_images = set()

queries = ['blood orange', 'apple golden']
for query in queries:
    print(f'extracting images for query: {query}')

    # search query parameters
    params = {
        "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
        "q": query,                       # search query
        "tbm": "isch",                    # image results
        "num": "100",                     # number of images per page
        "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
        "api_key": os.getenv("API_KEY")   # your serpapi api key
                                          # other query parameters: hl (lang), gl (country), etc
    }

    search = GoogleSearch(params)         # where data extraction happens

    while True:
        results = search.get_dict()       # JSON -> Python dictionary

        # checks for "Google hasn't returned any results for this query."
        if "error" in results:
            print(results["error"])
            break

        # A page with no image results and no error also ends the pagination,
        # so the loop cannot spin forever.
        images = results.get("images_results")
        if not images:
            break

        for image in images:
            original = image["original"]
            if original not in seen_images:
                seen_images.add(original)
                image_results.append(original)

        # update to the next page
        # NOTE(review): presumably GoogleSearch keeps a reference to `params`,
        # so this change is picked up by the next get_dict() call - confirm
        # against the serpapi client.
        params["ijn"] += 1

print(json.dumps(image_results, indent=2))

Output:

[
  "https://www.researchgate.net/publication/340952507/figure/fig1/AS:885003558846464@1588012699713/Apple-varieties-Red-Delicious-Granny-Smith-Golden-Delicious-respectively-Sekil-1.jpg",
  "https://goodfruitguide.co.uk/wp-content/uploads/Apple-Golden-Delicious-ZA-DSC_0021-cr-sq-300x300.jpg",
  "http://newenglandapples.files.wordpress.com/2011/12/img_6239.jpg",
  "https://i5.peapod.com/c/IY/IY47G.png",
  "https://cdn.shopify.com/s/files/1/1251/5173/products/goldendelicious_1024x1024.jpeg?v=1572074514",
  "https://www.gannett-cdn.com/-mm-/a5076e7a43a0cec6129489319d0fb728e2cd1814/c=0-264-5184-3193/local/-/media/2018/01/03/Phoenix/Phoenix/636505888078540454-opal-apples-8.JPG?width=660&height=373&fit=crop&format=pjpg&auto=webp",
  "https://cdn.shopify.com/s/files/1/0250/1384/6115/products/golden-reinette-apple-tree_800x.JPG?v=1565650598",
  "https://blogchef.net/wp-content/uploads/2022/04/golden-delicious-juicy-ripe-fresh-yellow-apples-brown-wooden-background-side-view-scaled.jpg",
  # ...
]

There's a Scrape and download Google Images with Python blog post if you need a little bit more code explanation.

Disclaimer, I work for SerpApi.

Upvotes: 0

Tal Avissar
Tal Avissar

Reputation: 10314

The Google API limits you to a maximum of 20 images.

see here for more details

Upvotes: 1

Related Questions