Thomas Haddad

Reputation: 1

How to scrape links off Google Images results with Selenium, Python?

I'm working on a project, and I need to get the links off Google Images results. Here is my code:

from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm
from modules.download_file import download_file
from modules.lsDir import lsDir
import time
from colorama import Fore, Style, init
import os
import base64
from urllib.parse import urlparse


def google_images(driver, name, intAmount):
    for x in lsDir("gg_downloads"):
        os.remove(os.path.join("gg_downloads", x))

    print(f"[*] Opening Google...")
    driver.get(f"https://www.google.com/search?sclient=img&udm=2&q={name}")
    time.sleep(3)

    print(f"[*] Scrolling down...")
    for _ in tqdm(range(intAmount)):
        ActionChains(driver).scroll_by_amount(0, 10000).perform()
        time.sleep(1)

    print(f"[*] Gathering profile photos...")
    image_elements = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc img")
    images = []
    for profile in tqdm(image_elements, desc="Gathering Photos"):
        images.append(profile.get_attribute("src"))
    images = images[::2]

    print(f"[*] Gathering links...")
    links = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div[14]/div/div[2]/div[2]/div/div/div/div/div[1]/div/div/div/div[2]/h3/a") 
    image_links = []
    for link in tqdm(links, desc="Gathering Links"):
        image_links.append(link.get_attribute("href"))
    print(image_links)
    print(f"[*] Downloading profile photos...")
    for i, image_url in tqdm(enumerate(images), desc="Downloading Photos", total=len(images)):
        try:
            if image_url.startswith('data:'):
                # Handle data URL
                header, encoded = image_url.split(",", 1)
                content_type = header.split(":")[1].split(";")[0]
                file_extension = content_type.split("/")[-1]
                
                # Decode base64 content
                image_data = base64.b64decode(encoded)
                
                # Save the file
                with open(f"gg_downloads/{i}.{file_extension}", "wb") as f:
                    f.write(image_data)
            else:
                # Handle regular URL
                parsed_url = urlparse(image_url)
                file_extension = os.path.splitext(parsed_url.path)[1]
                if not file_extension:
                    file_extension = '.jpg'  # Default to .jpg if no extension found
                
                download_file(image_url, f"gg_downloads/{i}{file_extension}")
        except Exception as e:
            print(f"Error downloading image {i}: {str(e)}")

if __name__ == "__main__":
    # Make sure the required packages are installed via pip.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service
    from webdriver_manager.firefox import GeckoDriverManager
    #...
    options = Options()
    if settings.headless:
        options.add_argument('-headless')
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(options=options, service=service)
    google_images(driver, "John Smith", 10)
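
For context, lsDir and download_file are small helpers from my project; minimal sketches, assuming lsDir just lists a directory's entries and download_file streams a URL to disk:

import os
import requests

def lsDir(path):
    # list the entries in a directory (assumed behavior)
    return os.listdir(path)

def download_file(url, dest):
    # stream a URL to a local file (assumed behavior)
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    with open(dest, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)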

When printing image_links, the list is all None. I tried the VS Code debugger, and I also tried CSS selectors, but it's still not working. I am using Firefox for the driver; here is the code I use to initialize it:

# Imports
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
#...
options = Options()
if settings.headless:
    options.add_argument('-headless')
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(options=options, service=service)

Help would be greatly appreciated! Thank you!

Upvotes: -1

Views: 63

Answers (1)

yamskii-k

Reputation: 46

I have found the page behavior that causes the scraped links to come back empty.
On this page, the anchor elements are loaded without any attributes. Once the mouse touches an element, the 'href' attribute is added to it.
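
To see this behavior in isolation, here is a minimal sketch (assuming the first result card is already in view): read the anchor's 'href' before and after hovering with ActionChains.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep

driver = webdriver.Firefox()
driver.get('https://www.google.com/search?sclient=img&udm=2&q=John%20Smith')
sleep(5)

card = driver.find_element(By.CSS_SELECTOR, "div.eA0Zlc")
anchor = card.find_element(By.XPATH, ".//a")
print(anchor.get_attribute('href'))  # None: the attribute is not set yet

ActionChains(driver).move_to_element(card).perform()  # hovering triggers the attribute
sleep(1)
print(anchor.get_attribute('href'))  # now the real URL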

The following test code can get both the links and the 'src' values.
#1 After moving the mouse to a card element, get the 'href' attribute.
#2 When a card below the visible area is reached, scroll the view down.
#3 Since this incremental scrolling also refreshes the 'src' of the img elements, scrolling through the whole page in advance is not necessary.
#4 When the iteration gets near the last card, 99 new cards (if they exist) are loaded. If the card count has increased, we can continue.
#5 For testing convenience, the number of iterations can be limited.

I hope it helps.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep

def link(cards, begin):

    for i, card in enumerate(cards[begin:]):
        print('\n{:0=3}'.format(begin + i + 1))

        for j in range(3):
            try:
                ActionChains(driver).move_to_element(card).perform()  #1
            except Exception:
                print('== scroll ==')
                ActionChains(driver).scroll_by_amount(0, 800).perform()  #2
                sleep(2)
            else:
                break
        sleep(1)
        ancs = card.find_elements(By.XPATH, ".//a")
        url = ancs[0].get_attribute('href')  #1
        if url is None:  # when the element is on the view border
            ActionChains(driver).scroll_by_amount(0, 800).perform()  #2
            sleep(2)
            ActionChains(driver).move_to_element(card).perform()
            print('== scrolled because of "url=None" ==')
            sleep(1)
            ancs = card.find_elements(By.XPATH, ".//a")
            url = ancs[0].get_attribute('href')
        if i % 2 == 0:  # link check sampling
            driver2.get(url)
        source = ancs[1].find_element(By.CSS_SELECTOR, ".ptes9b span").get_attribute('innerHTML')
        title = ancs[1].find_element(By.CSS_SELECTOR, 'div[class^="toI8R"]').get_attribute('innerHTML')
        print('source:', source)
        print('title:', title)
        print('url:', url)
        img_el = card.find_elements(By.XPATH, ".//img")[0]
        src = img_el.get_attribute('src')  #3
        print('\nsrc:', src)

if __name__ == "__main__":

    driver = webdriver.Firefox()
    driver.get('https://www.google.com/search?sclient=img&udm=2&q=John%20Smith')
    sleep(5)
    driver2 = webdriver.Firefox()  # second driver, used to spot-check links

    cards = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc")
    cardsnum = len(cards)
    pre_len = 0
    begin = 0
    end = cardsnum
    i = 0
    while cardsnum > pre_len:  #4
        if i == 0:
            print('\n\n==== Initially loaded cards: {} ====\n\n'.format(cardsnum - pre_len))
        else:
            print('\n\n==== Newly loaded cards: {} ====\n\n'.format(cardsnum - pre_len))
        pre_len = cardsnum
        link(cards, begin)
        cards = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc")
        cardsnum = len(cards)
        begin = end
        end = cardsnum
        i += 1
        if i > 2:  #5 limit iterations for the test
            break
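
As a side note, the fixed sleep calls could be replaced with explicit waits so the script pauses only as long as needed; a minimal sketch using Selenium's WebDriverWait (not part of the test code above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for at least one result card to appear
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.eA0Zlc"))
)
cards = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc")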

Upvotes: 0
