Abhishek Rai
Abhishek Rai

Reputation: 2227

selenium multiple window scrape. Python

I have been trying many things for the last few hours on this. Strangely, when Selenium opens a new window, it switches to it, but instead of scraping data from the new page, it keeps closing the previous windows and scraping data from them. It also keeps opening all the links instead of just the next one. I have hit a wall with this one. Any help is appreciated. Thanks.

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


from urllib.parse import quote

# Start Chrome via the chromedriver binary at a fixed local path.
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
actions = ActionChains(driver)
search_term = input("Enter your search term :")
# Percent-encode the user's text before placing it in the query string;
# otherwise characters such as spaces, '&', '#' or '+' would corrupt or
# truncate the `qs` parameter.
encoded_term = quote(search_term, safe='')
url = f'https://www.sciencedirect.com/search?qs={encoded_term}&years=2021%2C2020%2C2019&lastSelectedFacet=years'
driver.get(url)
driver.maximize_window()

# Wait up to 10 s for the button at this absolute XPath to become clickable,
# then click it.
# NOTE(review): presumably this dismisses an overlay/consent dialog; the
# absolute XPath is brittle and should be confirmed against the live page.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'/html/body/div[3]/div/div/div/button/span'))).click()

# Collect the first anchor inside every search-result container.
divs = driver.find_elements_by_class_name('result-item-content')
links = [div.find_element_by_tag_name('a') for div in divs]


def get_data():
    """Open the current result in a new tab, print its authors, then return.

    Reads the module-level names ``driver`` and ``link`` (the anchor element
    to open). The link is Ctrl+clicked, the newly opened window is selected,
    and for every ``a.author`` inside ``#author-group`` the function prints
    either the e-mail address revealed by the envelope icon or a line stating
    that no mail icon was found. The new window is always closed and focus
    is returned to the originating window, even if scraping fails.

    Raises:
        TimeoutException: if no new window appears within 10 seconds.
        NoSuchElementException: if the page has no ``author-group`` element.
    """
    par_guid = driver.current_window_handle
    # Snapshot the handles *before* the click so the new tab can be told
    # apart from any window that already existed.
    handles_before = driver.window_handles

    # Use a fresh chain per call: an ActionChains object keeps its queued
    # actions after perform() unless reset_actions() is called, so reusing a
    # shared chain would replay every earlier Ctrl+click as well.
    ActionChains(driver).key_down(Keys.CONTROL).click(link).key_up(Keys.CONTROL).perform()

    # The tab opens asynchronously; wait for it, then pick the handle that
    # was not present before the click rather than "any non-parent handle".
    WebDriverWait(driver, 10).until(EC.new_window_is_opened(handles_before))
    new_guid = next(
        guid for guid in driver.window_handles if guid not in handles_before
    )
    driver.switch_to.window(new_guid)

    try:
        author_group = driver.find_element_by_id('author-group')
        for author in author_group.find_elements_by_css_selector("a.author"):
            try:
                given_name = author.find_element_by_css_selector(".given-name").text
                surname = author.find_element_by_css_selector(".surname").text
            except NoSuchElementException:
                print("Could not extract first or last name")
                continue

            try:
                # Clicking the envelope icon is what makes the address
                # element available to look up.
                mail_icon = author.find_element_by_css_selector(".icon-envelope")
                mail_icon.click()
                mail = driver.find_element_by_class_name('e-address')
                print(mail.text)
            except NoSuchElementException:
                mail_icon_present = False
                print(f"Author {given_name} {surname}. Mail icon present: {mail_icon_present}")
    finally:
        # Always close the child tab and hand focus back to the results page
        # so the next call starts from a known state.
        driver.close()
        driver.switch_to.window(par_guid)

# Process every collected result anchor in turn. get_data() takes no
# arguments: it reads the module-level name `link`, which this loop rebinds
# on each iteration.
for link in links:
    get_data()

Upvotes: 0

Views: 518

Answers (1)

Abhishek Rai
Abhishek Rai

Reputation: 2227

The problematic part was this line: actions.key_down(Keys.CONTROL).click(link).key_up(Keys.CONTROL).perform(). I think this is not the right way to open a new tab when you are dealing with multiple links. I changed it to driver.execute_script('window.open(arguments[0]);', link). I also stopped saving the links to a list. This works because there are only two windows open at a time: the home window and the one for the link we clicked on. So, in its entirety, the code becomes:

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


from urllib.parse import quote

# Start Chrome via the chromedriver binary at a fixed local path.
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
# Not used by the loop below; retained so the script still defines `actions`.
actions = ActionChains(driver)
search_term = input("Enter your search term :")
# Percent-encode the user's text before placing it in the query string;
# otherwise characters such as spaces, '&', '#' or '+' would corrupt or
# truncate the `qs` parameter.
encoded_term = quote(search_term, safe='')
url = f'https://www.sciencedirect.com/search?qs={encoded_term}&years=2021%2C2020%2C2019&lastSelectedFacet=years'
driver.get(url)
driver.maximize_window()

# Wait up to 10 s for the button at this absolute XPath to become clickable,
# then click it.
# NOTE(review): presumably this dismisses an overlay/consent dialog; the
# absolute XPath is brittle and should be confirmed against the live page.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'/html/body/div[3]/div/div/div/button/span'))).click()

divs = driver.find_elements_by_class_name('result-item-content')
# Focus returns to this window after every article, so its handle is constant.
parent_window = driver.current_window_handle
for div in divs:
    link = div.find_element_by_tag_name('a')
    # Snapshot the handles before opening so the new tab can be identified
    # as the one handle that did not exist yet.
    handles_before = driver.window_handles
    # Pass the URL string explicitly instead of relying on the anchor
    # element being stringified to its href inside window.open().
    driver.execute_script('window.open(arguments[0]);', link.get_attribute('href'))
    # The tab opens asynchronously; wait until its handle is registered.
    WebDriverWait(driver, 10).until(EC.new_window_is_opened(handles_before))
    child_window = next(
        window for window in driver.window_handles if window not in handles_before
    )
    driver.switch_to.window(child_window)
    try:
        title = driver.find_element_by_tag_name('h1')
        print("Article Title:- ",title.text)
        author_group = driver.find_element_by_id('author-group')
        for author in author_group.find_elements_by_css_selector("a.author"):
            try:
                given_name = author.find_element_by_css_selector(".given-name").text
                surname = author.find_element_by_css_selector(".surname").text
            except NoSuchElementException:
                print("Could not extract first or last name")
                continue

            try:
                # Clicking the envelope icon is what makes the address
                # element available to look up.
                mail_icon = author.find_element_by_css_selector(".icon-envelope")
                mail_icon.click()

                mail = driver.find_element_by_class_name('e-address')
                print(mail.text)
            except NoSuchElementException:
                # No envelope icon (or no address element): print the name only.
                print(f"Author {given_name} {surname}")
    finally:
        # Always close the article tab and return to the results page, even
        # if scraping raised, so the next iteration starts from a clean state.
        driver.close()
        driver.switch_to.window(parent_window)

Upvotes: 1

Related Questions