Reputation: 1
I am attempting to extract Titles, Authors, and PDF Links from the oral presentations tab, spanning pages 1 to 6, on the website https://openreview.net/group?id=ICML.cc/2024/Conference#tab-accept-oral.
While I've successfully managed to extract information from the first page, I'm encountering issues automating the process up to page 6. I understand that the page contents are dynamically displayed through JavaScript, but the code provided by GPT consistently results in StaleElementReferenceException errors.
Here is my code (which was wrong)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)
driver.get("https://openreview.net/group?id=ICML.cc/2024/Conference#tab-accept-oral")
try:
for i in range(1, 7): # repeat from 1 page to 6 page
WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div#accept-oral div.note"))
)
notes = driver.find_elements(By.CSS_SELECTOR, "div#accept-oral div.note")
for note in notes:
title = note.find_element(By.CSS_SELECTOR, "h4 a").text
authors = note.find_element(By.CSS_SELECTOR, "div.note-authors").text
pdf_link = note.find_element(By.CSS_SELECTOR, "a.pdf-link").get_attribute('href')
print("Title:", title)
print("Authors:", authors)
print("PDF Link:", pdf_link)
# click next page
if i < 6:
next_page_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.CSS_SELECTOR, f"ul.pagination li:nth-child({i+2}) a"))
)
driver.execute_script("arguments[0].click();", next_page_button)
finally:
driver.quit()
Upvotes: 0
Views: 60