Goran

Reputation: 6834

Scrapy and Selenium StaleElementReferenceException

There are several clickable elements on the page and I'm trying to scrape some pages behind them, but I get this error and the spider closes after the first click:

StaleElementReferenceException: Message: Element not found in the cache - perhaps the page has changed since it was looked up

For now I'm just trying to get the page opened so I can catch the new URL. Here is my code:

from scrapy import signals
from scrapy.http import TextResponse
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.xlib.pydispatch import dispatcher

from MySpider.items import MyItem

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

import time

class MySpider(Spider):
    name = "myspider"
    allowed_domains = ["http://example.com"]
    base_url = 'http://example.com'
    start_urls = ["http://example.com/Page.aspx",]

    def __init__(self):
        self.driver = webdriver.Firefox()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):

        self.driver.get(response.url)
        item = MyItem()

        links = self.driver.find_elements_by_xpath("//input[@class='GetData']")

        for button in links:
            button.click()
            time.sleep(5)

            source = self.driver.page_source 
            sel = Selector(text=source) # create a Selector object

            item['url'] = self.driver.current_url

            print '\n\nURL\n', item['url'], '\n'
            yield item

Upvotes: 2

Views: 199

Answers (1)

Karl Gong

Reputation: 32

The link elements belong to the first page. Once a click navigates to a new page, those element references go stale.
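A minimal illustration of the failure mode (assuming each click navigates away, as in your spider):

    links = driver.find_elements_by_xpath("//input[@class='GetData']")
    links[0].click()  # navigates to a new page
    links[1].click()  # raises StaleElementReferenceException: this reference points at the old DOM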

You can try these two solutions:

1. Store the link URLs up front, then use driver.get(url) to open each one.

def parse(self, response):

    self.driver.get(response.url)
    links = self.driver.find_elements_by_xpath("//input[@class='GetData']")
    # collect the URLs first, while the elements are still attached to the page
    link_urls = [link.get_attribute("href") for link in links]

    for link_url in link_urls:
        item = MyItem()  # fresh item for each link
        self.driver.get(link_url)
        time.sleep(5)

        source = self.driver.page_source
        sel = Selector(text=source) # create a Selector object

        item['url'] = self.driver.current_url

        print '\n\nURL\n', item['url'], '\n'
        yield item

2. After clicking a link and grabbing the URL, call driver.back() to return to the first page, then re-find the link elements.

def parse(self, response):

    self.driver.get(response.url)
    links = self.driver.find_elements_by_xpath("//input[@class='GetData']")

    for i in range(len(links)):
        item = MyItem()  # fresh item for each link
        links[i].click()
        time.sleep(5)

        source = self.driver.page_source
        sel = Selector(text=source) # create a Selector object

        item['url'] = self.driver.current_url

        print '\n\nURL\n', item['url'], '\n'
        yield item
        self.driver.back()  # return to the first page
        # the old references are stale now, so re-find the elements
        links = self.driver.find_elements_by_xpath("//input[@class='GetData']")
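In both versions, the fixed time.sleep(5) can be replaced with an explicit wait; your spider already imports WebDriverWait. A minimal sketch using Selenium's expected_conditions module (the 10-second timeout is an arbitrary choice):

    from selenium.webdriver.support import expected_conditions as EC

    button = links[i]
    button.click()
    # block until the clicked element detaches from the DOM, i.e. the new page has loaded
    WebDriverWait(self.driver, 10).until(EC.staleness_of(button))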

Upvotes: 2
