Bad Coder
Bad Coder

Reputation: 195

How to use Selenium with Scrapy?

I am new to web scraping. I am learning web scraping in Python. I am following the documentation from this link.

I was trying to integrate Selenium into my practice project but was not successful. I have included my code and the resulting error below.

import scrapy
from tutorial.selenium.selenium import SeleniumPractice


class QuotesSpider(scrapy.Spider):
    """Spider for quotes.toscrape.com.

    Start requests are issued as SeleniumPractice requests (a
    scrapy.Request subclass defined in tutorial.selenium.selenium) so the
    pages can be rendered by a browser before parsing.
    """

    name = "quotes"

    def start_requests(self):
        """Yield the initial request for page 1 of the quotes site."""
        for url in ['https://quotes.toscrape.com/page/1/']:
            yield SeleniumPractice(url=url)

    def parse(self, response):
        """Extract one item per quote, then follow pagination.

        Yields dicts with keys 'text', 'author' and 'tags', then a new
        Request for the next page while a "next" link exists.
        """
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }

        # Follow the "next" link until the last page (no li.next anchor).
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from scrapy import Request


class SeleniumPractice(Request):
    """A scrapy.Request subclass whose URL is loaded with Selenium.

    NOTE(review): the browser is created at class-definition (import)
    time and shared by all instances; a Scrapy downloader middleware is
    the more conventional place for this — confirm with the Scrapy docs.
    """

    # One shared headless Chrome instance for all requests of this type.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)

    def __init__(self, url, **kwargs):
        # BUG FIX: scrapy.Request exposes `url` as a read-only property,
        # so `self.url = url` raises
        #   AttributeError: SeleniumPractice.url is not modifiable ...
        # Delegate to the base-class constructor, which performs the
        # validated initialisation; **kwargs lets callers pass the usual
        # Request options (callback, meta, headers, ...).
        super().__init__(url=url, **kwargs)

    def process_request(self, request):
        """Load the request URL in the shared headless browser."""
        self.driver.get(request.url)

And I got the following error

2022-03-07 17:48:18 [scrapy.core.engine] ERROR: Error while obtaining start requests
Traceback (most recent call last):
  File "/Users/st/opt/anaconda3/lib/python3.9/site-packages/scrapy/core/engine.py", line 150, in _next_request
    request = next(self.slot.start_requests)
  File "/Users/st/Documents/web_scrapping/tutorial/tutorial/spiders/quotes_spider.py", line 15, in start_requests
    yield SeleniumPractice(url=url)
  File "/Users/st/Documents/web_scrapping/tutorial/tutorial/selenium/selenium.py", line 8, in __init__
    self.url=url
  File "/Users/st/opt/anaconda3/lib/python3.9/site-packages/scrapy/http/common.py", line 5, in newsetter
    raise AttributeError(msg)
AttributeError: SeleniumPractice.url is not modifiable, use SeleniumPractice.replace() instead
2022-03-07 17:48:18 [scrapy.core.engine] INFO: Closing spider (finished)
2022-03-07 17:48:18 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'elapsed_time_seconds': 0.00216,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2022, 3, 8, 1, 48, 18, 362735),
 'log_count/DEBUG': 1,
 'log_count/ERROR': 1,
 'log_count/INFO': 10,
 'memusage/max': 65429504,
 'memusage/startup': 65429504,
 'start_time': datetime.datetime(2022, 3, 8, 1, 48, 18, 360575)}
2022-03-07 17:48:18 [scrapy.core.engine] INFO: Spider closed (finished)

Upvotes: 1

Views: 3753

Answers (1)

vic.
vic.

Reputation: 46

1. You can try downloading the ChromeDriver version that matches your browser from this link: https://chromedriver.chromium.org/downloads

2. Write your own Selenium downloader middleware.

3.code example

middleware.py

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
class SeleniumMiddleWare(object):
    """Downloader middleware that renders pages with headless Chrome
    and hands Scrapy the rendered HTML as an HtmlResponse."""

    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        # One browser instance for the whole crawl.
        self.driver = webdriver.Chrome(options=chrome_options)

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and close the browser when the spider closes."""
        from scrapy import signals  # local import keeps the snippet self-contained
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        """Fetch the URL in the browser and short-circuit the download."""
        self.driver.get(request.url)
        content = self.driver.page_source
        # BUG FIX: the original called self.driver.quit() here, which
        # destroys the browser after the FIRST request — every later
        # request then fails on a dead session. The driver is now closed
        # exactly once, in spider_closed().
        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)

    def process_response(self, request, response, spider):
        # Responses pass through unchanged.
        return response

    def spider_closed(self, spider=None):
        """Shut the shared browser down at the end of the crawl."""
        self.driver.quit()
        

settings.py

DOWNLOADER_MIDDLEWARES = {
    'bid_spiders.middlewares.SeleniumMiddleWare': 491
}

spider.py

import scrapy

class SeleniumSpider(scrapy.Spider):
    """Demo spider: parses quote pages (rendered upstream by the
    Selenium downloader middleware) and prints each scraped quote."""

    name = 'test_selenium'
    start_urls = ['https://quotes.toscrape.com/page/1/']

    def parse(self, response):
        # Print each quote's text, author and tag list to stdout.
        for block in response.css('div.quote'):
            item = {
                'text': block.css('span.text::text').get(),
                'author': block.css('small.author::text').get(),
                'tags': block.css('div.tags a.tag::text').getall(),
            }
            print(item)

Upvotes: 3

Related Questions