Reputation: 195
I am new to web scraping. I am learning web scraping in Python. I am following the documentation from this link.
I was trying to integrate Selenium into my practice but I was not successful. So, I have attached the code and error with this thread.
import scrapy
from tutorial.selenium.selenium import SeleniumPractice
class QuotesSpider(scrapy.Spider):
name = "quotes"
def start_requests(self):
# urls = [
# 'https://quotes.toscrape.com/page/1/',
# ]
# for url in urls:
for url in ['https://quotes.toscrape.com/page/1/']:
#yield scrapy.Request(url=url, callback=self.parse)
yield SeleniumPractice(url=url)
def parse(self, response):
# page = response.url.split("/")[-2]
# filename = f'quotes-{page}.html'
# with open(filename, 'wb') as f:
# f.write(response.body)
# self.log(f'Saved file {filename}')
for quote in response.css('div.quote'):
result = {
'text': quote.css('span.text::text').get(),
'author': quote.css('small.author::text').get(),
'tags': quote.css('div.tags a.tag::text').getall(),
}
yield result
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, callback=self.parse)
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from scrapy import Request
class SeleniumPractice(Request):
def __init__(self, url):
self.url=url
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)
def process_request(self, request):
self.driver.get(request.url)
And I got the following error
2022-03-07 17:48:18 [scrapy.core.engine] ERROR: Error while obtaining start requests
Traceback (most recent call last):
File "/Users/st/opt/anaconda3/lib/python3.9/site-packages/scrapy/core/engine.py", line 150, in _next_request
request = next(self.slot.start_requests)
File "/Users/st/Documents/web_scrapping/tutorial/tutorial/spiders/quotes_spider.py", line 15, in start_requests
yield SeleniumPractice(url=url)
File "/Users/st/Documents/web_scrapping/tutorial/tutorial/selenium/selenium.py", line 8, in __init__
self.url=url
File "/Users/st/opt/anaconda3/lib/python3.9/site-packages/scrapy/http/common.py", line 5, in newsetter
raise AttributeError(msg)
AttributeError: SeleniumPractice.url is not modifiable, use SeleniumPractice.replace() instead
2022-03-07 17:48:18 [scrapy.core.engine] INFO: Closing spider (finished)
2022-03-07 17:48:18 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'elapsed_time_seconds': 0.00216,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 3, 8, 1, 48, 18, 362735),
'log_count/DEBUG': 1,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'memusage/max': 65429504,
'memusage/startup': 65429504,
'start_time': datetime.datetime(2022, 3, 8, 1, 48, 18, 360575)}
2022-03-07 17:48:18 [scrapy.core.engine] INFO: Spider closed (finished)
Upvotes: 1
Views: 3753
Reputation: 46
1.You can try downloading the corresponding version driver for your browser from this link https://chromedriver.chromium.org/downloads
2.Writing your own selenium middleware¶
3.code example
middleware.py
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
class SeleniumMiddleWare(object):
def __init__(self):
chrome_options = Options()
chrome_options.add_argument('--headless')
self.driver = webdriver.Chrome(options=chrome_options)
def process_request(self, request, spider):
self.driver.get(request.url)
content = self.driver.page_source
self.driver.quit()
return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)
def process_response(self, request, response, spider):
return response
setting.py
DOWNLOADER_MIDDLEWARES = {
'bid_spiders.middlewares.SeleniumMiddleWare': 491
}
spider.py
import scrapy
class SeleniumSpider(scrapy.Spider):
name = 'test_selenium'
start_urls = ['https://quotes.toscrape.com/page/1/']
def parse(self, response):
for quote in response.css('div.quote'):
result = {
'text': quote.css('span.text::text').get(),
'author': quote.css('small.author::text').get(),
'tags': quote.css('div.tags a.tag::text').getall(),
}
print(result)
Upvotes: 3