Reputation: 481
I've defined a Crawler
class for crawling multiple spiders from script.
For spiders, instead of using pipelines, I defined a class, CrawlerPipeline
and used signals for connecting methods.
In CrawlerPipeline
, some methods require to use class variables such as __ERRORS
.
I'm unable to implement the correct way for the same. Any suggestions or ideas will be very helpful.
For reference, I'm attaching the code snippet
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from .pipeline import CrawlerPipeline
class Crawler:
def __init__(self) -> None:
self.process = CrawlerProcess(settings={
'ROBOTSTXT_OBEY': False,
'REDIRECT_ENABLED': True,
'SPIDER_MODULES': ['engine.crawler.spiders'],
'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
'USER_AGENT': 'Mozilla/5.0 (Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
})
def spawn(self, spider: str, **kwargs) -> None:
self.process.crawl(spider, **kwargs)
self.__connect_signals(spider)
def run(self) -> None:
self.process.start()
def __connect_signals(self, spider: str) -> None:
pipe = CrawlerPipeline()
for crawler in self.process.crawlers:
_set_signal = crawler.signals.connect
if spider == 'a':
_set_signal(pipe.add_meta_urls, signal=signals.spider_opened)
if spider == 'b':
...
if spider == 'c':
_set_signal(pipe.add_meta_urls, signal=signals.spider_opened)
if spider == 'd':
...
# These lines are not working, above two also not working
_set_signal(pipe.process_item, signal=signals.item_scraped)
_set_signal(pipe.spider_closed, signal=signals.spider_closed)
_set_signal(pipe.spider_error, signal=signals.spider_error)
import json
from pathlib import Path
from collections import defaultdict
from api.database import Mongo
class CrawlerPipeline:
__ITEMS = defaultdict(list)
__ERRORS = list
def process_item(self, item, spider):
self.__ITEMS[spider.name].append(item)
return item
def add_meta_urls(self, spider):
spider.start_urls = ['https://www.example.com']
def spider_error(self, failure, response, spider):
self.__ERRORS.append({
'spider': spider.name,
'url': response.url,
'status': response.status,
'error': failure.getErrorMessage(),
'traceback': failure.getTraceback(),
})
def spider_closed(self, spider, reason):
print(self.__ERRORS)
Path("logs").mkdir(parents=True, exist_ok=True)
...
Upvotes: 0
Views: 127