rish_hyun

Reputation: 481

Scrapy signals not connecting to class methods

I've defined a Crawler class for crawling multiple spiders from a script.
Instead of using Scrapy's item pipelines, I defined a class, CrawlerPipeline, and used signals to connect its methods.
Some methods in CrawlerPipeline need access to class variables such as __ERRORS, and I can't figure out the correct way to set this up. Any suggestions or ideas would be very helpful.
For reference, I'm attaching the code snippets:

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from .pipeline import CrawlerPipeline


class Crawler:

    def __init__(self) -> None:
        self.process = CrawlerProcess(settings={
            'ROBOTSTXT_OBEY': False,
            'REDIRECT_ENABLED': True,
            'SPIDER_MODULES': ['engine.crawler.spiders'],
            'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
            'USER_AGENT': 'Mozilla/5.0 (Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
        })

    def spawn(self, spider: str, **kwargs) -> None:
        self.process.crawl(spider, **kwargs)
        self.__connect_signals(spider)

    def run(self) -> None:
        self.process.start()

    def __connect_signals(self, spider: str) -> None:
        pipe = CrawlerPipeline()

        for crawler in self.process.crawlers:
            _set_signal = crawler.signals.connect

            if spider == 'a':
                _set_signal(pipe.add_meta_urls, signal=signals.spider_opened)

            if spider == 'b':
                ...

            if spider == 'c':
                _set_signal(pipe.add_meta_urls, signal=signals.spider_opened)

            if spider == 'd':
                ...
            
            # These lines are not working, and neither are the two connections above
            _set_signal(pipe.process_item, signal=signals.item_scraped)
            _set_signal(pipe.spider_closed, signal=signals.spider_closed)
            _set_signal(pipe.spider_error, signal=signals.spider_error)
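
For completeness, this is roughly how I drive it from my entry-point script (the spider names here are just placeholders):

crawler = Crawler()
crawler.spawn('a')  # schedules spider 'a' on the shared CrawlerProcess
crawler.spawn('c')
crawler.run()       # blocks until every scheduled crawl has finished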
And here is pipeline.py:

import json
from pathlib import Path
from collections import defaultdict

from api.database import Mongo


class CrawlerPipeline:

    # Class-level accumulators, shared by every CrawlerPipeline instance
    __ITEMS = defaultdict(list)
    __ERRORS = []

    def process_item(self, item, spider):
        self.__ITEMS[spider.name].append(item)
        return item

    def add_meta_urls(self, spider):
        spider.start_urls = ['https://www.example.com']

    def spider_error(self, failure, response, spider):
        self.__ERRORS.append({
            'spider': spider.name,
            'url': response.url,
            'status': response.status,
            'error': failure.getErrorMessage(),
            'traceback': failure.getTraceback(),
        })

    def spider_closed(self, spider, reason):
        print(self.__ERRORS)
        Path("logs").mkdir(parents=True, exist_ok=True)
        ...
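
For context, the documented way to hook these signals is a from_crawler classmethod on the component itself, with the class enabled through the ITEM_PIPELINES setting. I'm trying to avoid that pattern because I build the crawlers from a script, but a minimal sketch of it (SignalPipeline is just an illustrative name) looks like this:

from scrapy import signals


class SignalPipeline:

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy instantiates the pipeline itself and keeps a reference to it,
        # so the bound methods connected here stay alive for the whole crawl
        pipe = cls()
        crawler.signals.connect(pipe.spider_closed, signal=signals.spider_closed)
        return pipe

    def spider_closed(self, spider, reason):
        print(f'{spider.name} closed: {reason}')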

Upvotes: 0

Views: 127

Answers (0)
