John M.

Reputation: 2712

Unable to crawl when start_requests is overridden

I'm trying to attach the start URL as metadata to each request by overriding start_requests, but the spider refuses to crawl any pages besides the start URLs. Does anyone know how to keep metadata on the requests while still crawling beyond the start URLs?

Thank you

class TSpider(CrawlSpider):
    name = 't'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['https://books.toscrape.com']

    rules = (
        Rule(LinkExtractor(allow=[r'.*page.*']), callback='parse_item', follow=True),
    )

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse_item, meta={'start_url': url})


    def parse_item(self, response):
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        item['start_url'] = response.meta['start_url']
        yield item

Upvotes: 0

Views: 38

Answers (1)

SuperUser

Reputation: 4822

Your problem is the callback in the start_requests method; remove it. CrawlSpider only applies its rules to responses handled by its built-in parse callback, so when you set your own callback on the start requests the link extractor never runs and nothing beyond the start URLs is crawled.
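
For reference, removing the callback on its own would look like the sketch below. Note that with only this change start_url ends up in the meta of the start requests themselves, not in the requests extracted by the rule, which is where the two methods further down come in:

def start_requests(self):
    for url in self.start_urls:
        # No callback here: CrawlSpider's built-in parse handles the response,
        # so the rules run and links get followed.
        yield Request(url, meta={'start_url': url})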

If you want to add the start url to every request you can do one of the following:

Method 1: pass a process_request callable to the Rule (better than method 2).

from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


def process_request(request, response):
    # Copy start_url from the request that produced this response onto the
    # newly extracted request, so it keeps propagating through the crawl.
    request.meta['start_url'] = response.request.meta.get('start_url')
    return request


class TSpider(CrawlSpider):
    name = 't'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['https://books.toscrape.com']

    rules = (
        Rule(LinkExtractor(allow=[r'.*page.*']), callback='parse_item', follow=True, process_request=process_request),
    )

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, meta={'start_url': url})

    def parse_item(self, response):
        item = dict()
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        item['start_url'] = response.request.meta.get('start_url')
        yield item
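
If you want to try this outside of a Scrapy project, a small runner script should work. This is a sketch assuming Scrapy 2.x, with the spider defined in the same file; items.json is just an example feed name:

if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    # Write all scraped items to a JSON feed so the start_url field is easy to check.
    process = CrawlerProcess(settings={'FEEDS': {'items.json': {'format': 'json'}}})
    process.crawl(TSpider)
    process.start()  # blocks until the crawl finishes

Every item written to items.json should then carry start_url next to title and url.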

Method 2: override the _requests_to_follow method. This copies CrawlSpider internals that are private and may change between Scrapy versions, which is why method 1 is preferable.

from scrapy import Request
from scrapy.http import HtmlResponse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class TSpider(CrawlSpider):
    name = 't'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['https://books.toscrape.com']

    rules = (
        Rule(LinkExtractor(allow=[r'.*page.*']), callback='parse_item', follow=True),
    )

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, meta={'start_url': url})

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for rule_index, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            for link in rule.process_links(links):
                seen.add(link)
                request = self._build_request(rule_index, link)
                request.meta['start_url'] = response.meta.get('start_url')  # I added just this one line
                yield rule.process_request(request, response)

    def parse_item(self, response):
        item = dict()
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        item['start_url'] = response.meta.get('start_url')
        yield item

Upvotes: 1
