Reputation: 2712
I'm trying to attach the start URL as metadata to each request by overriding start_requests, but the spider then refuses to crawl any pages besides the start URLs. Does anyone know how to keep metadata on the requests and still crawl beyond the start URLs?
Thank you
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class TSpider(CrawlSpider):
    name = 't'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['https://books.toscrape.com']

    rules = (
        Rule(LinkExtractor(allow=[r'.*page.*']), callback='parse_item', follow=True),
    )

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse_item, meta={'start_url': url})

    def parse_item(self, response):
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        item['start_url'] = response.meta['start_url']
        yield item
Upvotes: 0
Views: 38
Reputation: 4822
Your problem is the callback in the start_requests method; remove it. When the initial requests carry an explicit callback, their responses go straight to parse_item instead of through CrawlSpider's internal callback that applies the rules, so no links are ever extracted or followed.
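In other words, the minimal change is to drop the callback from your start_requests (everything else stays the same):

def start_requests(self):
    for url in self.start_urls:
        # No explicit callback: the response then goes through CrawlSpider's
        # default callback, which applies the rules and follows the extracted links.
        yield Request(url, meta={'start_url': url})

That alone restores the crawling, but meta['start_url'] will then only be present on the start URL responses, because the requests built from the rules do not inherit it.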
If you want to add the start URL to every request, you can do one of the following:
Method 1: use process_request (better than method 2).
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


def process_request(request, response):
    # Copy the start_url from the request that produced this response
    # onto every request extracted by the rule.
    request.meta['start_url'] = response.request.meta.get('start_url')
    return request


class TSpider(CrawlSpider):
    name = 't'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['https://books.toscrape.com']

    rules = (
        Rule(LinkExtractor(allow=[r'.*page.*']), callback='parse_item', follow=True, process_request=process_request),
    )

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, meta={'start_url': url})

    def parse_item(self, response):
        item = dict()
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        item['start_url'] = response.request.meta.get('start_url')
        yield item
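A note on method 1: the process_request hook runs for every request extracted by the rule, and because it copies the value from response.request.meta, the start_url keeps propagating from page to page instead of stopping after the first hop. As far as I remember, the two-argument process_request(request, response) signature requires Scrapy 2.0+; in older versions the hook only received the request.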
Method 2: override the _requests_to_follow method.
from scrapy import Request
from scrapy.http import HtmlResponse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class TSpider(CrawlSpider):
    name = 't'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['https://books.toscrape.com']

    rules = (
        Rule(LinkExtractor(allow=[r'.*page.*']), callback='parse_item', follow=True),
    )

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, meta={'start_url': url})

    def _requests_to_follow(self, response):
        # Same body as CrawlSpider._requests_to_follow, except for the one
        # marked line that copies start_url onto every followed request.
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for rule_index, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            for link in rule.process_links(links):
                seen.add(link)
                request = self._build_request(rule_index, link)
                request.meta['start_url'] = response.meta.get('start_url')  # I added just this one line
                yield rule.process_request(request, response)

    def parse_item(self, response):
        item = dict()
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        item['start_url'] = response.meta.get('start_url')
        yield item
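Keep in mind that _requests_to_follow is a private CrawlSpider method, so its body may change between Scrapy versions; the copy above matches recent versions apart from the single marked line, which is another reason to prefer method 1. Assuming the spider lives in a regular Scrapy project, you can check the output of either version with something like:

scrapy crawl t -o items.json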
Upvotes: 1