LeMoussel

Reputation: 5767

With Scrapy, how do I check whether the links on a single page are allowed by the robots.txt file?

With Scrapy, I want to scrape a single page (via a script, not from the console) and check whether all the links on that page are allowed by the robots.txt file.

In the scrapy.robotstxt.RobotParser abstract base class, I found the method allowed(url, user_agent), but I don't see how to use it.

import scrapy
from scrapy.linkextractors import LinkExtractor

class TestSpider(scrapy.Spider):
    name = "TestSpider"

    def __init__(self):
        super(TestSpider, self).__init__()

    def start_requests(self):
        yield scrapy.Request(url='http://httpbin.org/', callback=self.parse)

    def parse(self, response):
        if 200 <= response.status < 300:
            # extract_links() has to be called on a LinkExtractor instance
            links = LinkExtractor().extract_links(response)
            for idx, link in enumerate(links):
                # How can I check that each link is allowed by the robots.txt file?
                # => allowed(link.url, '*')

                # self.crawler.engine.downloader.middleware.middlewares
                # self.crawler AttributeError: 'TestSpider' object has no attribute 'crawler'
                pass

To run the 'TestSpider' spider, set the following in settings.py:

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

Go to the project’s top level directory and run:

scrapy crawl TestSpider

Appreciate any help.

My solution:

import scrapy
from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
from scrapy.utils.httpobj import urlparse_cached
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class TestSpider(CrawlSpider):
    name = "TestSpider"
    # allowed_domains must be defined for allow_domains=self.allowed_domains below
    allowed_domains = ['httpbin.org']

    def __init__(self):
        super(TestSpider, self).__init__()
        self.le = LinkExtractor(unique=True, allow_domains=self.allowed_domains)
        self._rules = [
            Rule(self.le, callback=self.parse)
        ]

    def start_requests(self):
        self._robotstxt_middleware = None
        for middleware in self.crawler.engine.downloader.middleware.middlewares:
            if isinstance(middleware, RobotsTxtMiddleware):
                self._robotstxt_middleware = middleware
                break

        yield scrapy.Request(url='http://httpbin.org/', callback=self.parse_robotstxt)

    def parse_robotstxt(self, response):
        robotstxt_middleware = None
        for middleware in self.crawler.engine.downloader.middleware.middlewares:
            if isinstance(middleware, RobotsTxtMiddleware):
                robotstxt_middleware = middleware
                break

        url = urlparse_cached(response)
        netloc = url.netloc
        self._robotsTxtParser = None
        if robotstxt_middleware and netloc in robotstxt_middleware._parsers:
            self._robotsTxtParser = robotstxt_middleware._parsers[netloc]
       
        return self.parse(response)

    def parse(self, response):
        if 200 <= response.status < 300:
            links = self.le.extract_links(response)
            for idx, link in enumerate(links):
                # Check if link target is forbidden by robots.txt
                if self._robotsTxtParser:
                    if not self._robotsTxtParser.allowed(link.url, "*"):
                        print(link.url, 'Disallowed by robots.txt file')

Upvotes: 3

Views: 528

Answers (1)

stranac

Reputation: 28236

Parser implementations are listed a bit higher on the page than the link you posted.

Protego parser

Based on Protego:

  • implemented in Python
  • is compliant with Google’s Robots.txt Specification
  • supports wildcard matching
  • uses the length based rule

Scrapy uses this parser by default.

So, if you want the same results as Scrapy gives by default, use Protego.

The usage is as follows (robotstxt being the contents of a robots.txt file):

>>> from protego import Protego
>>> rp = Protego.parse(robotstxt)
>>> rp.can_fetch("http://example.com/profiles", "mybot")
False
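
For the use case in the question (checking every extracted link), a minimal standalone sketch could look like this; the httpbin.org URLs and the '*' user agent are placeholders taken from the question, not part of Protego:

# Sketch: fetch the site's robots.txt with the standard library and check a
# few link URLs against it with Protego. The target site is assumed to be
# http://httpbin.org/, as in the question.
from urllib.request import urlopen

from protego import Protego

robotstxt = urlopen("http://httpbin.org/robots.txt").read().decode("utf-8")
rp = Protego.parse(robotstxt)

links = ["http://httpbin.org/", "http://httpbin.org/deny"]
for url in links:
    if not rp.can_fetch(url, "*"):
        print(url, "disallowed by robots.txt")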

It is also possible to identify and reuse the robots middleware currently in use, but it's probably more trouble than it's worth for most use cases.

Edit:

If you really want to reuse the middleware, your spider has access to downloader middlewares through self.crawler.engine.downloader.middleware.middlewares.
From there, you need to identify the robots middleware (possibly by class name?) and the parser you need (from the middleware's _parsers attribute).
Finally, you'd use that parser's allowed() method to check your links.
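
A hedged sketch of that approach (the helper name find_robots_parser is made up here; _parsers and the middleware chain are Scrapy internals and may change between versions):

from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware

def find_robots_parser(spider, netloc):
    # Walk the downloader middleware chain, find RobotsTxtMiddleware and
    # return the parser it built for the given netloc, if it is ready.
    for middleware in spider.crawler.engine.downloader.middleware.middlewares:
        if isinstance(middleware, RobotsTxtMiddleware):
            parser = middleware._parsers.get(netloc)
            # _parsers may still hold a Deferred if robots.txt has not been
            # fetched yet, or None if fetching it failed.
            if parser is not None and hasattr(parser, 'allowed'):
                return parser
    return None

# Inside a spider callback, something like:
#     parser = find_robots_parser(self, 'httpbin.org')
#     if parser and not parser.allowed(link.url, '*'):
#         ...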

Upvotes: 4
