m_sasha

Reputation: 269

Scrapy spider middleware

I have a function (check_duplicates()) in the spider that checks whether a URL is already present in my database and, if it is not, passes the URL on to the parse_product method:

import mysql.connector
import scrapy
from scrapy import Request


def check_duplicates(url):
    # Return the matching row if the URL is already stored, otherwise None.
    connection = mysql.connector.connect(
        host='host_ip',
        port=3306,
        user='username',
        password='pass',
        database='base_name',
    )
    cursor = connection.cursor()
    # Parameterized query instead of an f-string to avoid SQL injection.
    sqlq = "SELECT url FROM my_table WHERE url = %s"
    cursor.execute(sqlq, (url,))
    result = cursor.fetchone()
    cursor.close()
    connection.close()
    return result


class CianSpider(scrapy.Spider):
    name = 'spider_name'

    def start_requests(self):
        url = 'https://some_site.ru'
        yield Request(
            url=url,
            method='GET',
            callback=self.parse)

    def parse(self, response, **cb_kwargs):
        for item in response.css('a[href*=".item"]::attr("href")').extract():
            url = response.urljoin(item)
            if check_duplicates(url) is None:
                yield scrapy.Request(
                    url=url,
                    cookies=self.cookies,
                    callback=self.parse_product,
                )

    def parse_product(self, response, **cb_kwargs):
        pass

How do I implement this mechanism using Scrapy spider middleware (how and where should I register the url verification function)?

Upvotes: 3

Views: 381

Answers (1)

Alexander

Reputation: 17335

You can use a custom downloader middleware that analyzes requests as they come in and checks each request's URL.

In your middlewares.py file:

from scrapy.exceptions import IgnoreRequest
import mysql.connector


class YourProjectNameDownloaderMiddleware:

    def process_request(self, request, spider):
        url = request.url
        connection = mysql.connector.connect(
            host='host_ip',
            port=3306,
            user='username',
            password='pass',
            database='base_name',
        )
        cursor = connection.cursor()
        # Parameterized query instead of an f-string to avoid SQL injection.
        sqlq = "SELECT url FROM my_table WHERE url = %s"
        cursor.execute(sqlq, (url,))
        result = cursor.fetchone()
        cursor.close()
        connection.close()
        if result is None:
            # URL not seen before: return None so the request keeps moving
            # through the middleware chain to the downloader.
            return None
        # URL already in the database: drop the request.
        raise IgnoreRequest
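Returning None from process_request() lets the request continue through the remaining middlewares to the downloader, while raising IgnoreRequest drops it before it is ever downloaded.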

Then in your settings.py file:

DOWNLOADER_MIDDLEWARES = {
    'YourProjectName.middlewares.YourProjectNameDownloaderMiddleware': 100,
}

Replace YourProjectName everywhere with your actual project name.
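Since the question asks specifically about a spider middleware, the same check can also live in process_spider_output, which sees every item and request a spider callback yields. Below is a minimal sketch; the class name, the import path of check_duplicates, and the middleware order value are placeholders to adapt to your project:

from scrapy import Request

# Assumed import path -- point this at wherever check_duplicates() actually lives.
from YourProjectName.utils import check_duplicates


class UrlFilterSpiderMiddleware:

    def process_spider_output(self, response, result, spider):
        # result is the iterable of items and requests yielded by the spider callback.
        for obj in result:
            # Only filter requests; let scraped items pass through untouched.
            if isinstance(obj, Request) and check_duplicates(obj.url) is not None:
                spider.logger.debug('Skipping already stored URL: %s', obj.url)
                continue
            yield obj

and register it in settings.py:

SPIDER_MIDDLEWARES = {
    'YourProjectName.middlewares.UrlFilterSpiderMiddleware': 543,
}

Note that process_spider_output only sees requests yielded from callbacks, not the initial start_requests, which matches the original check done inside parse().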

Upvotes: 1
