Anton Gomes

Reputation: 23

Parsing request URLs in Scrapy

I am conducting a broad crawl where each URL that is scraped is piped to a database; if it already exists, its fields (title, last_scraped, etc.) are updated.

I only want to scrape clean URLs, so I've implemented a check in the spider's parse method, but some URLs are slipping through.

# At the top of the spider module
import scrapy
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse

# Inside the spider's parse method:

# Retrieve links from current page
links = LinkExtractor(unique=True).extract_links(response)

# Follow only links with no fragment, no query string, and an http(s) scheme
for link in links:
    parsed = urlparse(link.url)  # parse once rather than three times
    if not parsed.fragment and not parsed.query and parsed.scheme in {'http', 'https'}:
        yield scrapy.Request(link.url, callback=self.parse, errback=self.parse_errback)
    else:
        self.logger.error(f"Bad url: {link.url}")

Some examples of URLs that have still made it to the database are listed below:

Each of these URLs fails the check when tested individually.
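
For reference, each one was tested with a standalone check like the sketch below (the URL here is a made-up placeholder, since the real offenders are site-specific), and each fails as expected:

from urllib.parse import urlparse

# Hypothetical example URL standing in for one of the offending links
url = 'https://www.example.com/page?foo=bar#section'

parsed = urlparse(url)
clean = (
    not parsed.fragment
    and not parsed.query
    and parsed.scheme in {'http', 'https'}
)
print(clean)  # False, so this URL should have been filtered out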

If anyone can point out why this might be happening, it would be super appreciated.

Other relevant files:

pipeline.py is set up as below:

import datetime
import os

import mysql.connector
from scrapy.exceptions import DropItem

from .items import EdwebCrawlerItem  # assuming the item class lives in the project's items.py


class MySqlPipeline(object):

    def __init__(self):
        self.conn = mysql.connector.connect(
            database = os.environ.get('MYSQL_DATABASE'),
            password = os.environ.get('MYSQL_ROOT_PASSWORD'),
            user = 'root',
            host = 'db',
            port = 3306,
        )

        # Create cursor, used to execute SQL statements
        self.cur = self.conn.cursor(buffered=True)

    def process_item(self, item, spider):
        item = EdwebCrawlerItem(**item)
        # Skip the site homepage entirely
        if item['url'] == 'https://www.ed.ac.uk/':
            raise DropItem(f"Skipping homepage: {item['url']}")

        # Check if URL exists (parameterised to avoid SQL injection)
        self.cur.execute(
            f"SELECT id FROM {os.environ.get('MYSQL_TABLE')} WHERE url = %s",
            (item['url'],)
        )
        exists = self.cur.fetchone()

        # UPDATE or INSERT record as appropriate
        if exists:
            self.cur.execute(f"""UPDATE {os.environ.get('MYSQL_TABLE')} SET 
                title = %s,
                page_type = %s,
                last_modified = %s,
                sub_pages = %s,
                last_scraped = %s
                WHERE url = %s""", (
                item['title'],
                item['page_type'],
                item['last_modified'],
                item['sub_pages'],
                datetime.datetime.today(),
                item['url']
            ))
            spider.logger.error(f"Updated: {item['url']}")
        else:
            self.cur.execute(f""" INSERT INTO {os.environ.get('MYSQL_TABLE')} (url, title, page_type, last_modified, sub_pages, last_scraped) VALUES (%s, %s, %s, %s, %s, %s) """, (
                item['url'],
                item['title'],
                item['page_type'],
                item['last_modified'],
                item['sub_pages'],
                datetime.datetime.today()
            ))
            spider.logger.error(f"inserted: {item['url']}")

        # Commit sql and hand the item on to any later pipelines
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
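
(Side note: I realise the SELECT-then-UPDATE/INSERT pair could probably be collapsed into a single MySQL upsert. A rough sketch, assuming the table has a UNIQUE index on url, though I haven't switched to this yet:)

        # Single-statement upsert; assumes a UNIQUE index on the url column
        self.cur.execute(f"""INSERT INTO {os.environ.get('MYSQL_TABLE')}
            (url, title, page_type, last_modified, sub_pages, last_scraped)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                title = VALUES(title),
                page_type = VALUES(page_type),
                last_modified = VALUES(last_modified),
                sub_pages = VALUES(sub_pages),
                last_scraped = VALUES(last_scraped)""", (
            item['url'],
            item['title'],
            item['page_type'],
            item['last_modified'],
            item['sub_pages'],
            datetime.datetime.today()
        ))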

The only changes to the middleware are overriding the retry and robotstxt middlewares, simply to clean up the logs:

import logging

from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    TCPTimedOutError,
    TimeoutError,
)

logger = logging.getLogger(__name__)


class CustomRetryMiddleware(RetryMiddleware):
    # Narrow the default retry list to transient network errors only
    EXCEPTIONS_TO_RETRY = (
        ConnectError,
        ConnectionDone,
        ConnectionLost,
        TimeoutError,
        TCPTimedOutError,
        ConnectionRefusedError,
    )


class CustomRobotsTxtMiddleware(RobotsTxtMiddleware):
    # Only these exceptions are logged when robots.txt can't be fetched
    EXCEPTIONS_TO_LOG = (
        ConnectError,
        ConnectionDone,
        ConnectionLost,
        TimeoutError,
        TCPTimedOutError,
        ConnectionRefusedError,
    )

    def _logerror(self, failure, request, spider):
        if failure.type in self.EXCEPTIONS_TO_LOG:
            logger.warning(f"Error downloading robots.txt: {request} {failure.value}")
        return failure
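
Both are enabled in settings.py along these lines ('myproject' below is a placeholder for the actual project package):

# settings.py: disable the stock middlewares and register the custom ones
# at the default priorities ('myproject' is a placeholder package name)
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
    'myproject.middlewares.CustomRetryMiddleware': 550,
    'myproject.middlewares.CustomRobotsTxtMiddleware': 100,
}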

Upvotes: 0

Views: 33

Answers (0)
