I am running a broad crawl in which every scraped URL is piped to a database; if the URL already exists, its fields (title, last_scraped, etc.) are updated instead.
I only want to scrape clean URLs, so I've implemented a check in the spider's parse
method, but some URLs are slipping through.
import scrapy
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse

# Retrieve links from the current page
links = LinkExtractor(unique=True).extract_links(response)

# Follow links, skipping anything with a fragment, a query string, or a non-HTTP(S) scheme
for link in links:
    parsed = urlparse(link.url)
    if not parsed.fragment and not parsed.query and parsed.scheme in {'http', 'https'}:
        yield scrapy.Request(link.url, callback=self.parse, errback=self.parse_errback)
    else:
        self.logger.error(f"Bad url: {link.url}")
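For reference, this is roughly how the same check could be applied to everything the spider yields rather than just this one loop. It is only a sketch, not code I'm actually running; is_clean_url and UrlFilterSpiderMiddleware are names made up for illustration:

    from urllib.parse import urlparse

    from scrapy import Request


    def is_clean_url(url):
        """True only for http(s) URLs with no query string and no fragment."""
        parsed = urlparse(url)
        return (
            not parsed.fragment
            and not parsed.query
            and parsed.scheme in {'http', 'https'}
        )


    class UrlFilterSpiderMiddleware:
        """Drop any 'dirty' request yielded by the spider, whichever callback produced it."""

        def process_spider_output(self, response, result, spider):
            for entry in result:
                if isinstance(entry, Request) and not is_clean_url(entry.url):
                    spider.logger.error(f"Bad url: {entry.url}")
                    continue
                yield entry

If something like this were used, it would need to be registered under SPIDER_MIDDLEWARES in settings.py.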
Some examples of URLs which have still made it to the database are listed below:
Each of these URLs fails the check when tested individually.
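For clarity, the check can be reproduced outside the spider like this (the URLs below are placeholders for illustration, not the actual offending ones):

    from urllib.parse import urlparse


    def check(url):
        """Same condition as in parse(): no fragment, no query string, http(s) scheme."""
        parsed = urlparse(url)
        return not parsed.fragment and not parsed.query and parsed.scheme in {'http', 'https'}


    # Placeholder URLs for illustration only
    print(check('https://www.ed.ac.uk/some/page'))          # True  -> would be followed
    print(check('https://www.ed.ac.uk/some/page?foo=bar'))  # False -> would be logged as bad
    print(check('https://www.ed.ac.uk/some/page#section'))  # False -> would be logged as bad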
If anyone can point out why this might be happening, it would be super appreciated.
Other relevant files:
pipeline.py is set up as below:
import datetime
import os

import mysql.connector

from .items import EdwebCrawlerItem  # wherever the project's item class is defined


class MySqlPipeline(object):
    def __init__(self):
        self.conn = mysql.connector.connect(
            database=os.environ.get('MYSQL_DATABASE'),
            password=os.environ.get('MYSQL_ROOT_PASSWORD'),
            user='root',
            host='db',
            port=3306,
        )
        # Create cursor, used to execute SQL statements
        self.cur = self.conn.cursor(buffered=True)

    def process_item(self, item, spider):
        item = EdwebCrawlerItem(**item)
        if item['url'] == 'https://www.ed.ac.uk/':
            return item  # skip storing the homepage

        # Check if URL exists (parameterised to avoid quoting issues)
        self.cur.execute(
            f"SELECT id FROM {os.environ.get('MYSQL_TABLE')} WHERE url = %s",
            (item['url'],)
        )
        exists = self.cur.fetchone()

        # UPDATE or INSERT record as appropriate
        if exists:
            self.cur.execute(f"""UPDATE {os.environ.get('MYSQL_TABLE')} SET
                title = %s,
                page_type = %s,
                last_modified = %s,
                sub_pages = %s,
                last_scraped = %s
                WHERE url = %s""", (
                item['title'],
                item['page_type'],
                item['last_modified'],
                item['sub_pages'],
                datetime.datetime.today(),
                item['url']
            ))
            spider.logger.error(f"Updated: {item['url']}")
        else:
            self.cur.execute(f"""INSERT INTO {os.environ.get('MYSQL_TABLE')}
                (url, title, page_type, last_modified, sub_pages, last_scraped)
                VALUES (%s, %s, %s, %s, %s, %s)""", (
                item['url'],
                item['title'],
                item['page_type'],
                item['last_modified'],
                item['sub_pages'],
                datetime.datetime.today()
            ))
            spider.logger.error(f"inserted: {item['url']}")

        # Commit sql
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
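As a possible safety net (not something I have in place, and the class name is made up for illustration), the same condition could also be enforced in a small pipeline that sits before MySqlPipeline in ITEM_PIPELINES and drops dirty items before they ever reach the database:

    from urllib.parse import urlparse

    from scrapy.exceptions import DropItem


    class CleanUrlPipeline:
        """Illustrative guard: drop any item whose url has a query string,
        a fragment, or a non-http(s) scheme before it reaches MySqlPipeline."""

        def process_item(self, item, spider):
            parsed = urlparse(item['url'])
            if parsed.fragment or parsed.query or parsed.scheme not in {'http', 'https'}:
                raise DropItem(f"Dirty url reached the pipeline: {item['url']}")
            return item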
The only changes to the middleware are overriding the retry and robotstxt middlewares, simply to clean up the logs:
import logging

from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    TCPTimedOutError,
    TimeoutError,
)

logger = logging.getLogger(__name__)


class CustomRetryMiddleware(RetryMiddleware):
    EXCEPTIONS_TO_RETRY = (
        ConnectError,
        ConnectionDone,
        ConnectionLost,
        TimeoutError,
        TCPTimedOutError,
        ConnectionRefusedError,
    )


class CustomRobotsTxtMiddleware(RobotsTxtMiddleware):
    EXCEPTIONS_TO_LOG = (
        ConnectError,
        ConnectionDone,
        ConnectionLost,
        TimeoutError,
        TCPTimedOutError,
        ConnectionRefusedError,
    )

    def _logerror(self, failure, request, spider):
        if failure.type in self.EXCEPTIONS_TO_LOG:
            logger.warning(f"Error downloading robots.txt: {request} {failure.value}")
        return failure
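For completeness, overrides like these are wired up in settings.py by disabling the built-ins and registering the subclasses in their place, roughly as below; 'myproject.middlewares' stands in for the real module path:

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        # Disable the built-in versions...
        'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
        # ...and register the subclasses at the built-ins' default priorities
        'myproject.middlewares.CustomRobotsTxtMiddleware': 100,
        'myproject.middlewares.CustomRetryMiddleware': 550,
    }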