Reputation: 13
I'm trying to write to the log from the `__init__` method of my spider, but I can't get it to work, even though logging works fine from the `parse` method.
The call to self.log within the init method is made by the method 'get_urls_from_file'. I know the method is being called because I see the print statement in the stdout, so I was wondering if anyone could point me in the right direction. I am using scrapy v0.18. Thanks!
My code is below:
from scrapy.spider import BaseSpider
from scrapy_redis import connection
from importlib import import_module
from scrapy import log
from scrapy.settings import CrawlerSettings
class StressS(BaseSpider):
    """Spider that requests every URL listed in a file and counts responses.

    The URL file comes from the ``url_file`` spider argument or, when that
    is not given, from the ``URL_FILE`` setting of ``stress_test.settings``.
    Each parsed response increments the counter stored under the
    ``ITEM_COUNT`` setting's key on the scrapy_redis connection.
    """

    name = 'stress_s_spider'
    allowed_domains = ['www.example.com']

    def __init__(self, url_file=None, *args, **kwargs):
        """Load settings, read the start URLs and open the server connection.

        :param url_file: optional path of the file holding one URL per line;
            falls back to the ``URL_FILE`` setting when empty or missing.
        """
        super(StressS, self).__init__(*args, **kwargs)
        settings = CrawlerSettings(import_module('stress_test.settings'))

        # An explicit argument wins over the configured default.
        self.url_file = url_file or settings.get('URL_FILE')
        self.start_urls = self.get_urls_from_file(self.url_file)

        self.server = connection.from_settings(settings)
        self.count_key = settings.get('ITEM_COUNT')

    def parse(self, response):
        """Log the processed URL with its status and bump the counter."""
        summary = 'Processed: %s, status code: %s' % (response.url, response.status)
        self.log(summary, level=log.INFO)
        self.server.incr(self.count_key)

    def get_urls_from_file(self, fn):
        """Return the stripped lines of file ``fn`` as a list of URLs.

        An empty list is returned when ``fn`` is falsy or the file cannot
        be read; in the latter case the failure is printed and logged.
        """
        if not fn:
            return []

        try:
            with open(fn, 'r') as handle:
                return [raw.strip() for raw in handle]
        except IOError:
            msg = 'File %s could not be opened' % fn
            print(msg)
            # NOTE(review): when reached from __init__, this self.log call
            # reportedly produces no log output although the print above
            # does appear -- confirm whether logging has started by then.
            self.log(msg, level=log.ERROR)
            return []
Upvotes: 1
Views: 797
Reputation: 6710
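You can override the `start_requests` method, so that the URL file is read (and any error logged) when the crawl starts rather than inside `__init__`: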
# Class-level default for the argument in case it's missing.
url_file = None

def start_requests(self):
    """Yield one unfiltered request per URL read from the URL file.

    The file name is ``self.url_file`` when set, otherwise the
    ``URL_FILE`` entry of the crawler settings.
    """
    settings = self.crawler.settings
    source = self.url_file or settings['URL_FILE']
    # set up server and count_key ...
    # finally yield the requests
    for url in self.get_urls_from_file(source):
        yield Request(url, dont_filter=True)
Alternatively, you could override the `set_crawler` method and set up the attributes there:
def set_crawler(self, crawler):
    """Attach ``crawler`` via the parent class, then read its settings.

    The settings object is fetched so the spider's attributes can be
    configured from it at this point.
    """
    super(MySpider, self).set_crawler(crawler)
    settings = crawler.settings
    # set up start_urls ...
Upvotes: 2