Reputation: 1950
I am running a Scrapy spider from an external file, as per the example in the Scrapy docs. I want to grab the stats provided by the Core API and store them in a MySQL table after the crawl is finished.
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from test.spiders.myspider import *
from scrapy.utils.project import get_project_settings
from test.pipelines import MySQLStorePipeline
import datetime

spider = MySpider()

def run_spider(spider):
    settings = get_project_settings()
    crawler = Crawler(settings)
    # stop the reactor once the spider has closed
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()

    # placeholder insert with dummy values; I want the real crawl stats here
    mysql_insert = MySQLStorePipeline()
    mysql_insert.cursor.execute(
        'insert into crawler_stats(sites_id, start_time, end_time, '
        'page_scraped, finish_reason) values (%s, %s, %s, %s, %s)',
        (1, datetime.datetime.now(), datetime.datetime.now(), 100, 'test'))
    mysql_insert.conn.commit()

run_spider(spider)
How can I get the values of stats like start_time, end_time, pages_scraped, and finish_reason in the above code?
Upvotes: 2
Views: 4463
Reputation: 473793
Get them from the crawler.stats collector:
stats = crawler.stats.get_stats()
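For reference, with the default stats collector the returned dictionary typically includes entries like these (a sketch; the exact keys depend on your Scrapy version and enabled extensions):

# A sketch of common entries in the stats dictionary; finish_time and
# finish_reason are only populated once the spider has closed.
print(stats.get('start_time'))               # datetime the crawl started
print(stats.get('finish_time'))              # datetime the crawl finished
print(stats.get('finish_reason'))            # e.g. 'finished'
print(stats.get('item_scraped_count'))       # number of items scraped
print(stats.get('response_received_count'))  # number of responses received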
Example code (collecting stats in the spider_closed signal handler):
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()  # stats is a dictionary
    # write stats to the database here
    reactor.stop()

def run_spider(spider):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()

run_spider(spider)
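Applied to your code, the database write belongs inside that callback. Here is a minimal sketch, assuming the MySQLStorePipeline and the crawler_stats schema from your question (they are not part of Scrapy):

# Sketch of the callback writing core stats to your table;
# MySQLStorePipeline, its cursor/conn attributes and the crawler_stats
# schema are assumed from the question, not part of Scrapy itself.
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()
    mysql_insert = MySQLStorePipeline()
    mysql_insert.cursor.execute(
        'insert into crawler_stats(sites_id, start_time, end_time, '
        'page_scraped, finish_reason) values (%s, %s, %s, %s, %s)',
        (1,
         stats.get('start_time'),
         stats.get('finish_time'),         # Scrapy calls the end time finish_time
         stats.get('item_scraped_count'),  # items scraped during the crawl
         reason))                          # same value as stats['finish_reason']
    mysql_insert.conn.commit()
    reactor.stop()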
Upvotes: 7