Reputation: 45
I'm completely new to Stack Overflow and the world of coding, so apologies in advance if this question is formatted poorly!
I've built spiders that crawl websites for key information then write the output to a database. I'd now like to write some key stats from each scrape to a separate table in the database.
I've tried using Scrapy's Stats Collector, but so far I haven't been able to get anything written to the database, and I'm struggling to find any examples of this.
Can anyone point me in the right direction? Below is an example trying to write just one column (the total entries the spider found during a scrape). The crawler runs successfully, but my stats collector isn't writing anything. Specifically I'd love to know:
1) Am I approaching this in the right way, particularly as I add more spiders and more complexity to the project?
2) How do I write each stat to the database only once per crawl (e.g. the total number of products found)? Currently I believe it's trying to write to the database every time parse_main_item is called, which would be dozens of times across dozens of sub-categories.
Thanks in advance for your help!
# One of my spiders
class Retailer1(scrapy.Spider):
    name = 'R1_monitor'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com']

    def __init__(self):
        self.declare_xpath()

    def declare_xpath(self):
        # site-specific xpaths

    def parse(self, response):
        # returns all the category links in the home page menu
        yield scrapy.Request(url=url, callback=self.parse_category)

    def parse_category(self, response):
        # returns all the sub-category links sitting beneath each category
        yield scrapy.Request(url, callback=self.parse_page_url)

    def parse_page_url(self, response):
        # uses a product count in the html to build a url that will display all products
        yield scrapy.Request(paginated_url, callback=self.parse_main_item, dont_filter=True)

    def parse_main_item(self, response):
        items_by_id = {}
        for product in zip(
            # zips things like product name, price etc. together
        ):
            item = PriceMonitorItem()
            item['product_id'] = product[0]
            item['product_name'] = product[1]
            item['product_url'] = product[2]
            item['product_image'] = product[3]
            item['retailer_site'] = host
            items_by_id[product[0]] = item

        # yield all of the items from the main parse function
        for items in items_by_id.values():
            yield items

        # Here I'd like to count the total number of items that are
        # successfully parsed and sent to the pipeline
        self.total_items = 0
        self.total_items += len(items_by_id)
        total_entries = self.crawler.stats.set_value("Total items scraped", self.total_items, spider=DdcMonitorSpider)
        yield total_entries
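To make point 2 above concrete: I think what I actually want is to bump a single stat once per item, rather than yielding the result of the stats call. A minimal sketch of that intent (the helper and stat key below are just illustrative, not my real code):

def parse_main_item(self, response):
    # Sketch only: increment one custom stat per yielded item, so the stats
    # collector holds a single running total for the whole crawl.
    for item in self.build_items(response):  # build_items is a hypothetical helper
        self.crawler.stats.inc_value('total_items_scraped')  # stat key is illustrative
        yield item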
In pipelines.py:
class PriceCrawlerStatsPipeline(object):
    def __init__(self):
        # db session is created

    def process_item(self, total_entries, spider):
        session = self.Session()
        self.scrape_stats = ScrapeStats()
        self.scrape_stats.total_entries = total_entries
        columns_to_dict = lambda obj: {c.name: getattr(obj, c.name) for c in obj.__table__.columns}
        try:
            insert_stmt = insert(ScrapeStats).values(
                **columns_to_dict(self.scrape_stats))
            on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
                {'date_scraped': date.today(), 'time_scraped': pytz.timezone('GMT')})
            session.execute(on_duplicate_key_stmt)
            session.commit()
            return total_entries
Finally, here's the traceback I'm getting in my logs:
Traceback (most recent call last):
File "C:\Users\sibla\Documents\CodingNomads\price_crawler\price_monitor\price_monitor\pipelines.py", line 112, in process_item
session.execute(on_duplicate_key_stmt)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\orm\session.py", line 1269, in execute
clause, params or {}
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\engine\base.py", line 982, in execute
return meth(self, multiparams, params)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 287, in _execute_on_connection
return connection._execute_clauseelement(self, multiparams, params)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1092, in _execute_clauseelement
else None,
File "<string>", line 1, in <lambda>
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 462, in compile
return self._compiler(dialect, bind=bind, **kw)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 468, in _compiler
return dialect.statement_compiler(dialect, self, **kw)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 571, in __init__
Compiled.__init__(self, dialect, statement, **kwargs)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 319, in __init__
self.string = self.process(self.statement, **compile_kwargs)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 350, in process
return obj._compiler_dispatch(self, **kwargs)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\visitors.py", line 92, in _compiler_dispatch
return meth(self, **kw)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 2405, in visit_insert
self, insert_stmt, crud.ISINSERT, **kw
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\crud.py", line 62, in _setup_crud_params
return _get_crud_params(compiler, stmt, **kw)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\crud.py", line 177, in _get_crud_params
% (", ".join("%s" % c for c in check))
sqlalchemy.exc.CompileError: Unconsumed column names: id, total_entries
Upvotes: 2
Views: 1309
Reputation: 730
The reason your code is being called each time is that it lives in the pipelines.py file and uses process_item, which runs every time an item is.. well.. processed.
My solution was to create a new class in the middlewares.py file and connect it to the spider_closed signal. That handler runs once, when the spider is closed, and has access to all of the stats the spider collected.
My code is below and is part of the middlewares.py file. It should give you a guide on how you could implement yours.
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
from scrapy import signals
import datetime
from sqlalchemy.orm import sessionmaker
from <botname>.models import Jobs, db_connect

class SaveJobInDatabase(object):

    def __init__(self, crawler):
        # Assumption: db_connect() returns a SQLAlchemy engine, so a session
        # factory can be built here for use in spider_closed.
        self.Session = sessionmaker(bind=db_connect())

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self, spider, reason):
        """
        Save the job in the database.
        This method is called whenever the spider is finished (closed).
        """
        db_arg = getattr(spider, 'addtodatabase', 'true')
        stats = spider.crawler.stats.get_stats()

        job = {}
        job['job_id'] = self.get_jobid()  # get_jobid() is a helper defined elsewhere (not shown)
        job['start_timestamp'] = stats.get('start_time').strftime('%Y-%m-%d %H:%M:%S')
        job['end_timestamp'] = stats.get('finish_time').strftime('%Y-%m-%d %H:%M:%S')
        job['spider_name'] = spider.name
        job['items_scraped'] = stats.get('item_scraped_count')
        job['items_dropped'] = stats.get('item_dropped_count')
        job['finish_reason'] = stats.get('finish_reason')

        session = self.Session()
        job_add = Jobs(**job)
        session.add(job_add)
        session.commit()
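For reference, the Jobs model used above is just a SQLAlchemy table with one column per key in the job dict. A minimal sketch along these lines would work (the column names match the code above; the types and table name are illustrative):

# Sketch of a Jobs model matching the keys used above (types/table name are examples)
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Jobs(Base):
    __tablename__ = 'jobs'

    id = Column(Integer, primary_key=True)
    job_id = Column(String(64))
    start_timestamp = Column(String(32))   # stored as the formatted string from strftime above
    end_timestamp = Column(String(32))
    spider_name = Column(String(128))
    items_scraped = Column(Integer)
    items_dropped = Column(Integer)
    finish_reason = Column(String(64))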
And make sure you update your settings.py to add the class:
SPIDER_MIDDLEWARES = {
    'botname.middlewares.SaveJobInDatabase': 300,
}
Upvotes: 2