Writing scraping stats to a database using scrapy StatsCollector

Question

I'm completely new to Stackoverflow and the world of coding so apologies in advance if this question is formatted poorly!

I've built spiders that crawl websites for key information then write the output to a database. I'd now like to write some key stats from each scrape to a separate table in the database.

I've tried using scrapy Stats Collector but so far haven't been able to get anything written to the database, and I'm struggling to find any examples of this.

Can anyone help in pointing me in the right direction? Below is an example trying to write just one column (total entries that the spider found during a scrape). The crawler is successful, but my statscollector isn't writing anything. Specifically I'd love to know:

1) Am I approaching this in the right way, particularly as I add more spiders and more complexity to the project

2) How do I only write to the database one stat per crawl (e.g. total number of products found). Currently I believe it's trying to write to the database each time parse_main_item is called, which would be dozens of times for dozens of sub categories.

Thanks in advance for your help!

# One of my spiders
class Retailer1(scrapy.Spider):
    name = 'R1_monitor'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com']

    def __init__(self):
        self.declare_xpath()
    def declare_xpath(self):
        #site specific xpaths

    def parse(self, response):
# returns all the category links in the home page menu
            yield scrapy.Request(url=url, callback=self.parse_category, )

    def parse_category(self, response):
# returns all the sub category links sitting beneath each category
            yield scrapy.Request(url, callback=self.parse_page_url, )

    def parse_page_url(self, response):
# uses a product count in the html to return a url that will display all products
            yield scrapy.Request(paginated_url, callback=self.parse_main_item, dont_filter=True,)

    def parse_main_item(self, response):
        items_by_id = {}

        for product in zip(
                #zips things like product name, price etc together ):

            item = PriceMonitorItem()
            item['product_id'] = product[0]
            item['product_name'] = product[1]
            item['product_url'] = product[2]
            item['product_image'] = product[3]
            item['retailer_site'] = host
            items_by_id[product[0]] = item

            #yield all of the items from the main parse function
            for items in items_by_id.values():
                yield items

#Here I'd like to count the total number of items that are successfully parsed and sent to the pipeline
        self.total_items = 0
        self.total_items += len(items_by_id)

        total_entries = self.crawler.stats.set_value("Total items scraped", self.total_items, spider=DdcMonitorSpider)
        yield total_entries

In pipelines.py

class PriceCrawlerStatsPipeline(object):
    def __init__(self):
        #db session is created

    def process_item(self, total_entries, spider):
        session = self.Session()
        **self.scrape_stats = ScrapeStats()
        self.scrape_stats.total_entries = total_entries**
        columns_to_dict = lambda obj: {c.name: getattr(obj, c.name) for c in obj.__table__.columns}

        try:
            insert_stmt = insert(ScrapeStats).values(
                **columns_to_dict(self.scrape_stats))

            on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
                {'date_scraped': date.today(), 'time_scraped': pytz.timezone('GMT')} )

            session.execute(on_duplicate_key_stmt)
            session.commit()
        return total_entries

Finally, here's the return I'm getting in my logs

Traceback (most recent call last):
  File "C:\Users\sibla\Documents\CodingNomads\price_crawler\price_monitor\price_monitor\pipelines.py", line 112, in process_item
    session.execute(on_duplicate_key_stmt)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\orm\session.py", line 1269, in execute
    clause, params or {}
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\engine\base.py", line 982, in execute
    return meth(self, multiparams, params)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 287, in _execute_on_connection
    return connection._execute_clauseelement(self, multiparams, params)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1092, in _execute_clauseelement
    else None,
  File "", line 1, in 
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 462, in compile
    return self._compiler(dialect, bind=bind, **kw)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 468, in _compiler
    return dialect.statement_compiler(dialect, self, **kw)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 571, in __init__
    Compiled.__init__(self, dialect, statement, **kwargs)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 319, in __init__
    self.string = self.process(self.statement, **compile_kwargs)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 350, in process
    return obj._compiler_dispatch(self, **kwargs)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\visitors.py", line 92, in _compiler_dispatch
    return meth(self, **kw)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 2405, in visit_insert
    self, insert_stmt, crud.ISINSERT, **kw
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\crud.py", line 62, in _setup_crud_params
    return _get_crud_params(compiler, stmt, **kw)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\crud.py", line 177, in _get_crud_params
    % (", ".join("%s" % c for c in check))
**sqlalchemy.exc.CompileError: Unconsumed column names: id, total_entries**

Writing scraping stats to a database using scrapy StatsCollector

Answers (1)

Related Questions