Reputation: 125
I have below file and code
import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured
logger = logging.getLogger(__name__)
class SpiderOpenCloseLogging:
    """Scrapy extension that logs spider open/close events and a progress
    message every ``item_count`` scraped items."""

    def __init__(self, item_count):
        # Number of scraped items between progress log lines.
        self.item_count = item_count
        # Running counter, bumped by the item_scraped signal handler.
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        print('Hey I am called')
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        # if not crawler.settings.getbool('MYEXT_ENABLED'):
        #     raise NotConfigured
        # get the number of items from settings
        item_count = 1000  # crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)
        # BUG FIX: __init__ takes a single item_count argument, but the
        # original code called cls(crawler.settings, crawler.stats), which
        # raises TypeError — that is why the extension never loaded and no
        # signal handler was ever connected.
        ext = cls(item_count)
        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        # return the extension object
        return ext

    def spider_opened(self, spider):
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        # Count every scraped item; emit a log line at each milestone.
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)
and I have changed the settings:
MYEXT_ENABLED = True
EXTENSIONS = {
'project.custom_extension.SpiderOpenCloseLogging': 300
}
But no signal is being called, I have checked the path being given in settings, spiders are being called
even the print statement I have added is not being logged.
Can someone please suggest what I am missing
Thanks
Upvotes: 1
Views: 372
Reputation: 296
All the signals are called in my adaptation of your script. You made a few mistakes, which is why you were getting errors rather than signals:
A few mistakes:
1.
def __init__(self, item_count, stats):
    # Receives crawler.settings as `item_count` and crawler.stats as
    # `stats` (see from_crawler); both are resolved into plain integers
    # later, in spider_opened.
    self.item_count = item_count
    #self.items_scraped = 0 --- change this
    self.items_scraped = stats
def item_scraped(self, item, spider):
    # Signal handler invoked once per scraped item; only logs here.
    # self.items_scraped += 1 --- You could do this but then you would not need `crawler.stats`
    # if self.items_scraped % self.item_count == 0: --- these should be the other way around
    logger.info("scraped %d items", self.items_scraped)
#additional note:
#--- you did not instantiate self.item_count correctly; putting item_count
#in from_crawler does not work, because you are returning ext, so
#self.item_count takes crawler.settings rather than item_count. So
#you will get an error.
i. By updating, we have the following corrections:
def __init__(self, item_count, stats):  # if you want to include crawler.stats
    """Keep references to crawler.settings and crawler.stats; spider_opened
    later converts both into plain integers."""
    self.items_scraped = stats
    self.item_count = item_count
ii.
def spider_opened(self, spider):
    # At this point self.items_scraped still holds the crawler.stats object
    # and self.item_count still holds crawler.settings (set in __init__);
    # both are swapped for plain integers here.
    self.items_scraped = self.items_scraped.get_value('item_scraped_count') #use crawler.stats to get the current count
    if self.items_scraped is None:
        self.items_scraped = 0 #stat does not exist yet, so start from 0
    self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000) #get your item count from settings
    print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
    logger.info("opened spider %s", spider.name)
iii.
def item_scraped(self, item, spider):
    # Signal handler invoked once per scraped item.
    logger.info(f"scraped few {self.items_scraped} items")
    self.items_scraped += 1
    # NOTE(review): `item_count % items_scraped == 0` fires whenever
    # items_scraped is an exact divisor of item_count (1, 2, 4, 5, ...),
    # not once every item_count items; a per-milestone log would be
    # `self.items_scraped % self.item_count == 0` instead.
    if self.item_count % self.items_scraped == 0: # these have been flipped
        logger.info(f"scraped increments {self.items_scraped} items")
Example when putting this altogether:
import logging
from scrapy import signals
import scrapy
logger = logging.getLogger(__name__)
class SpiderOpenCloseLogging(scrapy.Spider):
    """Spider that wires its own handlers to the spider_opened,
    spider_closed and item_scraped signals and logs scraping progress."""

    name = 'log_signals'
    start_urls = [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]

    def __init__(self, item_count, stats):
        # Always initialize the scrapy.Spider base class.
        super().__init__()
        # Deliberately stores crawler.settings in item_count and
        # crawler.stats in items_scraped; spider_opened resolves both
        # into plain integers before item_scraped uses them.
        self.item_count = item_count
        self.items_scraped = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Build the spider and subscribe its signal handlers.
        ext = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    def spider_opened(self, spider):
        # Resolve the stored stats object into the current item count
        # (None until the first item has been scraped) and the settings
        # object into the configured milestone size.
        self.items_scraped = self.items_scraped.get_value('item_scraped_count')
        if self.items_scraped is None:
            self.items_scraped = 0
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        logger.info(f"scraped few {self.items_scraped} items")
        self.items_scraped += 1
        # BUG FIX: the previous `self.item_count % self.items_scraped == 0`
        # fired on every exact divisor of item_count (1, 2, 4, 5, ...);
        # milestone logging should trigger every item_count-th item.
        if self.items_scraped % self.item_count == 0:
            logger.info(f"scraped increments {self.items_scraped} items")

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        # Yield the href of every link inside the quotes page grid.
        content = response.xpath('//div[@class = "row"]//div')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get()
            }
Output:
.
.
.
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 194 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/author/C-S-Lewis'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 195 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/christianity/page/1/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 196 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/love/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 197 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 198 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 199 items
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped increments 200 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/tag/truth/page/1/'}
...
Upvotes: 1