I'm going through the Scrapy tutorials and have started using item loaders to gather data. The data I'm working with comes from a predefined dictionary (loaded from JSON) and from a product page that the spider follows.
The issue I'm having is that the dictionary sometimes lacks a key (such as 'salePrice'), which raises a KeyError during the crawl and stops execution altogether. I'm trying to see if there is a clean way to handle KeyErrors for this field in items.py, where input_processors and output_processors are specified for each field.
Would appreciate any suggestions or examples!
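For reference, items.py looks roughly like this (a sketch under the assumption that each field uses the standard MapCompose/TakeFirst processors; the real file isn't shown here):

import scrapy
from itemloaders.processors import MapCompose, TakeFirst
# On older Scrapy versions: from scrapy.loader.processors import MapCompose, TakeFirst


class Product(scrapy.Item):
    # Assumed processors: strip whitespace on input, keep the first value on output.
    list_price = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    sale_price = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    # ... sku_id, product_key, product_name, brand_name, product_id,
    # product_url, status, and item_id declared the same way.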
import json
import re
import time

import scrapy
from scrapy.loader import ItemLoader

from tutorial.items import Product


class SephoraSpider(scrapy.Spider):
    name = 'sephora-shelf'
    start_urls = [
        'https://www.sephora.com/shop/moisturizing-cream-oils-mists/?currentPage=1'
    ]
    next_page_number = 1
    base_url = 'https://www.sephora.com'

    def parse(self, response):
        json_xpath = '//script[@type="text/json" and @id="linkSPA"]/text()'
        product_container = json.loads(response.xpath(json_xpath).extract()[0])
        product_container = product_container['NthCategory']['props']['products']
        start_time = round(time.time())
        print("starting loop")
        for _product in product_container:
            loader = ItemLoader(item=Product(), response=response)
            # These lookups raise KeyError when a key (e.g. 'salePrice') is missing.
            loader.add_value('list_price', _product['currentSku']['listPrice'])
            loader.add_value('sale_price', _product['currentSku']['salePrice'])
            loader.add_value('sku_id', _product['currentSku']['skuId'])
            loader.add_value('product_key', _product['productId'])
            loader.add_value('product_name', _product['displayName'])
            loader.add_value('brand_name', _product['brandName'])
            loader.add_value('product_id', _product['productId'])
            _product_url = self.base_url + _product['targetUrl']
            loader.add_value('product_url', _product_url)
            # Note: add_value() ignores None, so this line is effectively a no-op.
            loader.add_value('status', None)
            print("finished loading product")
            # TODO: add a check to see if it was in the previous run's data
            # to determine the product status: added / deleted.
            # Only collect product data if the product is newly added.
            yield response.follow(_product_url, callback=self.parse_product,
                                  meta={'item': loader.load_item()})

        next_page_xpath = '//button[@type="button" and @aria-label="Next"]'
        next_page_button = response.xpath(next_page_xpath)
        print(f'next_page_button: {next_page_button}')
        if next_page_button:
            print("Inside next_page_button")
            SephoraSpider.next_page_number += 1
            next_page = re.sub(r'\?currentPage=[0-9]*',
                               '?currentPage=' +
                               str(SephoraSpider.next_page_number),
                               response.request.url)
            print(f"Next Page: {next_page}")
            yield response.follow(next_page, callback=self.parse)

    def parse_product(self, response):
        loader = ItemLoader(item=response.meta['item'],
                            response=response)
        loader.add_xpath('item_id', '//div[@data-at="sku_size"]')
        time.sleep(3)
        yield loader.load_item()
One simple workaround is to use the dictionary's .get() method and let it default to None when the key is missing. I'm still not convinced this is the proper way to handle this kind of error when working with Scrapy, though.
Before: loader.add_value('sale_price', _product['currentSku']['salePrice'])
After: loader.add_value('sale_price', _product.get('currentSku', {}).get('salePrice'))
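Two caveats about the line above. First, the first .get() needs to default to an empty dict: _product.get('currentSku').get('salePrice') would raise AttributeError (None has no .get) whenever 'currentSku' itself is missing. Second, ItemLoader.add_value() ignores None values, so a missing key leaves the field out of the loaded item entirely, and an input processor in items.py never sees a value that was never added. If the lookup logic starts repeating, a small helper (hypothetical, not part of Scrapy) keeps the defaulting in one place:

def nested_get(mapping, *keys, default=None):
    """Walk nested dicts, returning `default` as soon as any key is missing."""
    current = mapping
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current

Usage inside parse():

loader.add_value('sale_price', nested_get(_product, 'currentSku', 'salePrice'))
# Pass a non-None default if the field should still appear in the item:
loader.add_value('sale_price',
                 nested_get(_product, 'currentSku', 'salePrice', default='N/A'))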