Reputation: 466
I have this object I'm trying to populate with an itemLoader:
{
"domains": "string",
"date_insert": "2016-12-23T11:25:00.213Z",
"title": "string",
"url": "string",
"body": "string",
"date": "2016-12-23T11:25:00.213Z",
"authors": [
"string"
],
"categories": [
"string"
],
"tags": [
"string"
],
"stats": {
"views_count": 0,
"comments_count": 0
}
}
Here's my items.py
class StatsItem(scrapy.Item):
    """Counters that make up the nested ``stats`` object of an article."""

    views_count = scrapy.Field()
    comments_count = scrapy.Field()
class ArticleItem(scrapy.Item):
    """Scraped article record, one field per key of the target payload."""

    # NOTE(review): the target payload spells this key "domains" - confirm
    # which name the consumer expects before renaming anything.
    domain = scrapy.Field()
    date_insert = scrapy.Field()
    date_update = scrapy.Field()
    # Declared exactly once: a repeated class attribute would simply
    # overwrite the earlier declaration.
    date = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    # Join() merges the collected body fragments into a single string.
    body = scrapy.Field(output_processor=Join())
    # Identity() returns the collected values unchanged, so these stay lists.
    authors = scrapy.Field(output_processor=Identity())
    categories = scrapy.Field(output_processor=Identity())
    # No processors declared here, so the loader's defaults apply.
    tags = scrapy.Field()
    stats = scrapy.Field()
Part of my spider:
def parse(self, response):
    """Feed the article fields found in ``response`` into an item loader."""
    loader = ArticleItemLoader(response=response)

    # Scheme and host of the response URL, formatted as "scheme://host/".
    url_parts = urlparse(response.url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=url_parts)

    loader.add_css('authors', 'span.meta-author')
    loader.add_css('title', 'h1.title-article')
    loader.add_value('url', response.url)
    loader.add_xpath(
        'date_insert',
        "//div[@class='meta']/time[@itemprop='datePublished']/@datetime",
    )
    loader.add_xpath(
        'date_update',
        "//div[@class='meta']/time[@itemprop='dateModified']/@datetime",
    )
    loader.add_value('domain', domain)
    # Every breadcrumb entry except the one whose class contains "home".
    loader.add_xpath(
        'categories',
        "//ul[@class='breadcrumbs']//li[not(contains(@class, 'home'))]",
    )
So far I have successfully populated every field except "stats". I've checked the page "correct way to nest Item data in scrapy", but that approach no longer seems to work (I can't make it work; my error is TypeError: to_unicode must receive a bytes, str or unicode object, got StatsItem).
I'd like to use the ItemLoader, but I don't see how I could populate my "stats" field with my StatsItem.
Thanks for the help.
Edit: I am close, but it still doesn't work:
loader.add_value('stats', self.getStats(response))
def getStats(self, response):
    """Load the stats item for ``response`` and return it JSON-encoded."""
    stats_loader = StatsItemLoader(response=response)
    stats_loader.add_xpath(
        'comments_count',
        "//div[@class='btn-count']//a/text()",
    )
    stats_loader.add_value('views_count', '42')
    stats_item = stats_loader.load_item()
    # json.dumps() returns a str, so the caller receives serialized text
    # rather than a nested mapping.
    return json.dumps(dict(stats_item))
but my output looks like this: { [...] "stats": "{\"comments_count\": \"0\", \"views_count\": \"42\"}" }
Upvotes: 3
Views: 1593
Reputation: 466
Thanks to @eLRuLL I managed to find a decent solution:
items.py :
class StatsItem(scrapy.Item):
    """Item holding the view and comment counters of an article."""

    views_count = scrapy.Field()
    comments_count = scrapy.Field()
class ArticleItem(scrapy.Item):
    [...]
    # Identity() as the input processor passes the stats value through as-is.
    stats = scrapy.Field(input_processor=Identity())
class StatsItemLoader(ItemLoader):
    """Item loader that produces ``StatsItem`` objects."""

    default_item_class = StatsItem
    # Run remove_tags over every collected value on input; on output keep
    # the single value TakeFirst() selects.
    default_input_processor = MapCompose(remove_tags)
    default_output_processor = TakeFirst()
spider.py:
def parse(self, response):
    """Excerpt of the spider callback; ``[...]`` marks code left out."""
    [...]
    # Store whatever getStats() returns under the item's 'stats' field.
    stats = self.getStats(response)
    loader.add_value('stats', stats)
    [...]
def getStats(self, response):
    """Load the stats item for ``response`` and return it as a plain dict."""
    stats_loader = StatsItemLoader(response=response)
    stats_loader.add_xpath(
        'comments_count',
        "//div[@class='btn-count']//a/text()",
    )
    stats_loader.add_value('views_count', '42')
    stats_item = stats_loader.load_item()
    # Convert the loaded item to a dict before handing it to the caller.
    return dict(stats_item)
Originally it was not working because the input_processor for the stats field was MapCompose(remove_tags).
In order to serialize the object, you have to return dict(loader.load_item())
and not just loader.load_item().
Thanks !
Upvotes: 4