Reputation: 7789
How can I tell Scrapy to divide all yielded items into two lists? For instance, let's say I have two main types of items: `article` and `author`. I want to have them in two separate lists. Right now I am getting this JSON output:
[
{
"article_title":"foo",
"article_published":"1.1.1972",
"author": "John Doe"
},
{
"name": "John Doe",
"age": 42,
"email": "[email protected]"
}
]
How do I convert it to something like this?
{
"articles": [
{
"article_title": "foo",
"article_published": "1.1.1972",
"author": "John Doe"
}
],
"authors": [
{
"name": "John Doe",
"age": 42,
"email": "[email protected]"
}
]
}
My functions for outputting these are simple, similar to this:
def parse_author(self, response):
    """Extract the author's name from the page and yield it as an item.

    Args:
        response: Object exposing ``css(query)``; the first match of
            ``div.author-info a::text`` is read with ``extract_first()``.

    Yields:
        dict: A single ``{'author_name': <name>}`` mapping.
    """
    name = response.css('div.author-info a::text').extract_first()
    print("Parsing author: {}".format(name))
    yield {
        'author_name': name,
    }
Upvotes: 1
Views: 4444
Reputation: 5240
With this setup, the items reach the pipeline separately, and you can handle each type accordingly there:
items.py
class Article(scrapy.Item):
    """Scrapy item describing an article.

    Declares three fields: ``title``, ``published`` and ``author``.
    """

    title = scrapy.Field()
    published = scrapy.Field()
    author = scrapy.Field()
class Author(scrapy.Item):
    """Scrapy item describing an author.

    Declares ``name``, ``age`` and ``email``.
    """

    name = scrapy.Field()
    age = scrapy.Field()
    # The author records shown in the question carry an "email" value;
    # a Scrapy Item rejects assignment to undeclared fields, so declare it.
    email = scrapy.Field()
spider.py
def parse(self, response):
    """Yield an ``items.Author`` followed by an ``items.Article``.

    The author's name is the first match of ``div.author-info a::text``;
    the article's title is the first match of ``article css``. Each item
    is announced with ``print`` before it is yielded.
    """
    # Author item: populated through the constructor.
    author_item = items.Author(
        name=response.css('div.author-info a::text').extract_first()
    )
    print("Parsing author: {}".format(author_item['name']))
    yield author_item

    # Article item: same pattern, different selector.
    article_item = items.Article(
        title=response.css('article css').extract_first()
    )
    print("Parsing article: {}".format(article_item['title']))
    yield article_item
pipelines.py
def process_item(self, item, spider):
    """Branch on the type of a scraped item, then pass the item on.

    Args:
        item: The scraped item; checked against ``items.Author`` and
            ``items.Article``.
        spider: The spider that produced ``item`` (not used here).

    Returns:
        The same ``item`` that was passed in.
    """
    if isinstance(item, items.Author):
        # Do something to authors
        pass
    elif isinstance(item, items.Article):
        # Do something to articles
        pass
    # Return the item so that whatever runs after this stage still
    # receives it.
    return item
However, I would suggest this structure instead:
[{
"title": "foo",
"published": "1.1.1972",
"authors": [
{
"name": "John Doe",
"age": 42,
"email": "[email protected]"
},
{
"name": "Jane Doe",
"age": 21,
"email": "[email protected]"
}
]
}]
This puts everything into a single item.
items.py
class Article(scrapy.Item):
    """Scrapy item describing an article with its authors attached.

    Declares three fields: ``title``, ``published`` and ``authors``.
    """

    title = scrapy.Field()
    published = scrapy.Field()
    authors = scrapy.Field()
spider.py
def parse(self, response):
    """Yield one ``items.Article`` that carries its authors as a list.

    The values are hard-coded sample data; ``response`` is not read.
    """
    # One sample author as a plain dict.
    john = {
        'name': "John Doe",
        'age': 42,
        'email': "[email protected]",
    }
    print("Parsing author: {}".format(john['name']))
    author_list = [john]

    item = items.Article()
    item['title'] = "foo"
    item['published'] = "1.1.1972"
    print("Parsing article: {}".format(item['title']))
    item['authors'] = author_list
    yield item
Upvotes: 3
Reputation: 21261
# Flat list of scraped records: articles and authors mixed together.
raw = [
    {
        "article_title": "foo",
        "article_published": "1.1.1972",
        "author": "John Doe",
    },
    {
        "name": "John Doe",
        "age": 42,
        "email": "[email protected]",
    },
]

# Split the records into two lists keyed by kind.
data = {'articles': [], 'authors': []}
for record in raw:
    # A record is an article when it has an "article_title" key; anything
    # else is treated as an author.
    if 'article_title' in record:
        data['articles'].append(record)
    else:
        data['authors'].append(record)
Upvotes: 1