rmercier
rmercier

Reputation: 175

Duplicate items saved when using nested parsers in Scrapy

I'm having an issue with Scrapy and the way it outputs items.

Here is my items.py:

import scrapy
class Club(scrapy.Item):
    """Item describing one club, including a nested list of its players."""
    # Text of the '.nom_sportif' element on the club page.
    name = scrapy.Field()
    # URL of the club page the item was built from.
    url = scrapy.Field()
    # 'src' attribute of the club logo image.
    logo = scrapy.Field()
    # List of plain dicts, one per player (keys: url, name, number, photo).
    players = scrapy.Field()

Here is my only spider:

import scrapy
from lequipefr.items import Club


class NamesSpider(scrapy.Spider):
    """Spider that scrapes a club page and then each of its player pages.

    One ``Club`` item is created per club page and handed to every player
    request through ``request.meta['item']``; each player callback appends
    to that same item and yields it.
    """
    name = "names"
    allowed_domains = ['lequipe.fr']

    def start_requests(self):
        """Yield the initial request(s) for the club page, handled by ``parse_club``."""
        urls = ['https://www.lequipe.fr/Football/FootballFicheClub26.html']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_club)


    def parse_club(self, response):
        """Create the ``Club`` item and yield one follow-up request per player link.

        The item itself is not yielded here; only the player requests are.
        """
        club = Club()
        club['url'] = response.url
        club['name'] = response.css('.nom_sportif::text').get()
        club['logo'] = response.css('.visuels-club').xpath('./figure/img/@src').get()
        # Starts empty; parse_player appends one dict per player page.
        club['players'] = []
        for href in response.css('.effectifclub').css('.nom').xpath('./a/@href').getall():
            request = response.follow(href, callback=self.parse_player)
            # Every player request carries a reference to the same club object.
            request.meta['item'] = club
            yield request


    def parse_player(self, response):
        """Append this player's data to the shared club item and yield the item.

        Because the club object is shared by all player requests and is
        yielded at the end of every call, the same item is emitted once per
        player response, with the ``players`` list longer each time.
        """
        club = response.meta['item']
        playerDict = {}
        playerDict['url'] = response.url
        playerDict['name'] = response.css('.nom_sportif::text').get()
        # NOTE(review): this XPath starts with '//', so it is not written
        # relative to the '.identite' selection - confirm that is intended.
        playerDict['number'] = response.css('.identite').xpath("//*[contains(text(), 'Numéro')]").xpath('./strong/text()').get()
        playerDict['photo'] = response.css('.visuel').xpath('./figure/img/@src').get()
        club['players'].append(playerDict)
        # Yields the shared item on every player page, not once at the end.
        yield club

And here is my JSON output:

[
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur27703.html", "name": "Angel Di Maria", "number": "11", "photo": "//medias.lequipe.fr/img-sportif-foot/27703/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur27703.html", "name": "Angel Di Maria", "number": "11", "photo": "//medias.lequipe.fr/img-sportif-foot/27703/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur26337.html", "name": "Edinson Cavani", "number": "9", "photo": "//medias.lequipe.fr/img-sportif-foot/26337/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur27703.html", "name": "Angel Di Maria", "number": "11", "photo": "//medias.lequipe.fr/img-sportif-foot/27703/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur26337.html", "name": "Edinson Cavani", "number": "9", "photo": "//medias.lequipe.fr/img-sportif-foot/26337/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur58221.html", "name": "Kylian Mbapp\u00e9", "number": "7", "photo": "//medias.lequipe.fr/img-sportif-foot/58221/110"}]}
]

Instead, this is what I would like my output to be:

[
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur27703.html", "name": "Angel Di Maria", "number": "11", "photo": "//medias.lequipe.fr/img-sportif-foot/27703/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur26337.html", "name": "Edinson Cavani", "number": "9", "photo": "//medias.lequipe.fr/img-sportif-foot/26337/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur58221.html", "name": "Kylian Mbapp\u00e9", "number": "7", "photo": "//medias.lequipe.fr/img-sportif-foot/58221/110"}]}
]

As you can see, instead of appending the "player" dicts to the same item and then yielding it once, I end up with item duplicates for each iteration in my .json output file.

How would I go about getting this kind of nested structure in my item without the duplicates in my output?

Upvotes: 0

Views: 70

Answers (1)

vezunchik
vezunchik

Reputation: 3717

You yield the club item once for every player page, so the same (progressively growing) item is written to the output once per player. That is where the duplicates come from.

I recommend using the inline_requests library (documentation: https://pypi.org/project/scrapy-inline-requests/). It lets you request the players' pages from inside the parent callback and receive their responses there, so the club item can be completed and yielded once from that same function.

Check this working solution:

# -*- coding: utf-8 -*-
import scrapy
from inline_requests import inline_requests


class NamesSpider(scrapy.Spider):
    """Spider that emits one club dict with all of its players nested inside.

    Player pages are requested from within ``parse_club`` itself (the method
    is wrapped with ``inline_requests``), so the club dict is yielded a
    single time, after the loop over the player links has finished.
    """

    name = "names"
    allowed_domains = ['lequipe.fr']

    def start_requests(self):
        """Yield the initial club-page request(s), handled by ``parse_club``."""
        seed_urls = ['https://www.lequipe.fr/Football/FootballFicheClub26.html']
        for seed_url in seed_urls:
            yield scrapy.Request(url=seed_url, callback=self.parse_club)

    @inline_requests
    def parse_club(self, response):
        """Collect the club data plus one dict per player, then yield the club once."""
        club = {
            'url': response.url,
            'name': response.css('.nom_sportif::text').get(),
            'logo': response.css('.visuels-club').xpath('./figure/img/@src').get(),
            'players': [],
        }

        player_hrefs = response.css('.effectifclub').css('.nom').xpath('./a/@href').getall()
        for player_href in player_hrefs:
            player_url = response.urljoin(player_href)
            # The value sent back for the yielded Request is treated as the
            # player page's response: it is queried with .css()/.xpath() below.
            player_page = yield scrapy.Request(player_url)

            # NOTE(review): the XPath below starts with '//', so it is not
            # written relative to the '.identite' selection - confirm intended.
            identity = player_page.css('.identite')
            number_label = identity.xpath(u"//*[contains(text(), 'Numéro')]")

            club['players'].append({
                'url': player_url,
                'name': player_page.css('.nom_sportif::text').get(),
                'number': number_label.xpath('./strong/text()').get(),
                'photo': player_page.css('.visuel').xpath('./figure/img/@src').get(),
            })

        # Single yield: the club is exported only after every player was added.
        yield club

Upvotes: 1

Related Questions