Reputation: 434
I am trying to parse information from multiple pages using Scrapy, but it doesn't seem to yield an item after it is finished. What could be the issue?
# Spider from the question: it requests a player's profile page, then the
# player's stats page, then one stats page per season, passing the same item
# along through cb_kwargs.
# NOTE(review): this snippet has lost its indentation, so the nesting described
# in the comments below is inferred from the def/for/if keywords -- confirm
# against the original post.
class TransfersSpider(scrapy.Spider):
name = "transfers"
# NOTE(review): this is a single string, whereas the answer's version of this
# spider wraps the URL in a list -- confirm which form Scrapy expects.
start_urls = 'https://www.transfermarkt.com/kevin-de-bruyne/profil/spieler/88755'
# parse: creates the item, stores the player name in it, and requests the
# stats page with the item attached as a callback keyword argument.
def parse(self, response):
item = PlayerTransfersItem()
# info_table is assigned here but not read again anywhere in this method.
info_table = response.css('div[class="info-table info-table--right-space "]') or response.css('div[class="large-6 large-pull-6 small-12 columns spielerdatenundfakten"]')
# Name = stripped <h1> text, a space, then the text of a following <strong> (empty string if absent).
item["name"] = response.xpath('//h1/text()').get().strip() + " " + response.xpath('//h1//following-sibling::strong/text()').get(default = "")
stats_url = response.url.replace('profil', 'leistungsdaten') # this URL is used to find all the seasons this player played in
yield scrapy.Request(stats_url, callback= self.parse_played_seasons, cb_kwargs={"item": item})
def parse_played_seasons(self, response, item): #Obtain every season the player has played in
# Start with an empty list; parse_season appends one entry per season to it.
item["seasons_stats"] = list()
seasons = response.css('div[class="inline-select"] > select[name="saison"] >option::attr(value)').getall() # Total seasons player played in
for season in seasons: # parse stats from each season
url = f"{response.url}/plus/0?saison={season}"
yield scrapy.Request(url, callback=self.parse_season, cb_kwargs= {"item": item, "season": season})
# NOTE(review): this yield sits in the same method that only *schedules* the
# season requests; parse_season fills item["seasons_stats"] later, when each
# response is handled -- presumably why the item yielded here has no stats.
# TODO confirm.
yield item #This returns a None value
# parse_season: reads the totals from the first stats table on a season page
# and appends them to the shared item. It never yields anything itself.
def parse_season(self, response, item, season):
tables = response.css('div[class="box"] > div[class="responsive-table"]')
# Text of the first <td> in the first table; used below to detect a "Total" row.
total_table = tables[0].css('tr> td::text').get()
if "Total" in total_table: #If there is a table with a row showing total results
# First, second and third 'zentriert' cell texts are used as appearances, goals and assists.
appearances_total_table = tables[0].css('tr> td[class="zentriert"]::text').get()
goals_total_table = tables[0].css('tr> td[class="zentriert"]::text')[1].get()
assists_total_table = tables[0].css('tr> td[class="zentriert"]::text')[2].get()
# NOTE(review): minutes_total_table is read below but is not assigned anywhere
# in this snippet.
season_stats = { season:{"total_goals": goals_total_table,"total_assists" : assists_total_table, "minutes_played": minutes_total_table,
"appearances": appearances_total_table}}
item["seasons_stats"].append(season_stats)
I want to get the player's stats for each season, so why does it return a None value? And when I place the yield in the parse_season
function instead, it returns duplicates of the item, one for each season.
Upvotes: 0
Views: 447
Reputation: 434
First, add this function to your pipeline. It merges the season_stats lists
of all items that share the same player name, dropping duplicate entries.
import json
def combine(L):
    """Merge items that share a player name into one record per player.

    Args:
        L: iterable of mapping-like items, each with a ``"name"`` key and a
            ``"season_stats"`` key holding a list of per-season entries.

    Returns:
        A list with one plain ``dict`` per distinct name, in first-seen
        order. A name seen once keeps all of its fields; a name seen more
        than once is reduced to ``"name"`` and the merged, de-duplicated
        ``"season_stats"`` list.
    """
    results = {}
    for item in L:
        key = item["name"]
        if key not in results:
            # Store a plain-dict copy rather than the caller's object so the
            # result is always JSON-serializable and never aliases the input.
            results[key] = dict(item)
            continue
        # Newest stats go first; an entry is kept only at its last position,
        # so exact duplicates collapse to a single entry. Entries are dicts
        # (unhashable), hence the list-based membership test.
        total_ = item["season_stats"] + results[key]["season_stats"]
        total = [
            entry
            for position, entry in enumerate(total_)
            if entry not in total_[position + 1:]
        ]
        results[key] = {"name": item["name"], "season_stats": total}
    return list(results.values())
Then modify your pipeline to append each item to a list, run this function on that list when the spider closes, and dump the result to a JSON file.
class Pipeline:
    """Item pipeline that buffers every item and writes one merged JSON file.

    Items are collected in ``self.players`` while the spider runs; when the
    spider closes they are passed through ``combine`` and dumped to
    ``myjsonfile.json``.
    """

    def __init__(self):
        # Every item seen during the crawl, in arrival order.
        self.players = []

    def process_item(self, item, spider):
        """Remember ``item`` and hand it on unchanged."""
        self.players.append(item)
        # Return the item so that later pipelines and the scraped-item log
        # receive it instead of None.
        return item

    def close_spider(self, spider):
        """Print the buffered items and write the combined result as JSON."""
        print(self.players)
        with open("myjsonfile.json", "wt") as fd:
            json.dump(combine(self.players), fd)
Output json file:
[{"name": "#9 Erling Haaland", "season_stats":
[{"2021": {"Appearances": "30", "Goals": "29", "Assists": "8"}},
{"2020": {"Appearances": "41", "Goals": "41", "Assists": "12"}},
{"2022": {"Appearances": "10", "Goals": "14", "Assists": "1"}}]},
{"name": "#17 Kevin De Bruyne", "season_stats":
[{"2020": {"Appearances": "40", "Goals": "10", "Assists": "18"}},
{"2021": {"Appearances": "45", "Goals": "19", "Assists": "14"}},
{"2022": {"Appearances": "10", "Goals": "1", "Assists": "8"}}]}]
Upvotes: 0
Reputation: 17291
Try collecting the data from the first pages in a regular dictionary and passing that dictionary as the argument to cb_kwargs. Then, on the final page, you can create the Item and transfer that data into it. I also found some of your CSS and XPath expressions overly complicated and difficult to decipher, so I simplified some of them in my example below.
You also should yield the item only once you have reached the last page and collected all of the data.
For example:
class TransfersSpider(scrapy.Spider):
    """Scrape one item per season from a player's Transfermarkt stats pages.

    The crawl goes profile page -> stats overview page -> one stats page per
    season. The player name found on the profile page is carried along via
    ``cb_kwargs`` and the item is only created on the last page.
    """

    name = "transfers"
    start_urls = ['https://www.transfermarkt.com/kevin-de-bruyne/profil/spieler/88755']

    def parse(self, response):
        """Read the player name and request the stats overview page."""
        name = response.xpath('//h1//text()').getall()
        # Join the non-blank text fragments of the <h1> into a single string.
        name = ' '.join([i.strip() for i in name if i.strip()])
        stats_url = response.url.replace('profil', 'leistungsdaten')
        yield scrapy.Request(stats_url, callback=self.parse_played_seasons, cb_kwargs={"name": name})

    def parse_played_seasons(self, response, **kwargs):
        """Request one stats page for every <option> value on the page."""
        seasons = response.xpath('//option/@value').getall()
        for season in seasons:
            url = f"{response.url}/plus/0?saison={season}"
            # Build a fresh dict per request instead of mutating one shared
            # dict, so no request depends on how cb_kwargs is stored.
            season_kwargs = {**kwargs, "season": season}
            yield scrapy.Request(url, callback=self.parse_season, cb_kwargs=season_kwargs)

    def parse_season(self, response, name=None, season=None, **kwargs):
        """Build and yield the item for one season from the table footer."""
        footer = response.xpath('//div[@id="yw1"]//tfoot')
        fields = ["Appearances", "Goals", "Assists", "Yellow Cards", "Second Yellow Cards", "Red Cards"]
        values = footer.xpath('./tr/td[@class="zentriert"]/text()').getall()
        right_aligned = footer.xpath("./tr/td[@class='rechts']/text()").getall()
        if len(right_aligned) < 2:
            # No totals row on this page: skip the season with a log message
            # rather than failing the callback with an IndexError.
            self.logger.warning("No totals row found for season %s at %s", season, response.url)
            return
        # Labels are paired with the centred footer cells in page order.
        season_stats = dict(zip(fields, values))
        # NOTE(review): the sample output shows values such as 41.606' under
        # this key, which look like minutes played rather than seconds -- the
        # key is left unchanged; confirm before renaming.
        season_stats["Seconds"] = right_aligned[1]
        item = PlayerTransfersItem()
        item['name'] = name
        item['season'] = season
        item['season_stats'] = season_stats
        yield item
pipelines.py
import json
class Pipeline:
    """Item pipeline that groups items by player name and dumps them as JSON.

    ``self.players`` maps each player name to the list of items (as plain
    dicts) scraped for that player; the mapping is written to
    ``myjsonfile.json`` when the spider closes.
    """

    def __init__(self):
        # name -> list of item dicts, in arrival order.
        self.players = {}

    def process_item(self, item, spider):
        """File a dict copy of ``item`` under its name and pass the item on."""
        name = item["name"]
        # dict(item) stores a plain, JSON-serializable copy of the item.
        self.players.setdefault(name, []).append(dict(item))
        return item

    def close_spider(self, spider):
        """Print the grouped items and write them to the JSON file."""
        print(self.players)
        with open("myjsonfile.json", "wt") as fd:
            json.dump(self.players, fd)
OUTPUT
2022-09-20 11:43:53 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-09-20 11:43:53 [scrapy.extensions.telnet] INFO: Telnet Password: 93fbcda50cf7bb12
2022-09-20 11:43:53 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2022-09-20 11:43:53 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2022-09-20 11:43:53 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2022-09-20 11:43:53 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2022-09-20 11:43:53 [scrapy.core.engine] INFO: Spider opened
2022-09-20 11:43:53 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-09-20 11:43:53 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-09-20 11:43:54 [filelock] DEBUG: Attempting to acquire lock 2507146387152 on C:\Users\asp\Documents\Code\spiders\venv\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.js
on.lock
2022-09-20 11:43:54 [filelock] DEBUG: Lock 2507146387152 acquired on C:\Users\asp\Documents\Code\spiders\venv\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-09-20 11:43:54 [filelock] DEBUG: Attempting to release lock 2507146387152 on C:\Users\asp\Documents\Code\spiders\venv\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.js
on.lock
2022-09-20 11:43:54 [filelock] DEBUG: Lock 2507146387152 released on C:\Users\asp\Documents\Code\spiders\venv\lib\site-packages\tldextract\.suffix_cache/publicsuffix.org-tlds\de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-09-20 11:43:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/profil/spieler/88755> (referer: None)
2022-09-20 11:43:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755> (referer: https://www.transfermarkt.com/kevin-de-bruyne/profil/spieler/88755)
2022-09-20 11:43:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=ges> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/sp
ieler/88755)
2022-09-20 11:43:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=ges>
{'name': '#17 Kevin De Bruyne',
'season': 'ges',
'season_stats': {'Appearances': '546',
'Assists': '213',
'Goals': '134',
'Red Cards': '-',
'Second Yellow Cards': '2',
'Seconds': "41.606'",
'Yellow Cards': '53'}}
2022-09-20 11:43:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2015> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2015>
{'name': '#17 Kevin De Bruyne',
'season': '2015',
'season_stats': {'Appearances': '45',
'Assists': '15',
'Goals': '17',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "3.486'",
'Yellow Cards': '4'}}
2022-09-20 11:43:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2016> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2016>
{'name': '#17 Kevin De Bruyne',
'season': '2016',
'season_stats': {'Appearances': '49',
'Assists': '21',
'Goals': '7',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "3.910'",
'Yellow Cards': '6'}}
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2017> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2017>
{'name': '#17 Kevin De Bruyne',
'season': '2017',
'season_stats': {'Appearances': '52',
'Assists': '21',
'Goals': '12',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "4.180'",
'Yellow Cards': '6'}}
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2010> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2009> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2011> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2008> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2018> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2013> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2010>
{'name': '#17 Kevin De Bruyne',
'season': '2010',
'season_stats': {'Appearances': '35',
'Assists': '17',
'Goals': '6',
'Red Cards': '-',
'Second Yellow Cards': '1',
'Seconds': "2.563'",
'Yellow Cards': '3'}}
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2009>
{'name': '#17 Kevin De Bruyne',
'season': '2009',
'season_stats': {'Appearances': '40',
'Assists': '4',
'Goals': '3',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "1.992'",
'Yellow Cards': '2'}}
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2011>
{'name': '#17 Kevin De Bruyne',
'season': '2011',
'season_stats': {'Appearances': '36',
'Assists': '15',
'Goals': '8',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "2.990'",
'Yellow Cards': '3'}}
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2008>
{'name': '#17 Kevin De Bruyne',
'season': '2008',
'season_stats': {'Appearances': '2',
'Assists': '-',
'Goals': '-',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "15'",
'Yellow Cards': '-'}}
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2019> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2018>
{'name': '#17 Kevin De Bruyne',
'season': '2018',
'season_stats': {'Appearances': '32',
'Assists': '11',
'Goals': '6',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "1.828'",
'Yellow Cards': '2'}}
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2013>
{'name': '#17 Kevin De Bruyne',
'season': '2013',
'season_stats': {'Appearances': '27',
'Assists': '8',
'Goals': '3',
'Red Cards': '-',
'Second Yellow Cards': '1',
'Seconds': "2.017'",
'Yellow Cards': '4'}}
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2014> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2019>
{'name': '#17 Kevin De Bruyne',
'season': '2019',
'season_stats': {'Appearances': '48',
'Assists': '23',
'Goals': '16',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "3.826'",
'Yellow Cards': '5'}}
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2014>
{'name': '#17 Kevin De Bruyne',
'season': '2014',
'season_stats': {'Appearances': '51',
'Assists': '28',
'Goals': '16',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "4.550'",
'Yellow Cards': '7'}}
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2020> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2021> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2020>
{'name': '#17 Kevin De Bruyne',
'season': '2020',
'season_stats': {'Appearances': '40',
'Assists': '18',
'Goals': '10',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "3.043'",
'Yellow Cards': '3'}}
2022-09-20 11:43:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2022> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2021>
{'name': '#17 Kevin De Bruyne',
'season': '2021',
'season_stats': {'Appearances': '45',
'Assists': '14',
'Goals': '19',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "3.338'",
'Yellow Cards': '4'}}
2022-09-20 11:43:56 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2022>
{'name': '#17 Kevin De Bruyne',
'season': '2022',
'season_stats': {'Appearances': '10',
'Assists': '8',
'Goals': '1',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "782'",
'Yellow Cards': '-'}}
2022-09-20 11:43:57 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2012> (referer: https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/s
pieler/88755)
2022-09-20 11:43:57 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.com/kevin-de-bruyne/leistungsdaten/spieler/88755/plus/0?saison=2012>
{'name': '#17 Kevin De Bruyne',
'season': '2012',
'season_stats': {'Appearances': '34',
'Assists': '10',
'Goals': '10',
'Red Cards': '-',
'Second Yellow Cards': '-',
'Seconds': "3.086'",
'Yellow Cards': '4'}}
2022-09-20 11:43:57 [scrapy.core.engine] INFO: Closing spider (finished)
2022-09-20 11:43:57 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8755,
'downloader/request_count': 18,
'downloader/request_method_count/GET': 18,
'downloader/response_bytes': 1298265,
'downloader/response_count': 18,
'downloader/response_status_count/200': 18,
'elapsed_time_seconds': 3.990176,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 9, 20, 18, 43, 57, 321615),
'httpcompression/response_bytes': 6118618,
'httpcompression/response_count': 18,
'item_scraped_count': 16,
'log_count/DEBUG': 39,
'log_count/INFO': 10,
'request_depth_max': 2,
'response_received_count': 18,
'scheduler/dequeued': 18,
'scheduler/dequeued/memory': 18,
'scheduler/enqueued': 18,
'scheduler/enqueued/memory': 18,
'start_time': datetime.datetime(2022, 9, 20, 18, 43, 53, 331439)}
2022-09-20 11:43:57 [scrapy.core.engine] INFO: Spider closed (finished)
You can add an item pipeline like the one above to store the results for each generated item and dump them all to a JSON file. Be sure to activate the pipeline in your settings.py file.
# Pipeline classes to enable, keyed by import path; the value is the number
# assigned to that pipeline.
ITEM_PIPELINES = {
    'myproject.pipelines.Pipeline': 100,
}
Upvotes: 1