Reputation:
I am new to Scrapy and have spent the past week or more looking everywhere for a solution to my problem. I am trying to scrape the tabular data for UFC 1 at http://ufcstats.com/event-details/6420efac0578988b.
My spider is working fine and it returns each item field as a list of strings. For example: {..., 'winner': ['Royce Gracie', 'Jason DeLucia', 'Royce Gracie', 'Gerard Gordeau', 'Ken Shamrock', 'Royce Gracie', 'Kevin Rosier', 'Gerard Gordeau']}. When I export to CSV, the event winners/losers/other stats are written as a list of strings in a single row. I want to output each element in its own row. I have been able to sort this out in pandas, but that feels like unnecessary work and I doubt that it will scale well.
I would like the CSV output to be laid out the same way as the table on the page. I don't know whether this should be done in the spider itself, in the items/item loaders, or in pipelines.
This seems like a common issue, but I haven't been able to figure out a Scrapy solution.
I have tried iterating with for loops in the spider code, with my standard item loader, in item input and/or output processors, and everything else I could find in various examples on SO, but I haven't been able to achieve the desired output. I was able to troubleshoot other, earlier issues, though. I am quite stuck, and any help here would be greatly appreciated.
#items.py
import scrapy
from scrapy.loader.processors import (
    Compose,
    Identity,
    Join,
    MapCompose,
    TakeFirst,
)
def compact(s):
    """Return ``s`` unchanged when it is truthy, otherwise ``None``.

    Intended as a ``MapCompose`` step placed after ``str.strip``: a value
    that is empty once stripped is turned into ``None``. Per the Scrapy
    item-loader documentation, ``MapCompose`` ignores ``None`` results, so
    blank text nodes do not end up in the collected list.

    Args:
        s: The value to check (normally an already-stripped string).

    Returns:
        ``s`` itself if it is truthy, else ``None``.
    """
    return s if s else None
class StatsItem(scrapy.Item):
    """Item holding the values scraped from one event page.

    Every field declares only an input processor, so each loaded value is a
    list of the processed strings collected for that field.
    """

    # Event-level fields: each extracted string is stripped and then passed
    # through ``compact``, which maps empty strings to None.
    event_name = scrapy.Field(input_processor=MapCompose(str.strip, compact))
    event_date = scrapy.Field(input_processor=MapCompose(str.strip, compact))
    event_loc = scrapy.Field(input_processor=MapCompose(str.strip, compact))
    attendance = scrapy.Field(input_processor=MapCompose(str.strip, compact))
    f_info = scrapy.Field(input_processor=MapCompose(str.strip, compact))

    # Fighter names: surrounding whitespace is stripped, nothing else.
    winner = scrapy.Field(input_processor=MapCompose(str.strip))
    loser = scrapy.Field(input_processor=MapCompose(str.strip))
#spider code
import scrapy
from ..items import StatsItem
from scrapy.loader import ItemLoader
#from scrapy.loader.processors import Join, MapCompose, TakeFirst
class StatsSpider(scrapy.Spider):
    """Spider that follows one event link and loads that page into a StatsItem.

    ``parse`` picks a single event link from the listing page and
    ``parse_event`` loads the whole event page into one item, so every field
    of that item holds the list of all matches found on the page.
    """

    name = 'stats'
    allowed_domains = ['fcstats...']
    start_urls = ['http://fcstats.../']
    custom_settings = {
        # Specifies the exported fields and their column order.
        'FEED_EXPORT_FIELDS': [
            'event_name', 'event_date', 'event_loc', 'attendance',
            'winner',  # 'w_str', 'w_td', 'w_sub', 'w_pass', 'w_wclass', 'w_method', 'w_mthdtl', 'w_round', 'w_time',
            'loser',  # 'l_str', 'l_td', 'l_sub', 'l_pass', 'l_wclass', 'l_method', 'l_mthdtl', 'l_round', 'l_time',
            'f_info',
        ],
    }

    def parse(self, response):
        """Request the first event link of the reversed listing rows.

        Args:
            response: The listing page response.

        Yields:
            A single ``scrapy.Request`` for the event page, handled by
            ``parse_event``; nothing when no link is found.
        """
        # Rows are reversed so that the last row of the listing comes first.
        rev_orderd_events = response.css('tr.b-statistics__table-row')[::-1]

        # To crawl every event instead of just one:
        # for url in rev_orderd_events.css('i>a::attr(href)').extract():
        #     yield scrapy.Request(url=url, callback=self.parse_event)
        event_link = rev_orderd_events.css('i>a::attr(href)').extract_first()
        if event_link is None:
            # extract_first() returns None when nothing matches; building a
            # Request from None would raise instead of failing gracefully.
            self.logger.warning('No event link found on %s', response.url)
            return
        yield scrapy.Request(url=event_link, callback=self.parse_event)

    def parse_event(self, response):
        """Load one StatsItem per page container of an event page.

        Args:
            response: The event page response.

        Yields:
            One loaded ``StatsItem`` for each ``div.l-page__container``.
        """
        pg = response.css('div.l-page__container')
        for _container in pg:
            # The loader is bound to the whole response, so every selector
            # below collects all of its matches on the page into one list.
            il = ItemLoader(StatsItem(), response=response)
            il.add_css('event_name', 'h2.b-content__title>span::text')
            il.add_css('event_date', 'ul.b-list__box-list>li:nth-child(1)::text')
            il.add_css('event_loc', 'ul.b-list__box-list>li:nth-child(2)::text')
            il.add_css('attendance', 'ul.b-list__box-list>li:nth-child(3)::text')
            il.add_css('winner', 'p.b-fight-details__table-text:nth-child(odd)>a::text')
            il.add_css('loser', 'p.b-fight-details__table-text:nth-child(even)>a::text')
            il.add_css('f_info', 'td p.b-fight-details__table-text::text')
            yield il.load_item()
Actual result:
event_name event_date event_loc attendance winner loser f_info
UFC 1: The Beginning 12-Nov-93 Denver, Colorado, USA 2,800 Royce Gracie,Jason DeLucia,Royce Gracie,Gerard Gordeau,Ken Shamrock,Royce Gracie,Kevin Rosier,Gerard Gordeau Gerard Gordeau,Trent Jenkins,Ken Shamrock,Kevin Rosier,Patrick Smith,Art Jimmerson,Zane Frazier,Teila Tuli 1,0,1,0,1,0,2,0,Open Weight,SUB,Rear Naked Choke,1,1:44,3,1,1,0,1,0,1,0,Open Weight,SUB,Rear Naked Choke,1,0:52,0,0,0,0,1,0,2,0,Open Weight,SUB,Rear Naked Choke,1,0:57,11,0,0,0,0,0,0,0,Open Weight,KO/TKO,1,0:59,1,4,1,0,2,0,0,0,Open Weight,SUB,Heel Hook,1,1:49,0,0,1,0,0,0,2,0,Open Weight,SUB,Other,1,2:18,15,12,0,0,0,0,0,0,Open Weight,KO/TKO,1,4:20,3,0,0,0,0,0,0,0,Open Weight,KO/TKO,Kick,1,0:26
Expected Result would be more like:
event_name event_date event_loc attendance winner loser f_info
UFC 1: The Beginning 12-Nov-93 Denver, Colorado, USA 2,800 Royce Gracie, Gerard Gordeau, 1,0,1,0,1,0,2,0,Open Weight,SUB,Rear Naked Choke,1,1:44,
UFC 1: The Beginning 12-Nov-93 Denver, Colorado, USA 2,800 Jason DeLucia, Trent Jenkins 3,1,1,0,1,0,1,0,Open Weight,SUB,Rear Naked Choke,1,0:52 ....
*Edited for clarity
Upvotes: 0
Views: 461
Reputation:
thanks @umair and @Catalina_Chircu
def parse_event(self, response):
    """Yield one ``StatsItem`` per fight row of an event page.

    Each table row gets its own ``ItemLoader`` bound to that row, so the
    per-fight selectors only match inside the row and every fight is
    exported on its own line.

    NOTE(review): assumes ``StatsItem`` declares the ``w_*`` / ``l_*``
    fields loaded below - confirm against items.py.

    Args:
        response: The event page response.

    Yields:
        One loaded ``StatsItem`` for every ``tr`` after the first one in
        each ``div.b-fight-details`` block.
    """
    pg = response.css('div.l-page__container')
    # The event name is selected from the page container, not from the
    # event block, and does not change per fight, so look it up once.
    event_name = pg.css('h2.b-content__title>span::text').extract_first()

    name_selector = (
        'td.b-fight-details__table-col:nth-child(2) '
        'p.b-fight-details__table-text:nth-child({child})>a::text'
    )
    stat_selector = 'td.b-fight-details__table-col:nth-child({col})>p:nth-child({child})::text'
    # (field, table column, which <p> children of the cell to read).
    # Weight class, method, method detail, round and time are read from the
    # same paragraph(s) for both the winner and the loser field.
    stat_columns = (
        ('w_str', 3, 'odd'), ('l_str', 3, 'even'),
        ('w_td', 4, 'odd'), ('l_td', 4, 'even'),
        ('w_sub', 5, 'odd'), ('l_sub', 5, 'even'),
        ('w_pass', 6, 'odd'), ('l_pass', 6, 'even'),
        ('w_wclass', 7, '1'), ('l_wclass', 7, '1'),
        ('w_method', 8, 'odd'), ('l_method', 8, 'odd'),
        ('w_mthdtl', 8, 'even'), ('l_mthdtl', 8, 'even'),
        ('w_round', 9, 'odd'), ('l_round', 9, 'odd'),
        ('w_time', 10, 'odd'), ('l_time', 10, 'odd'),
    )

    for event in response.css('div.b-fight-details'):
        event_date = event.css('ul.b-list__box-list>li:nth-child(1)::text').extract()
        event_loc = event.css('ul.b-list__box-list>li:nth-child(2)::text').extract()
        attendance = event.css('ul.b-list__box-list>li:nth-child(3)::text').extract()

        # The first <tr> is skipped (presumably the table header row).
        for fights in event.css('tr')[1:]:
            il = ItemLoader(StatsItem(), selector=fights)
            il.add_value('event_name', event_name)
            il.add_value('event_date', event_date)
            il.add_value('event_loc', event_loc)
            il.add_value('attendance', attendance)
            il.add_css('winner', name_selector.format(child='odd'))
            il.add_css('loser', name_selector.format(child='even'))
            for field, col, child in stat_columns:
                il.add_css(field, stat_selector.format(col=col, child=child))
            yield il.load_item()
Together with the associated item input/output processors, this is giving me most of what I was hoping for.
Upvotes: 0
Reputation: 21351
I have been working with Scrapy for many years, and I find this Item class useless and very confusing, especially for those who are new to Scrapy.
In your case, you need to iterate over the winner and loser elements in a for loop and yield them one by one:
class StatsSpider(scrapy.Spider):
    """Spider that yields one plain dict per fighter of an event.

    Instead of loading a whole page into a single item, ``parse_event``
    iterates over the fighter links and yields a separate record for each
    one, so a CSV export gets one row per fighter.
    """

    name = 'stats'
    allowed_domains = ['ufcstats.com']
    start_urls = ['http://ufcstats.com/statistics/events/completed?page=all']

    def parse(self, response):
        """Request the first event link of the reversed listing rows.

        Args:
            response: The listing page response.

        Yields:
            A single ``scrapy.Request`` handled by ``parse_event``; nothing
            when no link is found.
        """
        rev_orderd_events = response.css('tr.b-statistics__table-row')[::-1]
        event_link = rev_orderd_events.css('i>a::attr(href)').extract_first()
        if event_link is None:
            # extract_first() returns None when nothing matches; building a
            # Request from None would raise instead of failing gracefully.
            self.logger.warning('No event link found on %s', response.url)
            return
        yield scrapy.Request(url=event_link, callback=self.parse_event)

    @staticmethod
    def _first_text(selector, query):
        """Return the first non-blank, stripped string matched by ``query``.

        Args:
            selector: A selector (or response) exposing ``.css()``.
            query: The CSS query to run against ``selector``.

        Returns:
            The first match that is non-empty after stripping, or ``None``
            when every match is blank or nothing matches.
        """
        for text in selector.css(query).extract():
            text = text.strip()
            if text:
                return text
        return None

    def parse_event(self, response):
        """Yield one record per winner link, then one per loser link.

        Args:
            response: The event page response.

        Yields:
            Dicts with the keys ``name``, ``type`` (``'winner'`` or
            ``'loser'``), ``event_name``, ``event_date`` and ``event_loc``.
        """
        fighter_selector = 'p.b-fight-details__table-text:nth-child({child})>a'
        pg = response.css('div.l-page__container')
        for match in pg:
            # Event details are read from the page container itself and are
            # shared by every fighter record yielded for it.
            event_name = self._first_text(match, 'h2.b-content__title>span::text')
            event_date = self._first_text(match, 'ul.b-list__box-list>li:nth-child(1)::text')
            event_loc = self._first_text(match, 'ul.b-list__box-list>li:nth-child(2)::text')

            # Odd paragraphs hold the winners and even paragraphs the losers;
            # all winners are yielded before the losers.
            for role, child in (('winner', 'odd'), ('loser', 'even')):
                for link in match.css(fighter_selector.format(child=child)):
                    yield {
                        'name': self._first_text(link, '::text'),
                        'type': role,
                        'event_name': event_name,
                        'event_date': event_date,
                        'event_loc': event_loc,
                    }
Upvotes: 1