Reputation: 525
I have an agenda page as my start page. It lists the start times and titles of events, with links to the detail page of each event.
My spider extracts all the event details (description, location, etc.) from each event's detail page, except the start time, which I have to extract from the start page.
How can I extract the start time from the start page and the other data from each detail page? What is the Scrapy way to do this? Using meta['item']? I don't get it... This is my spider so far. Any help greatly appreciated!
import scrapy

from myproject.items import LuItem  # adjust to your project's items module


class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        # Follow the link to each event's detail page
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_agenda_contents)

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = LuItem()
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
Edit:
I tried to extract the start time from the start page using request.meta['item'], but each event's item ends up with a list of all the start times on the start page. How do I get the start time for each individual event?
Can someone show me the right direction?
import scrapy

from myproject.items import LuItem  # adjust to your project's items module


class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        item = LuItem()
        # This extracts every start time on the page into one list
        item['StartTime'] = response.xpath('//div[contains(., "H")]/span/text()').extract()
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            request = scrapy.Request(url, callback=self.parse_agenda_contents)
            request.meta['item'] = item  # the same item instance is attached to every request
            yield request

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
Upvotes: 2
Views: 833
Reputation: 525
This worked:
import scrapy
from scrapy import Request

from myproject.items import LuItem  # adjust to your project's items module


class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        # Pair each start time with the corresponding detail-page link
        start_times = response.xpath('//div[@class="toggle_container_show"]/div/span/text()').extract()
        urls = response.xpath('//div[@class="toggle_container_show"]/div/a/@href').extract()
        for start_time, url in zip(start_times, urls):
            item = LuItem()
            item['StartTime'] = start_time
            # urljoin makes the href absolute if it happens to be relative
            request = Request(response.urljoin(url), callback=self.parse_agenda_contents)
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
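Note: on Scrapy 1.7 and later, cb_kwargs is the documented alternative to request.meta for passing data to a callback. A minimal sketch of the same pattern, reusing the selectors and LuItem from above:

import scrapy


class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        start_times = response.xpath('//div[@class="toggle_container_show"]/div/span/text()').extract()
        urls = response.xpath('//div[@class="toggle_container_show"]/div/a/@href').extract()
        for start_time, url in zip(start_times, urls):
            yield scrapy.Request(
                response.urljoin(url),
                callback=self.parse_agenda_contents,
                cb_kwargs={'start_time': start_time},  # delivered to the callback as a keyword argument
            )

    def parse_agenda_contents(self, response, start_time):
        for sel in response.xpath('//div[@class="container"]'):
            item = LuItem()
            item['StartTime'] = start_time
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item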
Upvotes: 1
Reputation: 3875
You are right: using meta will do it in your case. See the official documentation here: http://doc.scrapy.org/en/latest/topics/request-response.html#passing-additional-data-to-callback-functions
def parse_page1(self, response):
    item = MyItem()
    item['main_url'] = response.url
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
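One caveat, given the edit in your question: build a fresh item per request, inside the loop. If a single item instance is attached to every request (as in your first meta attempt), all the callbacks mutate the same object. A minimal sketch of the fix, reusing the names from your spider:

def parse(self, response):
    for href in response.css("div.toggle_container_show > div > a::attr('href')"):
        item = LuItem()  # one fresh item per event, not one shared instance
        request = scrapy.Request(response.urljoin(href.extract()),
                                 callback=self.parse_agenda_contents)
        request.meta['item'] = item
        yield request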
Upvotes: 3