Extract data from two pages with Scrapy

Question

I have an agenda as a starting page. This page contains the start times and titles of events and links to the detail page of each event.

My spider extracts all the event details (description, location, etc.) from the detail page of each event, except the start time, which I have to extract from my start page.

How can I extract the start time from the start page and the other data from each detail page? What is the Scrapy way to do this? Using meta['item']? I don't get it... This is my spider for now. Any help greatly appreciated!

# First attempt: follows every event link found on the agenda page and scrapes
# only the detail pages, so nothing from the agenda page itself (such as the
# start time) ends up in the item.
# NOTE(review): the class-body indentation appears to have been lost in the
# paste; the attributes and both methods below are meant to be inside the class.
class LuSpider(scrapy.Spider):
name = "lu"
allowed_domains = ["example.com"]
start_urls = ["http://www.example.com/agenda"]

# Callback for the start URL: yields one request per event link.
def parse(self, response):  
    for href in response.css("div.toggle_container_show > div > a::attr('href')"):
        # Resolve the href against the current page URL before requesting it.
        url = response.urljoin(href.extract())
        yield scrapy.Request(url, callback=self.parse_agenda_contents)

# Callback for a detail page: yields one LuItem per div.container element.
def parse_agenda_contents(self, response):
    for sel in response.xpath('//div[@class="container"]'):
        item = LuItem()
        # Text of the <h1> that carries an id attribute inside the slider content.
        item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
        # Every <p> below the slider content (the elements, not just their text).
        item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
        yield item

Edit:

I tried to extract the start time from the start page using request.meta['item'], but I get a list of all the start times on the start page for every event. How do I get only the matching start time for each event? Can someone point me in the right direction?

# Second attempt: tries to carry the start time from the agenda page to the
# detail-page callback through request.meta['item'].
# NOTE(review): the class-body indentation appears to have been lost in the
# paste; the attributes and both methods below are meant to be inside the class.
class LuSpider(scrapy.Spider):
name = "lu"
allowed_domains = ["example.com"]
start_urls = ["http://www.example.com/agenda"]

# Callback for the start URL: yields one request per event link.
def parse(self, response):
        # A single item is created once, before the loop over the links.
        item = LuItem()
        # .extract() returns a list with every match on the page, so StartTime
        # holds the start times of all events rather than one value.
        item['StartTime'] = response.xpath('//div[contains(., "H")]/span/text()').extract()

        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            request = scrapy.Request(url, callback=self.parse_agenda_contents)
            # The same item object (with the full list) is attached to every
            # request, which is why each event ends up with all start times.
            request.meta['item'] = item         
            yield request

# Callback for a detail page: fills in title and description on the item that
# was passed along in the request meta and yields it.
def parse_agenda_contents(self, response):
    for sel in response.xpath('//div[@class="container"]'):
        item = response.meta['item']
        item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
        item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
        yield item

jacquesseite · Accepted Answer

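This worked: extract the start times and the detail-page URLs from the agenda page as two parallel lists, zip them together, and attach a fresh item holding a single start time to each request: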

class LuSpider(scrapy.Spider):
    """Scrape agenda events: start time from the agenda page, the rest from
    each event's detail page.

    The start time is only shown on the agenda (start) page, so ``parse``
    creates one item per event, stores the start time in it and hands the
    item to the detail-page callback through ``request.meta['item']``.
    """

    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        """Yield one detail-page request per event listed on the agenda page.

        Each request carries its own ``LuItem`` (pre-filled with
        ``StartTime``) in ``request.meta['item']``.
        """
        start_times = response.xpath(
            '//div[@class="toggle_container_show"]/div/span/text()'
        ).extract()
        urls = response.xpath(
            '//div[@class="toggle_container_show"]/div/a/@href'
        ).extract()

        # Both lists come from the same rows in document order, so the n-th
        # start time is paired with the n-th link.
        # NOTE(review): assumes every row has exactly one <span> and one <a>;
        # otherwise the pairs drift out of alignment -- confirm on the page.
        for start_time, url in zip(start_times, urls):
            # A fresh item per event, so requests never share state.
            item = LuItem()
            item['StartTime'] = start_time
            # hrefs may be relative; resolve them against the page URL, since
            # a Request with a scheme-less URL is rejected.
            request = scrapy.Request(
                response.urljoin(url),
                callback=self.parse_agenda_contents,
            )
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        """Complete the item passed in ``response.meta['item']`` with the
        title and description found on the detail page, and yield it.

        One item is yielded per ``div.container`` element on the page.
        """
        for sel in response.xpath('//div[@class="container"]'):
            # Work on a copy so that several containers on one page do not
            # overwrite each other's fields on a single shared object.
            item = response.meta['item'].copy()
            item['EventTitle'] = sel.xpath(
                'div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()'
            ).extract()
            item['Description'] = sel.xpath(
                'div[@class="content"]/div/div[@class="sliderContent"]//p'
            ).extract()
            yield item

Extract data from two pages with Scrapy

Answers (2)

Related Questions