Python Scrapy nested pages only need items from innermost page

Question

I am practicing Scrapy on a website with nested pages, and I only need to scrape the contents of the innermost page. Is there a way to carry data from the main parse function through to the function that parses the innermost page? In other words, I want to use several parse functions to open the intermediate pages, but only produce items from the last parse function, with the data carried over from the main parse function.

Here is what I have tried:

# Import the crawler dependencies, reporting a readable message if any is missing.
try:
    import scrapy
    try:
        from urlparse import urljoin  # Python 2 location
    except ImportError:
        from urllib.parse import urljoin  # Python 3 location
except ImportError:
    # Parenthesised single-argument print works as both the Python 2 statement
    # and the Python 3 function.
    print("\nERROR IMPORTING THE NESSASARY LIBRARIES\n")
    # Re-raise: continuing would only fail later with a NameError when the
    # spider class below references `scrapy`.
    raise



class CanadaSpider(scrapy.Spider):
    """Spider that walks two levels of index pages and scrapes the innermost page.

    ``parse`` and ``parse_pre1`` only yield follow-up requests; ``parse_info``
    is the single callback that fills in and returns an item.  A dict is
    forwarded from callback to callback through ``Request.meta``.
    """

    name = 'CananaSpider'
    start_urls = ['http://www.canada411.ca']

    def parse(self, response):
        """Page 1: follow every link in the alphabetical index.

        Yields one ``scrapy.Request`` per link, handled by ``parse_pre1``, each
        carrying its own empty dict under ``meta['item']``.
        """
        link_selector = 'a ::attr(href)'
        for entry in response.css('.c411AlphaLinks.c411NoPrint ul li'):
            href = entry.css(link_selector).extract_first()
            if href is None:
                # List entry without an anchor: nothing to follow.
                continue
            # Yield the request itself (not the empty dict) so that Scrapy
            # actually schedules the next page.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_pre1,
                meta={'item': {}},
            )

    def parse_pre1(self, response):
        """Page 2: follow every link in the column list to the detail page.

        Reads the dict stored under ``meta['item']`` and forwards a copy of it
        to ``parse_info`` under ``meta['page_2']``.
        """
        item = response.meta["item"]
        link_selector = 'a ::attr(href)'
        for entry in response.css('.clearfix.c411Column.c411Column3 ul li'):
            href = entry.css(link_selector).extract_first()
            if href is None:
                continue
            # The callback must be parse_info; pointing it back at parse_pre1
            # means the innermost page is never scraped.  Each request gets its
            # own copy of the dict so that detail pages do not overwrite one
            # another's fields.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_info,
                meta={'page_2': dict(item)},
            )

    def parse_info(self, response):
        """Innermost page: fill the forwarded dict and return it as the item.

        Each field holds the first node matched by its CSS selector, serialised
        by ``extract_first()``, or ``None`` when nothing matches.
        """
        item = response.meta["page_2"]
        item['name'] = response.css('.vcard__name').extract_first()
        item['address'] = response.css('.c411Address.vcard__address').extract_first()
        # NOTE(review): '.vcard.label' matches one element carrying both classes;
        # confirm against the page markup that this is the phone-number node.
        item['phoneno'] = response.css('.vcard.label').extract_first()
        return item

I am passing the item along from one callback to the next. What am I doing wrong?

Python Scrapy nested pages only need items from innermost page

Answers (1)

Related Questions