Reputation: 581
I am practicing Scrapy on a website with nested pages, and I only need to scrape the contents of the innermost page. Is there a way to carry data between the parse functions, from the main parse function through to the one that parses the innermost page? In other words, I want to use several parse functions to open the pages, but only collect items in the last parse function, with the item carried over from the main parse function.
Here is what I have tried:
# Guarded imports for the spider below.
# urljoin lives in urllib.parse on Python 3 and in urlparse on Python 2,
# so try the modern location first and fall back to the legacy one.
try:
    import scrapy
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin
except ImportError:
    # Call-form print with a single argument behaves the same on Python 2 and 3.
    print("\nERROR IMPORTING THE NESSASARY LIBRARIES\n")
    # Re-raise: continuing would only fail later with a less helpful
    # NameError when `scrapy.Spider` is referenced by the class definition.
    raise
class CanadaSpider(scrapy.Spider):
    """Spider that starts at canada411.ca and walks nested link lists.

    Three callbacks are defined: `parse` (start page), `parse_pre1`
    (second-level page) and `parse_info` (fills the item fields).
    """
    name = 'CananaSpider'
    start_urls = ['http://www.canada411.ca']

    # Page 1 of the nested website: get each link, join it with the main
    # site URL and build a request for that page.
    def parse(self, response):
        SET_SELECTOR = '.c411AlphaLinks.c411NoPrint ul li'
        for PHONE in response.css(SET_SELECTOR):
            selector = 'a ::attr(href)'
            try:
                # Absolute URL built from the first href inside this <li>.
                momo = urljoin('http://www.canada411.ca', PHONE.css(selector).extract_first())
                # Passing a dictionary as the item.
                pre = {}
                # Request for the linked page; the dict travels in meta['item'].
                post = scrapy.Request(momo, callback=self.parse_pre1, meta={'item': pre})
                yield pre
            except:
                # Any exception raised for this <li> is ignored.
                pass

    # Page 2 of the nested website.
    def parse_pre1(self, response):
        # Retrieve the item that was attached to the request under meta['item'].
        item = response.meta["item"]
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for PHONE in response.css(SET_SELECTOR):
            selector = 'a ::attr(href)'
            momo = urljoin('http://www.canada411.ca', PHONE.css(selector).extract_first())
            # Request for the linked page; the item travels in meta['page_2'].
            pre = scrapy.Request(momo, callback=self.parse_pre1, meta={'page_2': item})
            yield pre

    def parse_info(self, response):
        # Here the data is scraped into the item read from meta['page_2'].
        item = response.meta["page_2"]
        name = '.vcard__name'
        address = '.c411Address.vcard__address'
        ph = '.vcard.label'
        # Each field receives the first match of its CSS selector
        # (extract_first() gives None when nothing matches).
        item['name'] = response.css(name).extract_first()
        item['address'] = response.css(address).extract_first()
        item['phoneno'] = response.css(ph).extract_first()
        return item
I am inheriting the item from one callback to the next, so what am I doing wrong?
Upvotes: 0
Views: 273
Reputation: 1981
In `parse` you are yielding `pre` instead of `post`. Also, you should use `scrapy.Item` classes for your items, not a dict.
def parse(self, response):
    """Yield a request for every link found on the start page.

    For each `<li>` matched by SET_SELECTOR, the first `href` inside it is
    joined with the site root and requested with `parse_pre1` as callback.
    A fresh, empty item is attached to each request under `meta['item']`.

    Entries without a link are skipped. Other errors are no longer
    swallowed by a bare `except`, so real failures show up in the log.
    """
    SET_SELECTOR = '.c411AlphaLinks.c411NoPrint ul li'
    selector = 'a ::attr(href)'  # loop-invariant, so defined once
    for PHONE in response.css(SET_SELECTOR):
        href = PHONE.css(selector).extract_first()
        if href is None:
            # No <a href> in this entry; urljoin(base, None) would just
            # return the site root, so there is nothing useful to follow.
            continue
        momo = urljoin('http://www.canada411.ca', href)
        # PASSING A DICTIONARY AS THE ITEM
        pre = {}  # This should be an instance of scrapy.Item
        post = scrapy.Request(momo, callback=self.parse_pre1, meta={'item': pre})
        # Yield the Request (not the empty item) so the next page is crawled.
        yield post
And in `parse_pre1` you set `parse_pre1` as the callback again; I think you meant `parse_info`:
def parse_pre1(self, response):
    """Yield a request for every link on the second-level page.

    Reads the item attached by the previous callback from
    `response.meta['item']`. For each `<li>` matched by SET_SELECTOR, the
    first `href` inside it is requested with `parse_info` as callback, and
    a copy of the item is attached under `meta['page_2']`.

    Entries without a link are skipped.
    """
    # RETURNING THE SAME ITEM
    item = response.meta["item"]
    SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
    selector = 'a ::attr(href)'  # loop-invariant, so defined once
    for PHONE in response.css(SET_SELECTOR):
        href = PHONE.css(selector).extract_first()
        if href is None:
            # No <a href> in this entry; nothing to follow.
            continue
        momo = urljoin('http://www.canada411.ca', href)
        # Give every request its own copy of the item: parse_info assigns
        # fields on whatever object it receives, so sharing one dict across
        # all requests would make each result overwrite the previous one.
        pre = scrapy.Request(momo, callback=self.parse_info, meta={'page_2': item.copy()})
        yield pre
Upvotes: 2