Reputation: 375
from string import join
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders.crawl import Rule, CrawlSpider
from scrapy.http.request import Request
from scrapy.selector import HtmlXPathSelector
from Gfire.items import GfireItem
class GuideSpider(CrawlSpider):
    """Spider that walks the paginated guide listing on www.example.com.

    Listing pages are handled by ``parse_item``, which builds one item per
    guide title and then requests a guide page; that page is handled by
    ``parse_item2``.
    """

    name = "Gfire"
    allowed_domains = ['www.example.com']
    start_urls = [
        "http://www.example.com/gfire/guides"
    ]
    rules = (
        # Links matching this pattern are followed and passed to parse_item.
        Rule(SgmlLinkExtractor(allow=("gfire/guides.*page=")), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Collect title and URL for every guide on a listing page.

        Builds a ``GfireItem`` for each ``<div class="title">`` element and
        returns a ``Request`` meant to fetch the second guide in the list.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        sites = hxs.select('//div[@class="title"]')
        for site in sites:
            # Use the class name exactly as imported from Gfire.items.
            item = GfireItem()
            item['title'] = site.select('./a/text()').extract()
            item['guide_url'] = site.select('./a/@href').extract()
            # extract() gives a list of matches; join it into a single string
            # before prefixing the host to form an absolute URL.
            item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
            items.append(item)
        # NOTE: this is the call the question is about. items[1] is an item
        # object, not a URL string, so Request rejects it; the URL itself is
        # items[1]['guide_url'].
        return Request(items[1], callback=self.parse_item2)

    def parse_item2(self, response):
        """Return the text of every link found inside an ``<h3>`` element."""
        hxs = HtmlXPathSelector(response)
        hero = hxs.select("//h3/a/text()").extract()
        return hero
I can't get this spider to work. The Request call is given items[1], where it should receive item['guide_url'], and Scrapy tells me that the parameter has to be str or unicode. How can I correct this error? And how can I pass the items list to the callback function? Via request.meta?
Upvotes: 1
Views: 10229
Reputation:
def parse_item(self, response):
    """Collect title and URL for every guide on a listing page.

    Builds one item per ``<div class="title">`` element, then returns a
    ``Request`` for the second guide's URL. The full ``items`` list travels
    with that request in its ``meta`` dict so the callback can read it.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    sites = hxs.select('//div[@class="title"]')
    for site in sites:
        # Use the class name exactly as imported from Gfire.items.
        item = GfireItem()
        item['title'] = site.select('./a/text()').extract()
        item['guide_url'] = site.select('./a/@href').extract()
        # extract() gives a list of matches; join it into a single string
        # before prefixing the host to form an absolute URL.
        item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
        items.append(item)
    # Request needs the URL string, not the item object. The keyword is
    # ``meta``; ``request.meta=...`` is not valid call syntax.
    # NOTE: items[1] raises IndexError when the page has fewer than two titles.
    return Request(items[1]['guide_url'], meta={'items': items}, callback=self.parse_item2)
def parse_item2(self, response):
    """Handle a guide page and return the text of links inside ``<h3>`` tags.

    Expects the originating request to carry an ``'items'`` entry in its
    ``meta`` dict; a missing entry raises ``KeyError``.
    """
    # Read back the list that travelled with the request. It is not used
    # further here; it only shows how to reach the data from the callback.
    items = response.meta["items"]
    hxs = HtmlXPathSelector(response)
    hero = hxs.select("//h3/a/text()").extract()
    return hero
Upvotes: 4
Reputation: 1125058
Your items[1] is actually an instance of GFireItem, not a URL string.
I'm not certain why you are creating these as you only use one (the second site in your list of sites), discarding the rest of the list.
That aside, you need to extract the items[1]['guide_url'] URL when creating the Request:
return Request(items[1]['guide_url'], callback=self.parse_item2)
Upvotes: 4