Reputation: 570
This code works perfectly fine when I pass extract()[0]
or extract()
- it gives me output for the first link it parsed.I am not able to understand why its doing so,bcs when I was crawling Other websites with this code it was perfectly fine.
With this website its scraping only the first link.If I change extract()[1]
then it will give me second link and so on .Why its not working automatically in for loop?
import scrapy
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
data = scrapy.Field()
name = scrapy.Field()
date = scrapy.Field()
class criticspider(BaseSpider):
name = "mmt_mouth"
allowed_domains = ["mouthshut.com"]
start_urls = ["http://www.mouthshut.com/websites/makemytripcom-reviews-925031929"]
# rules = (
# Rule(
# SgmlLinkExtractor(allow=("search=make-my-trip&page=1/+",)),
# callback="parse",
# follow=True),
# )
def parse(self, response):
sites = response.xpath('//div[@id="allreviews"]')
items = []
for site in sites:
item = CompItem()
item['name'] = site.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
item['title'] = site.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
item['date'] = site.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
item['link'] = site.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield scrapy.Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
items.append(item)
def anchor_page(self, response):
old_item = response.request.meta['item']
old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
yield old_item
Upvotes: 0
Views: 634
Reputation: 2594
Because your for loop has nothing to loop on the given website. Change your statement
sites = response.xpath('//div[@id="allreviews"]')
to
sites = response.xpath('//div[@id="allreviews"]/ul/li')
Then your for loop can loop over the list elements.
Upvotes: 3