Terry

Reputation: 452

Newbie: How to scrape multiple web pages with only one start_urls?

First, I am trying to scrape the fund codes, e.g. MGB_U and JAS_U, from: "http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=MMFU_U"

Then I want to scrape the prices of each fund from URLs such as:

"http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd="+"MGB_U"

"http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd="+"JAS_U"

My code raises NotImplementedError, but I still don't know how to solve it.

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from fundPrice.items import FundPriceItem

class PruSpider(BaseSpider):
    name = "prufunds"
    allowed_domains = ["prudential.com.hk"]
    start_urls = ["http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=MMFU_U"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        funds_U = hxs.select('//table//table//table//table//select[@class="fundDropdown"]//option//@value').extract()
        funds_U = [x for x in funds_U if x != (u"#" and u"MMFU_U")]

        items = []

        for fund_U in funds_U:
            url = "http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=" + fund_U
            item = FundPriceItem()
            item['fund'] = fund_U
            item['data'] =  hxs.select('//table//table//table//table//td[@class="fundPriceCell1" or @class="fundPriceCell2"]//text()').extract()
            items.append(item)
            return items

Upvotes: 1

Views: 357

Answers (1)

alecxe

Reputation: 473873

You should yield a scrapy Request for each fund in the loop, passing the fund code along via meta and parsing each fund page in a separate callback:

from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from fundPrice.items import FundPriceItem


class PruSpider(BaseSpider):
    name = "prufunds"
    allowed_domains = ["prudential.com.hk"]
    start_urls = ["http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=MMFU_U"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        funds_U = hxs.select('//table//table//table//table//select[@class="fundDropdown"]//option//@value').extract()
        # drop the placeholder "#" option and the fund we already have
        funds_U = [x for x in funds_U if x not in (u"#", u"MMFU_U")]

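        # request each fund's price page, remembering which fund it belongs to via meta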
        for fund_U in funds_U:
            yield Request(
                url="http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=" + fund_U,
                callback=self.parse_fund,
                meta={'fund': fund_U})

    def parse_fund(self, response):
        hxs = HtmlXPathSelector(response)
        item = FundPriceItem()
        item['fund'] = response.meta['fund']
        item['data'] = hxs.select(
            '//table//table//table//table//td[@class="fundPriceCell1" or @class="fundPriceCell2"]//text()').extract()
        return item

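For reference, FundPriceItem is imported from fundPrice.items but not shown in the question. A minimal sketch of what it might look like, assuming it only needs the two fields used above:

from scrapy.item import Item, Field

class FundPriceItem(Item):
    fund = Field()  # fund code, e.g. MGB_U or JAS_U
    data = Field()  # price cell texts scraped from the fund page

You can then run the spider and export the results with, for example, scrapy crawl prufunds -o prices.json (older Scrapy versions may also need -t json to select the exporter).
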
Hope that helps.

Upvotes: 1
