Reputation: 452
First, I am trying to scrape the fund codes (e.g. MGB_U, JAS_U) from: "http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=MMFU_U"
Then, I want to scrape the prices of each fund from its own page, for example:
"http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd="+"MGB_U"
"http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd="+"JAS_U"
My code fails with:
raise NotImplementedError
but I still don't know how to solve it.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from fundPrice.items import FundPriceItem
class PruSpider(BaseSpider):
    """Spider that reads the fund dropdown on the start page and builds one item per fund."""

    name = "prufunds"
    allowed_domains = ["prudential.com.hk"]
    start_urls = ["http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=MMFU_U"]

    def parse(self, response):
        """Return a list of FundPriceItem objects, one per fund code in the dropdown.

        The fund codes are the ``value`` attributes of the options inside the
        ``select`` element with class "fundDropdown".
        """
        hxs = HtmlXPathSelector(response)
        funds_U = hxs.select('//table//table//table//table//select[@class="fundDropdown"]//option//@value').extract()
        # `x != (u"#" and u"MMFU_U")` compared only against u"MMFU_U", because
        # `a and b` evaluates to `b` when `a` is truthy. A membership test
        # excludes both values.
        funds_U = [x for x in funds_U if x not in (u"#", u"MMFU_U")]
        items = []
        for fund_U in funds_U:
            # NOTE(review): this URL is built but never requested. `hxs` still
            # wraps the start-page response, so every item receives the same
            # 'data' extracted from that one page.
            url = "http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=" + fund_U
            item = FundPriceItem()
            item['fund'] = fund_U
            item['data'] = hxs.select('//table//table//table//table//td[@class="fundPriceCell1" or @class="fundPriceCell2"]//text()').extract()
            items.append(item)
        return items
Upvotes: 1
Views: 357
Reputation: 473873
You should yield a scrapy Request for each fund inside the loop, and extract the prices in a separate callback:
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from fundPrice.items import FundPriceItem
class PruSpider(BaseSpider):
    """Spider that follows every fund listed in the start page's dropdown.

    ``parse`` reads the fund codes from the start page and schedules one
    request per fund; ``parse_fund`` turns each fund page into a
    FundPriceItem.
    """

    name = "prufunds"
    allowed_domains = ["prudential.com.hk"]
    start_urls = ["http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=MMFU_U"]

    def parse(self, response):
        """Yield one Request per fund code found in the "fundDropdown" select.

        The fund code is passed to the callback through ``meta['fund']``.
        """
        hxs = HtmlXPathSelector(response)
        funds_U = hxs.select('//table//table//table//table//select[@class="fundDropdown"]//option//@value').extract()
        # `x != (u"#" and u"MMFU_U")` compared only against u"MMFU_U", because
        # `a and b` evaluates to `b` when `a` is truthy. A membership test
        # excludes both values.
        funds_U = [x for x in funds_U if x not in (u"#", u"MMFU_U")]
        for fund_U in funds_U:
            yield Request(
                url="http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=" + fund_U,
                callback=self.parse_fund,
                meta={'fund': fund_U})

    def parse_fund(self, response):
        """Build a FundPriceItem from a single fund page.

        'fund' is taken from ``response.meta['fund']`` (set in ``parse``);
        'data' is the text of every cell with class "fundPriceCell1" or
        "fundPriceCell2".
        """
        hxs = HtmlXPathSelector(response)
        item = FundPriceItem()
        item['fund'] = response.meta['fund']
        item['data'] = hxs.select(
            '//table//table//table//table//td[@class="fundPriceCell1" or @class="fundPriceCell2"]//text()').extract()
        return item
Hope that helps.
Upvotes: 1