Reputation: 93
I am scraping this webpage https://researchgrant.gov.sg/eservices/advanced-search/?keyword=&source=sharepoint&type=project&status=open&_pp_projectstatus=&_pp_hiname=&_pp_piname=&_pp_source=sharepoint&_pp_details=#project using Scrapy's `FormRequest`. My code is below. Setting the parameter `_pp_hiname` to `ab` and `_pp_piname` to `pua` should return only one result in `response.text`, but instead the response contains every result in the HTML. The parameters apparently have no effect, but I can't see anything wrong with them.
def start_requests(self):
    """Send the initial POST to the grid endpoint.

    Every search filter and the grid name are placed together in the
    form-encoded request body; nothing is added to the URL.
    """
    endpoint = 'https://researchgrant.gov.sg/eservices/mvcgrid'
    ajax_headers = {'X-Requested-With': 'XMLHttpRequest'}
    form_fields = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': '1',
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': 'pua',
        '_pp_source': '',
        '_pp_details': '',
        'name': 'advancesearchawardedprojectsp',
    }
    request = scrapy.FormRequest(
        endpoint,
        callback=self.parse_item,
        method='POST',
        formdata=form_fields,
        headers=ajax_headers,
    )
    yield request
def parse_item(self, response):
    """Dump the raw body of the grid response to stdout for inspection."""
    body = response.text
    print(body)
But apparently it is showing all entries:
Latest update:
class ToScrapeCSSSpiderSG(scrapy.Spider):
    """Walk the paginated project grid and visit each project's detail page.

    Search filters travel in the URL query string; the grid name is sent
    in the POST form body.
    """

    name = "toscrapesg-css"

    # Form body sent with every grid request.
    params = {
        'name': 'advancesearchawardedprojectsp'
    }

    # Query-string filters. 'page' is only the starting page: the page
    # currently being crawled is carried on each request's meta, so this
    # class-level dict is never mutated.
    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',
        '_pp_source': '',
        '_pp_details': '',
        'name': 'advancesearchawardedprojectsp'
    }

    def _grid_request(self, page):
        """Build the POST request for one page of the result grid."""
        # Overriding an existing key keeps its position, so the query
        # string has the same parameter order as `args`.
        query = urllib.parse.urlencode(dict(self.args, page=page))
        return scrapy.FormRequest(
            'https://researchgrant.gov.sg/eservices/mvcgrid?' + query,
            callback=self.parse_item,
            method='POST',
            formdata=self.params,
            headers={'X-Requested-With': 'XMLHttpRequest'},
            meta={'page': page},
        )

    def start_requests(self):
        """Yield the request for the first grid page."""
        yield self._grid_request(self.args['page'])

    def parse_item(self, response):
        """Follow every project link in the grid, then queue the next page.

        Yields one request per table row that has a link, plus a request
        for the following grid page while a "Next page" control with an
        onclick handler is present in the response.
        """
        for container in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
            for row in container.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
                link = row.xpath('td[1]/a/@href').get()
                if not link:
                    # A row without a link has nothing to follow.
                    continue
                # urljoin resolves relative hrefs against the page URL.
                yield scrapy.Request(response.urljoin(link), callback=self.parse_product)

        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()
        if onclick:
            current_page = response.meta.get('page', self.args['page'])
            yield self._grid_request(current_page + 1)

    def parse_product(self, response):
        """Print the project title text found on a detail page."""
        text = response.xpath('//span[contains(@id,"ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjectTitle")]/text()').extract()
        print(text)
Upvotes: 4
Views: 640
Reputation: 142681
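The page sends only `Name=advancesearchawardedprojectsp` in the POST body. The other parameters should be in the URL as a query string.
So the URL should be
https://researchgrant.gov.sg/eservices/mvcgrid?keyword=&source=sharepoint&type=project&status=open&page=1&_pp_projectstatus=&_pp_hiname=ab&_pp_piname=pua&_pp_source=&_pp_details=
You can use `urllib.parse.urlencode(args)` to build it.
With this change it gives me one result.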
import urllib.parse
def start_requests(self):
    """Request the first grid page.

    The search filters are URL-encoded into the query string, while the
    POST form body carries only the grid name.
    """
    form_body = {
        'name': 'advancesearchawardedprojectsp',
    }
    filters = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': '1',
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': 'pua',
        '_pp_source': '',
        '_pp_details': '',
    }
    query_string = urllib.parse.urlencode(filters)
    endpoint = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + query_string
    ajax_headers = {'X-Requested-With': 'XMLHttpRequest'}
    yield scrapy.FormRequest(
        endpoint,
        callback=self.parse_item,
        method='POST',
        formdata=form_body,
        headers=ajax_headers,
    )
EDIT: an example that loads the following pages and checks for the `Next page` button to decide when to stop.
EDIT: it can now also save the results to a CSV file.
import scrapy
import urllib.parse
class MySpider(scrapy.Spider):
    """Crawl every page of the project grid and scrape each project's details.

    Search filters go in the URL query string; the grid name goes in the
    POST form body. One item (a plain dict) is yielded per project page.
    """

    name = 'myspider'

    # Form body sent with every grid request.
    params = {
        'name': 'advancesearchawardedprojectsp'
    }

    # Query-string filters. 'page' is only the starting page: the page
    # currently being crawled is carried on each request's meta, so this
    # class-level dict is never mutated.
    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',  # e.g. 'pua'
        '_pp_source': '',
        '_pp_details': '',
    }

    _GRID_URL = 'https://researchgrant.gov.sg/eservices/mvcgrid?'

    # Common prefix of the <span> ids read from a project detail page.
    _SPAN_ID_PREFIX = 'ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_'

    def _grid_request(self, page):
        """Build the POST request for one page of the result grid."""
        # Overriding an existing key keeps its position, so the query
        # string has the same parameter order as `args`.
        query = urllib.parse.urlencode(dict(self.args, page=page))
        return scrapy.FormRequest(
            self._GRID_URL + query,
            callback=self.parse_item,
            method='POST',
            formdata=self.params,
            headers={'X-Requested-With': 'XMLHttpRequest'},
            meta={'page': page},
        )

    def _span_text(self, response, suffix):
        """Return the first text node of the detail-page span ending in `suffix`, or None."""
        xpath = '//span[@id="' + self._SPAN_ID_PREFIX + suffix + '"]/text()'
        return response.xpath(xpath).get()

    def start_requests(self):
        """Yield the request for the first grid page."""
        yield self._grid_request(self.args['page'])

    def parse_item(self, response):
        """Follow every project link in the grid, then queue the next page.

        Yields one request per table row that has a link, plus a request
        for the following grid page while a "Next page" control with an
        onclick handler is present in the response.
        """
        rows = response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr')
        for row in rows:
            link = row.xpath('.//a/@href').get()
            if not link:
                # A row without a link has nothing to follow.
                continue
            # urljoin resolves relative hrefs against the page URL.
            yield scrapy.Request(response.urljoin(link), callback=self.parse_product)

        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()
        if onclick:
            current_page = response.meta.get('page', self.args['page'])
            yield self._grid_request(current_page + 1)

    def parse_product(self, response):
        """Extract the fields of one project detail page and yield them as an item.

        A field is None when its span is not present on the page.
        """
        yield {
            'id': self._span_text(response, 'lblProjIdExt'),
            'title': self._span_text(response, 'lblProjectTitle'),
            'pi': self._span_text(response, 'lblLeadPIName'),
            'hi': self._span_text(response, 'lblHostInstName'),
            'date': self._span_text(response, 'dtPickerStartDate'),
        }
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
# Settings for a standalone run: a browser-like user agent plus a feed
# export that writes the scraped items to `output.csv`.
crawler_settings = {
    'USER_AGENT': 'Mozilla/5.0',
    # Export format may be csv, json or xml.
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
}

c = CrawlerProcess(crawler_settings)
c.crawl(MySpider)
c.start()
Upvotes: 5