Reputation: 265
I want to look through the companies at: https://www.greg.gg/webCompSearch.aspx
I know that the ASP.NET form needs certain parameters that can be extracted from the page. When I send a POST in Scrapy as a FormRequest,
I do get a response with the additional data. My problem is that the response is only partially HTML, see:
1|#||4|1890|updatePanel|ctl00_updPanel|
<br />
<div id="login">
<div id="ctl00_pnlLogin" onkeypress="javascript:return WebForm_FireDefaultButton(event, 'ctl00_btnLogin')">
So the question is how I could parse the HTML properly.
Here is the minimal scrapy spider as reference:
# -*- coding: utf-8 -*-
import scrapy
class GgTestSpider(scrapy.Spider):
    """Query the company search form at greg.gg and follow each result.

    The search page is an ASP.NET WebForms page.  ``parse`` submits the search
    form as an asynchronous (partial) postback, ``_parse_items`` reads the
    result list out of the partial response and issues one POST per result,
    and ``_parse_company`` is the (still empty) handler for those responses.
    """

    name = 'gg_test'
    allowed_domains = ['www.greg.gg']
    base_url = 'https://www.greg.gg/webCompSearch.aspx'
    start_urls = [base_url]
    custom_settings = {
        # BaseDupeFilter performs no duplicate filtering, so repeated POSTs
        # to the single search URL are never dropped.
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    # Hidden WebForms state fields that must be echoed back on every postback.
    _STATE_FIELDS = ('__EVENTVALIDATION', '__VIEWSTATE', '__PREVIOUSPAGE')

    def parse(self, response):
        """Submit the search form for a fixed registration number.

        The request carries ``__ASYNCPOST`` and the ``X-MicrosoftAjax`` header,
        so the answer handled by ``_parse_items`` is a pipe-delimited delta
        rather than a complete HTML document.

        :param response: the initial GET response of ``base_url``.
        :returns: a single ``FormRequest`` built from the page's form.
        """
        formdata = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__ASYNCPOST': "true",
            'ctl00$ScriptManager2': "ctl00$cntPortal$updPanel|ctl00$cntPortal$btnSearch",
            'ctl00$cntPortal$radSearchType': "radStartsWith",
            'ctl00$cntPortal$chkPrevNames': "on",
            'ctl00$cntPortal$ddlRegister': "0",
            'ctl00$cntPortal$btnSearch': "Search",
            # id to search
            'ctl00$cntPortal$txtCompRegNum': "1",
        }
        # Copy the ASP.NET state out of the page.  A field that is absent is
        # simply not overridden instead of being sent as None.
        for field in self._STATE_FIELDS:
            value = response.xpath(
                '//*[@id="{0}"]/@value'.format(field)).extract_first()
            if value is not None:
                formdata[field] = value

        return scrapy.FormRequest.from_response(
            response,
            headers={
                'Referer': self.base_url,
                'X-MicrosoftAjax': 'Delta=true',
            },
            formdata=formdata,
            meta={'fdat': formdata},
            callback=self._parse_items,
        )

    @staticmethod
    def _parse_delta(text):
        """Split an ASP.NET AJAX delta payload into its records.

        The payload is a sequence of ``length|type|id|content|`` records, e.g.
        ``1|#||4|1890|updatePanel|ctl00_updPanel|<br />...``.  ``content`` may
        itself contain ``|``, which is why it is cut by ``length`` instead of
        by splitting on the separator.

        NOTE(review): ``length`` is taken to be a count of characters of the
        decoded text -- confirm for payloads with characters outside the BMP.

        :param text: decoded response body.
        :returns: list of ``(type, id, content)`` tuples; empty when ``text``
            does not start with a well-formed record (e.g. a full HTML page).
        """
        records = []
        pos = 0
        size = len(text)
        while pos < size:
            header = []
            for _ in range(3):
                bar = text.find('|', pos)
                if bar == -1:
                    return records
                header.append(text[pos:bar])
                pos = bar + 1
            try:
                length = int(header[0])
            except ValueError:
                return records
            if length < 0:
                return records
            records.append((header[1], header[2], text[pos:pos + length]))
            # Skip the content plus the '|' that terminates the record.
            pos += length + 1
        return records

    def _parse_items(self, response):
        """Read the search results and request every listed company.

        The HTML fragments of all ``updatePanel`` records are parsed with a
        standalone ``Selector``; the refreshed state fields are taken from the
        ``hiddenField`` records.  When the body is not a delta payload the
        response is queried directly as ordinary HTML.

        :param response: answer to the search POST; ``response.meta['fdat']``
            holds the form data that was sent.
        :returns: generator of ``FormRequest`` objects, one per result.
        """
        records = self._parse_delta(response.text)
        hidden = dict(
            (rec_id, content)
            for kind, rec_id, content in records if kind == 'hiddenField')
        panels = [content for kind, _, content in records
                  if kind == 'updatePanel']
        source = scrapy.Selector(text=u''.join(panels)) if panels else response

        # NOTE(review): ASP.NET normally renders '$' in @name and '_' in @id,
        # so this expression may need to test @name -- verify against the
        # live page before changing it.
        company_items = source.xpath(
            '//input[contains(@id, "ctl00$cntPortal$grdSearchResults$ctl")]/@value').extract()

        # Work on a copy so the dict shared through request.meta is untouched.
        formdata = dict(
            (key, value) for key, value in response.meta['fdat'].items()
            if value is not None)
        for field in self._STATE_FIELDS:
            value = hidden.get(field)
            if value is None:
                value = source.xpath(
                    '//*[@id="{0}"]/@value'.format(field)).extract_first()
            if value is not None:
                formdata[field] = value

        self.logger.debug(
            'search results: headers=%s formdata=%s items=%s',
            response.request.headers, formdata, company_items)

        # give as input to form (POST) to get redirected
        for item in company_items:
            request_data = dict(formdata)
            request_data['ctl00$ScriptManager2'] = (
                'ctl00$cntPortal$updPanel|{0}'.format(item))
            yield scrapy.FormRequest(
                url=self.base_url,
                formdata=request_data,
                # 'company_extra_id' is not set by parse(); .get() avoids the
                # KeyError and forwards None until a caller provides it.
                meta={'company_extra_id': response.meta.get('company_extra_id')},
                callback=self._parse_company,
            )

    def _parse_company(self, response):
        """Handle a single company response (not implemented yet)."""
        pass
Thanks in advance!
EDIT: I changed the title of the question from "how to get the full HTML as displayed in the browser" to "how to actually parse the partial HTML that is returned by the POST".
Upvotes: 1
Views: 796
Reputation: 1861
# Build a standalone selector from the payload of the partial (AJAX) response.
# response.text is the body decoded with the response's own encoding; passing
# the raw bytes of response.body would make Scrapy treat them as UTF-8.
response_data = scrapy.Selector(text=response.text)
# this will give you selector object
# you should be able to use .xpath and .css on response_data
Upvotes: 1