scrapy, how to parse AJAX response from asp.net page on POST

Question

I want to look through the companies at: https://www.greg.gg/webCompSearch.aspx

I know that the asp.net form needs certain parameters that can be extracted. When sending a POST in scrapy as FormRequest I also get a response with the additional data. My problem is that it is only partially html, see:

1|#||4|1890|updatePanel|ctl00_updPanel|

So the question is how I could parse the HTML properly.

Here is the minimal scrapy spider as reference:

# -*- coding: utf-8 -*-

import scrapy

class GgTestSpider(scrapy.Spider):
    name = 'gg_test'
    allowed_domains = ['www.greg.gg']
    base_url = 'https://www.greg.gg/webCompSearch.aspx'
    start_urls = [base_url]
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def parse(self, response):
        # grep ASP.NET elements out of response
        EVENTVALIDATION = response.xpath(
            '//*[@id="__EVENTVALIDATION"]/@value').extract_first()
        VIEWSTATE = response.xpath(
            '//*[@id="__VIEWSTATE"]/@value').extract_first()
        PREVIOUSPAGE = response.xpath(
            '//*[@id="__PREVIOUSPAGE"]/@value').extract_first()
        response.meta['fdat'] = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': VIEWSTATE,
            '__PREVIOUSPAGE': PREVIOUSPAGE,
            '__EVENTVALIDATION': EVENTVALIDATION,
            '__ASYNCPOST': "true",
            'ctl00$ScriptManager2': "ctl00$cntPortal$updPanel|ctl00$cntPortal$btnSearch",
            'ctl00$cntPortal$radSearchType': "radStartsWith",
            'ctl00$cntPortal$chkPrevNames': "on",
            'ctl00$cntPortal$ddlRegister': "0",
            'ctl00$cntPortal$btnSearch': "Search"
        }

        # id to search
        response.meta['fdat']['ctl00$cntPortal$txtCompRegNum'] = "1"

        return scrapy.FormRequest.from_response(
            response,
            headers={
                'Referer': self.base_url,
                'X-MicrosoftAjax': 'Delta=true',
            },
            formdata=response.meta['fdat'],
            meta={'fdat': response.meta['fdat']},
            callback=self._parse_items,
        )

    def _parse_items(self, response):

        company_item = response.xpath(
            '//input[contains(@id, "ctl00$cntPortal$grdSearchResults$ctl")]/@value').extract()

        print "no data:", response.request.headers, response.meta['fdat'], company_item, response.xpath('/')
        response.meta['fdat']['__EVENTVALIDATION'] = response.xpath(
            '//*[@id="__EVENTVALIDATION"]/@value').extract()
        response.meta['fdat']['__VIEWSTATE'] = response.xpath('//*[@id="__VIEWSTATE"]/@value').extract()
        response.meta['fdat']['__PREVIOUSPAGE'] = response.xpath(
            '//*[@id="__PREVIOUSPAGE"]/@value').extract()

        # give as input to form (POST) to get redirected
        for i in company_item:
            response.meta['fdat']['ctl00$ScriptManager2'] = 'ctl00$cntPortal$updPanel|{0}'.format(i)
            yield scrapy.FormRequest(
                url=self.base_url,
                formdata=response.meta['fdat'],
                meta={'company_extra_id': response.meta['company_extra_id']},
                callback=self._parse_company,
            )

    def _parse_company(self, response):
        pass

Thanks in advance!

EDIT: I changed the title of the question from how to get the full HTML like displayed in the browser to how to actually parse the partial HTML that is returned by the POST.

MrPandav · Accepted Answer

Using selectors

response_data = scrapy.Selector(text=response.body)
# this will give you selector object 
# you should be able to use .xpath and .css on response_data

scrapy, how to parse AJAX response from asp.net page on POST

Answers (1)

Related Questions