Scraping content of ASP.NET based website (https://www.proxymonitor.org/) using Scrapy

Question

I'm trying to scrape the 2022 results from proxymonitor.org, which is an ASP.NET-based site. I've extracted all the hidden form variables from the page and am sending them in a FormRequest, but I'm still receiving an empty table from the server. Any idea what I'm missing?

Here is my code:

    from requests import request
    import scrapy
    from scrapy.http import FormRequest
    
    
    class ProxyMonitorSpiderSpider(scrapy.Spider):
        """Spider that posts the proxymonitor.org search form with year 2022.

        ``parse`` reads the ASP.NET hidden state fields from the start page and
        posts them, together with the year-filter values, to ``Results.aspx``.
        ``parse2`` dumps the returned HTML to ``response2.html`` and yields one
        item per matched row of the ``dxgvTable_Office2010Silver`` table.
        """

        name = 'proxy_monitor_spider'

        allowed_domains = ['proxymonitor.org']
        start_urls = [
            'https://www.proxymonitor.org'
        ]

        def parse(self, response):
            """Yield the form POST to ``Results.aspx`` built from the start page.

            Args:
                response: The downloaded start page containing the search form.

            Yields:
                A ``FormRequest`` whose callback is ``parse2``.
            """

            def hidden_value(field_id):
                # Value attribute of the <input> with this id, or None if the
                # page has no such input.
                selector = 'input#{}::attr(value)'.format(field_id)
                return response.css(selector).extract_first()

            formdata = {
                '__EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': hidden_value('__VIEWSTATE'),
                '__VIEWSTATEGENERATOR': hidden_value('__VIEWSTATEGENERATOR'),
                # Each hidden field must come from its own input; sending the
                # __EVENTVALIDATION token under __PREVIOUSPAGE is wrong.
                # NOTE(review): per the Scrapy docs a None value here omits the
                # field from the request -- confirm that is acceptable when the
                # page has no __PREVIOUSPAGE input.
                '__PREVIOUSPAGE': hidden_value('__PREVIOUSPAGE'),
                '__EVENTVALIDATION': hidden_value('__EVENTVALIDATION'),

                # Fields that were tried and are currently disabled:
                # 'ctl00_ContentPlaceHolder1_ctrlQuickSearch1_cmbYr1': '2022',
                # 'DXScript': response.css('input#__DXScript::attr(value)').extract_first(),
                # '__CALLBACKID': 'ctl00$ContentPlaceHolder1$ctrlQuickSearch1$CountCallback',
                # '__CALLBACKPARAM': 'c0:',

                # Year range filter: both ends set to 2022.
                'ctl00_ContentPlaceHolder1_ctrlQuickSearch1_cmbYr1_VI': '2022',
                'ctl00_ContentPlaceHolder1_ctrlQuickSearch1_cmbYr2_VI': '2022',

                'ctl00$ContentPlaceHolder1$ctrlQuickSearch1$cmbYr1': '2022',
                'ctl00$ContentPlaceHolder1$ctrlQuickSearch1$cmbYr1$DDD$L': '2022',

                'ctl00$ContentPlaceHolder1$ctrlQuickSearch1$cmbYr2': '2022',
                'ctl00$ContentPlaceHolder1$ctrlQuickSearch1$cmbYr2$DDD$L': '2022',
            }

            # from_response pre-fills the request from the page's <form>; the
            # entries in formdata override fields of the same name.
            yield scrapy.FormRequest.from_response(
                response,
                url='https://www.proxymonitor.org/Results.aspx',
                formdata=formdata,
                callback=self.parse2,
            )

        def parse2(self, response):
            """Save the results page to disk and yield the table rows as items.

            Args:
                response: The response to the form POST issued by ``parse``.

            Yields:
                Dicts with ``resolution_name``, ``agm_date``, ``company``,
                ``lead_filer`` and ``status`` taken from cells 2-6 of each
                matched row.
            """
            print('*** status:', response.status)

            # Explicit encoding so the dump does not depend on the platform's
            # default codec (which may be unable to encode the page text).
            with open('response2.html', 'w', encoding='utf-8') as html_file:
                html_file.write(response.text)

            # NOTE(review): `position() = 2` matches only the second <tr> of each
            # <tbody>; confirm whether all data rows were intended instead.
            rows = response.xpath(
                '//*[@class="dxgvTable_Office2010Silver"]//tbody//tr[position() = 2]'
            )
            for row in rows:
                yield {
                    'resolution_name': row.xpath('td[2]//text()').extract_first(),
                    'agm_date': row.xpath('td[3]//text()').extract_first(),
                    'company': row.xpath('td[4]//text()').extract_first(),
                    'lead_filer': row.xpath('td[5]//text()').extract_first(),
                    'status': row.xpath('td[6]//text()').extract_first(),
                }

Scraping content of ASP.NET based website (https://www.proxymonitor.org/) using Scrapy

Answers (1)

Related Questions