wolf7687
wolf7687

Reputation: 155

Web scraping from .aspx site using python

I'm attempting to scrape some data from this site: https://fortress.wa.gov/esd/file/warn/Public/SearchWARN.aspx

I am able to get the first 11 pages using my method but for some reason it quits beyond the 11th page. I've read the other posts related to .aspx and haven't seen anything that applies to my situation.

I'm new to this so my code is a little verbose but it gets the job done--somewhat. I've played with adjusting the headers and a bunch of other stuff but cannot get past the 11th page. Makes no sense to me.

I'm fairly certain the problem lies in the viewstate and viewgenerator header parameters. I'm not sure how to get these for the page you want to go to in the loop. I'm pretty much using the same values for all pages. For some reason, this approach works up to and including page 11 then it breaks. This is odd since it looks like each page has a different viewstate value.

Thanks in advance.

import pandas as pd
import re
import pandas as pd
import numpy as np
import urllib
from requests import Session
from bs4 import BeautifulSoup
import time
import requests


# Pager arguments understood by the ASP.NET grid ("Page$1" .. "Page$20").
page_list = ['Page${}'.format(i) for i in range(1, 21)]
wa_url = 'https://fortress.wa.gov/esd/file/warn/Public/SearchWARN.aspx'

# One session for the whole scrape so the ASP.NET session cookies persist
# across postbacks (the original rebuilt the Session on every page,
# discarding them).
session = requests.Session()
session.headers.update({
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
})

# Initial GET: the hidden <input> fields carry server state that every
# subsequent postback must echo back.
val_get = session.get(wa_url)
soup = BeautifulSoup(val_get.content, "html.parser")


def _hidden_state(page_soup):
    """Return (__VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION) values.

    Looks the hidden inputs up by name instead of by position (the original
    used tags[3]/tags[4]/tags[6], which breaks if the form gains an input).
    """
    fields = {i.get('name'): i.get('value', '') for i in page_soup.find_all('input')}
    return (fields['__VIEWSTATE'],
            fields['__VIEWSTATEGENERATOR'],
            fields['__EVENTVALIDATION'])


view_state, view_generator, evnt_validation = _hidden_state(soup)

no_emps = []   # per-page lists of "number of employees" cells
date = []      # per-page lists of date cells

# Looping through pages of WARN database
for page in page_list:

    # Form fields only. HTTP headers (Accept, Cookie, Host, ...) belong on
    # the session, not in the POST body — the original sent them as form
    # data, where the server simply ignores them.
    data = {
        "__EVENTTARGET": "ucPSW$gvMain",
        "__EVENTARGUMENT": page,
        "__LASTFOCUS": "",
        "__VIEWSTATE": view_state,
        "__VIEWSTATEGENERATOR": view_generator,
        "__VIEWSTATEENCRYPTED": "",
        "__EVENTVALIDATION": evnt_validation,
    }

    get_warn_data = session.post(wa_url, data=data)
    soup = BeautifulSoup(get_warn_data.content, "html.parser")

    # Clean each <tr> into a list of cell texts; real data rows have
    # exactly 7 cells ('@' is a throwaway join separator).
    work = [a.get_text('@') for a in soup.find_all('tr')]
    work = [re.sub(r'\n', '', a) for a in work]
    work = [re.sub(r'^@|@$', '', a) for a in work]
    work = [a.split('@') for a in work]
    work = [a for a in work if len(a) == 7]

    no_emps.append([a[3] for a in work])
    date.append([a[6] for a in work])

    # BUG FIX: refresh the hidden state from THIS response before the next
    # POST. The server issues a new __VIEWSTATE/__EVENTVALIDATION per page,
    # so reusing the page-1 values eventually fails — the original placed
    # this refresh AFTER the loop (wrong indentation), so it never ran in
    # time and the scrape broke past page 11.
    view_state, view_generator, evnt_validation = _hidden_state(soup)

# Wrapping up results into flat lists. itertools.chain replaces
# pandas.core.common.flatten, a private pandas API that may disappear.
from itertools import chain
WA_WARN_no_emps = list(chain.from_iterable(no_emps))
WA_WARN_date = list(chain.from_iterable(date))

Update: Following Andrej's advice in the comments, I added the if statement at the end of the for loop that updates the header values; this addition fixed the code and gets all the pages in page_list.

Upvotes: 2

Views: 581

Answers (1)

Andrej Kesely
Andrej Kesely

Reputation: 195408

You can use this example to get all pages (total 67) from the site (it gets all <input> values dynamically - so it gets correct __VIEWSTATE etc.):

import requests
from bs4 import BeautifulSoup


# Target page; the initial GET yields page 1 plus the hidden ASP.NET state
# fields that every later postback must echo back.
url = 'https://fortress.wa.gov/esd/file/warn/Public/SearchWARN.aspx'
first_response = requests.get(url)
soup = BeautifulSoup(first_response.content, 'html.parser')

def get_data(soup, page_num):
    """Build the ASP.NET postback form data for pager page *page_num*.

    Echoes every hidden <input> value from the current page (__VIEWSTATE,
    __VIEWSTATEGENERATOR, __EVENTVALIDATION, ...), removes the search
    button so the server treats the POST as a pager event rather than a
    search, and sets the pager event target/argument.

    :param soup: parsed BeautifulSoup of the current page
    :param page_num: 1-based page number to request
    :return: dict of form fields for requests.post
    """
    data = {i['name']: i.get('value', '') for i in soup.select('input')}
    # pop(..., None) instead of del: don't raise KeyError if the site
    # ever drops or renames the search button.
    data.pop('ucPSW$btnSearchCompany', None)
    data['__EVENTTARGET'] = 'ucPSW$gvMain'
    data['__EVENTARGUMENT'] = 'Page${}'.format(page_num)
    data['__LASTFOCUS'] = ''
    return data

# Walk the grid's pager until a short (non-full) page signals the end.
page = 1
while True:
    print('Page {}...'.format(page))

    # Start at 1 so an empty page (no rows) also trips the stop condition.
    row_count = 1
    data_rows = soup.select('#ucPSW_gvMain > tr:not(:has(table)):has(td)')
    for row_count, row in enumerate(data_rows, 1):
        cells = [cell.get_text(strip=True) for cell in row.select('td')]
        print('{:<3}{:<50}{:<25}{:<15}{:<15}{:<15}{:<15}{:<15}'.format(row_count, *cells))

    # A full page holds exactly 15 rows; any other count means this was
    # the last page.
    if row_count % 15:
        break

    page += 1
    response = requests.post(url, get_data(soup, page))
    soup = BeautifulSoup(response.content, 'html.parser')

Prints:

Page 1...
1  Safran Cabin Materials, LLC                       Marysville and Newport   6/23/2020      85             Layoff         Permanent      6/24/2020      
2  Swissport Fueling                                 SeaTac                   5/8/2020       69             Layoff         Permanent      6/19/2020      
3  Swissport USA, Inc                                SeaTac                   5/22/2020      62             Layoff         Permanent      6/19/2020      
4  Swissport USA, Inc                                SeaTac                   3/20/2020      167            Layoff         Temporary      6/19/2020      
5  Tool Gauge and Machine Works                      Tacoma                   6/17/2020      59             Layoff         Permanent      6/18/2020      
6  Hyatt Corporation Motif Seattle                   Seattle                  3/14/2020      91             Layoff         Temporary      6/18/2020      
7  Jacobsen Daniel's Enterprise, Inc                 Tacoma                   6/12/2020      1              Layoff         Permanent      6/18/2020      
8  Benchmark Stevenson, LLC d/b/a Skamania Lodge     Stevenson                3/18/2020      185            Layoff         Temporary      6/17/2020      
9  Seattle Art Museum                                Seattle                  7/5/2020       76             Layoff         Temporary      6/16/2020      
10 Chihuly Garden & Glass                            Seattle                  3/21/2020      97             Layoff         Temporary      6/16/2020      
11 Seattle Center                                    Seattle                  3/21/2020      182            Layoff         Temporary      6/16/2020      
12 Sekisui Aerospace                                 Renton and Sumner        6/12/2020      111            Layoff         Permanent      6/15/2020      
13 Pioneer Human Services                            Seattle                  8/14/2020      59             Layoff         Permanent      6/15/2020      
14 Crista Senior Living                              Shoreline                8/16/2020      156            Closure        Permanent      6/15/2020      
15 Hyatt Corporation / Hyatt Regency Bellevue        Bellevue                 3/15/2020      223            Layoff         Temporary      6/15/2020      
Page 2...
1  Toray Composite Materials America, Inc            Tacoma                   8/8/2020       146            Layoff         Permanent      6/12/2020      
2  Embassy Suites Seattle Bellevue                   Seattle                  6/1/2020       57             Layoff         Temporary      6/12/2020      
3  Triumph Aerospace Structures                      Spokane                  6/15/2020      12             Layoff         Permanent      6/11/2020      
4  Hyatt Corporation / Hyatt Regency Lake Washington Renton                   6/30/2020      129            Layoff         Temporary      6/9/2020       
5  Lamb Weston, Inc                                  Connell, WA              6/15/2020      360            Layoff         Temporary      6/8/2020       
6  Lamb Weston, Inc                                  Warden                   6/15/2020      300            Layoff         Temporary      6/8/2020       

... and so on.

Upvotes: 1

Related Questions