Reputation: 155
I'm attempting to scrape some data from this site: https://fortress.wa.gov/esd/file/warn/Public/SearchWARN.aspx
I am able to get the first 11 pages using my method but for some reason it quits beyond the 11th page. I've read the other posts related to .aspx and haven't seen anything that applies to my situation.
I'm new to this so my code is a little verbose but it gets the job done--somewhat. I've played with adjusting the headers and a bunch of other stuff but cannot get past the 11th page. Makes no sense to me.
I'm fairly certain the problem lies in the viewstate and viewgenerator header parameters. I'm not sure how to get these for the page you want to go to in the loop. I'm pretty much using the same values for all pages. For some reason, this approach works up to and including page 11 then it breaks. This is odd since it looks like each page has a different viewstate value.
Thanks in advance.
# --- Imports (stdlib first, then third-party) ------------------------------
import re
import time
import urllib

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests import Session

# Pager postback arguments for the first 20 pages of the WARN grid.
page_list = ['Page${}'.format(n) for n in range(1, 21)]

wa_url = 'https://fortress.wa.gov/esd/file/warn/Public/SearchWARN.aspx'

# One session for the whole scrape so cookies persist across requests.
session = requests.Session()
session.headers.update({
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
})

# Initial GET: the ASP.NET state tokens every later POST must echo back
# are embedded in hidden <input> fields on the page.
val_get = session.get(wa_url)
soup = BeautifulSoup(val_get.content, "html.parser")


def _hidden_value(page_soup, field_name):
    """Return the value of the hidden <input name=field_name>, or '' if absent.

    Looking the field up by name (instead of by position in find_all('input'))
    keeps the scraper working even if the page adds/removes other inputs.
    """
    tag = page_soup.find('input', attrs={'name': field_name})
    return tag.get('value', '') if tag is not None else ''


# ASP.NET state tokens needed for the POST requests below.
view_state = _hidden_value(soup, '__VIEWSTATE')
view_generator = _hidden_value(soup, '__VIEWSTATEGENERATOR')
evnt_validation = _hidden_value(soup, '__EVENTVALIDATION')
# Accumulators: one sub-list per page.
no_emps = []
date = []

# Loop through pages of the WARN database.  Each page is fetched with an
# ASP.NET postback whose hidden state tokens (__VIEWSTATE etc.) must come
# from the most recently rendered page -- reusing stale tokens is exactly
# what made the original script stop working after page 11.
for page in page_list:
    data = {
        # Postback fields: simulate clicking the pager link for `page`.
        "__EVENTTARGET": "ucPSW$gvMain",
        "__EVENTARGUMENT": page,
        "__LASTFOCUS": "",
        "__VIEWSTATE": view_state,
        "__VIEWSTATEGENERATOR": view_generator,
        "__VIEWSTATEENCRYPTED": "",
        "__EVENTVALIDATION": evnt_validation,
        # NOTE: HTTP headers (Accept, Cookie, Host, ...) belong in the
        # request headers, not in the form body -- the server ignored them
        # there, so they have been dropped from this dict.
    }
    # Reuse the single session created above so cookies persist and the
    # TCP connection can be kept alive (the original re-created a session
    # on every iteration).
    get_warn_data = session.post(wa_url, data=data)
    soup = BeautifulSoup(get_warn_data.content, "html.parser")

    # Extract table rows: join each row's cell text with '@', strip
    # newlines and leading/trailing separators, then split back into
    # cells.  Only rows with exactly 7 cells are data rows (header and
    # pager rows have different cell counts).
    rows = []
    for tr in soup.find_all('tr'):
        text = re.sub(r'\n', '', tr.get_text('@'))
        cells = re.sub(r'^@|@$', '', text).split('@')
        if len(cells) == 7:
            rows.append(cells)
    no_emps.append([r[3] for r in rows])  # 4th column: number of employees
    date.append([r[6] for r in rows])     # 7th column: date

    # Refresh the state tokens from the page just returned: every response
    # carries new values that the *next* postback must echo back.  Looked
    # up by name, not position, so markup changes don't silently break it.
    for field, setter in (('__VIEWSTATE', 'view_state'),
                          ('__VIEWSTATEGENERATOR', 'view_generator'),
                          ('__EVENTVALIDATION', 'evnt_validation')):
        tag = soup.find('input', attrs={'name': field})
        if tag is not None and tag.get('value') is not None:
            if setter == 'view_state':
                view_state = tag['value']
            elif setter == 'view_generator':
                view_generator = tag['value']
            else:
                evnt_validation = tag['value']
# Wrapping up results: flatten the per-page sub-lists into flat lists.
# itertools.chain.from_iterable replaces pandas.core.common.flatten,
# which is a private pandas API and may vanish between versions.
from itertools import chain
WA_WARN_no_emps = list(chain.from_iterable(no_emps))
WA_WARN_date = list(chain.from_iterable(date))
Update: Following Andrej's advice in the comments, I added the if statement at the end of the for loop that updates the header values; this addition fixed the code and gets all the pages in page_list.
Upvotes: 2
Views: 581
Reputation: 195408
You can use this example to get all pages (total 67) from the site. It reads all `<input>` values dynamically from each response, so every POST sends the correct `__VIEWSTATE`, `__EVENTVALIDATION`, etc.:
import requests
from bs4 import BeautifulSoup
# WA WARN search page; the first GET seeds the ASP.NET hidden state
# tokens that every subsequent paging POST must echo back.
url = 'https://fortress.wa.gov/esd/file/warn/Public/SearchWARN.aspx'
# Parse the initial page -- its <input> values feed get_data() below.
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
def get_data(soup, page_num):
    """Build the ASP.NET postback form data for requesting page ``page_num``.

    Collects every named ``<input>`` value from the current page's soup (so
    ``__VIEWSTATE``, ``__EVENTVALIDATION`` etc. are always the fresh tokens
    the server expects), removes the search button (posting it would trigger
    a search instead of paging), and sets the pager event target/argument.
    """
    data = {}
    for i in soup.select('input'):
        name = i.get('name')
        # Skip inputs without a name attribute -- a browser would not
        # submit them either, and i['name'] would raise KeyError.
        if name:
            data[name] = i.get('value', '')
    # pop(..., None) instead of del: no KeyError if the button is absent.
    data.pop('ucPSW$btnSearchCompany', None)
    # Simulate clicking the pager link for the requested page.
    data['__EVENTTARGET'] = 'ucPSW$gvMain'
    data['__EVENTARGUMENT'] = 'Page${}'.format(page_num)
    data['__LASTFOCUS'] = ''
    return data
page = 1
while True:
    print('Page {}...'.format(page))
    # `total` keeps its initial value if the selector matches nothing,
    # which also triggers the last-page break below.
    total = 1
    # Data rows of the results grid: direct <tr> children of #ucPSW_gvMain
    # that contain <td> cells but no nested <table> (the pager row wraps a
    # nested table, so this selector excludes it).
    for total, tr in enumerate(soup.select('#ucPSW_gvMain > tr:not(:has(table)):has(td)'), 1):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        # Fixed-width columns; assumes each data row has exactly 8 cells
        # (row number plus 7 scraped fields) -- TODO confirm against site.
        print('{:<3}{:<50}{:<25}{:<15}{:<15}{:<15}{:<15}{:<15}'.format(total, *tds))
    # A full page holds 15 rows; any other count means this was the last
    # page (total % 15 is non-zero), so stop.
    if total % 15:
        break
    page += 1
    # POST the postback for the next page; get_data() re-reads the hidden
    # state tokens from the page just parsed, so the new `soup` is always
    # consistent with what the server expects next.
    soup = BeautifulSoup( requests.post(url, get_data(soup, page)).content, 'html.parser' )
Prints:
Page 1...
1 Safran Cabin Materials, LLC Marysville and Newport 6/23/2020 85 Layoff Permanent 6/24/2020
2 Swissport Fueling SeaTac 5/8/2020 69 Layoff Permanent 6/19/2020
3 Swissport USA, Inc SeaTac 5/22/2020 62 Layoff Permanent 6/19/2020
4 Swissport USA, Inc SeaTac 3/20/2020 167 Layoff Temporary 6/19/2020
5 Tool Gauge and Machine Works Tacoma 6/17/2020 59 Layoff Permanent 6/18/2020
6 Hyatt Corporation Motif Seattle Seattle 3/14/2020 91 Layoff Temporary 6/18/2020
7 Jacobsen Daniel's Enterprise, Inc Tacoma 6/12/2020 1 Layoff Permanent 6/18/2020
8 Benchmark Stevenson, LLC d/b/a Skamania Lodge Stevenson 3/18/2020 185 Layoff Temporary 6/17/2020
9 Seattle Art Museum Seattle 7/5/2020 76 Layoff Temporary 6/16/2020
10 Chihuly Garden & Glass Seattle 3/21/2020 97 Layoff Temporary 6/16/2020
11 Seattle Center Seattle 3/21/2020 182 Layoff Temporary 6/16/2020
12 Sekisui Aerospace Renton and Sumner 6/12/2020 111 Layoff Permanent 6/15/2020
13 Pioneer Human Services Seattle 8/14/2020 59 Layoff Permanent 6/15/2020
14 Crista Senior Living Shoreline 8/16/2020 156 Closure Permanent 6/15/2020
15 Hyatt Corporation / Hyatt Regency Bellevue Bellevue 3/15/2020 223 Layoff Temporary 6/15/2020
Page 2...
1 Toray Composite Materials America, Inc Tacoma 8/8/2020 146 Layoff Permanent 6/12/2020
2 Embassy Suites Seattle Bellevue Seattle 6/1/2020 57 Layoff Temporary 6/12/2020
3 Triumph Aerospace Structures Spokane 6/15/2020 12 Layoff Permanent 6/11/2020
4 Hyatt Corporation / Hyatt Regency Lake Washington Renton 6/30/2020 129 Layoff Temporary 6/9/2020
5 Lamb Weston, Inc Connell, WA 6/15/2020 360 Layoff Temporary 6/8/2020
6 Lamb Weston, Inc Warden 6/15/2020 300 Layoff Temporary 6/8/2020
... and so on.
Upvotes: 1