Reputation: 2141
I'm trying to scrape this site: https://case.occ.ok.gov/ords/f?p=1004:203
The missing piece of the puzzle is figuring out how to "get" the p_request parameter in the data payload prior to making the final request. This field comes up empty when looking at the "main" page, so I cannot use that page to pass it through to my POST request.
The code below doesn't work because the p_request parameter in the payload is blank, although I know from testing with the developer console that it will work if I am able to get the p_request field.
# Fetch the landing page; its <input> fields supply values for the payload
url = 'https://case.occ.ok.gov/ords/f?p=1004:203'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

# Copy these two cookies from the landing-page response
cookies = {
    name: r.cookies.get(name)
    for name in ('ORA_WWV_APP_1004', 'X-Oracle-BMC-LBS-Route')
}

# Index every <input> on the page by its id ('' when it carries no value)
d_inputs = {}
for field in soup.select('input'):
    d_inputs[field['id']] = field.get('value', '')

# Assemble the form payload as (key, value) pairs; a list is used because
# the f01/f02 keys repeat. Note that p_request is sent empty here.
data = [
    ('p_flow_id', '1004'),
    ('p_flow_step_id', '203'),
    ('p_instance', str(d_inputs['pInstance'])),
    ('p_debug', ''),
    ('p_request', ''),
    ('p_widget_name', 'worksheet'),
    ('p_widget_mod', 'PULL'),
    ('p_widget_action', ''),
    ('p_widget_num_return', '100000'),
    ('x01', '8980043036046866'),
    ('x02', '8985720770049096'),
]
data += [
    ('f01', element_name)
    for element_name in (
        'R8980010866046866_column_search_current_column',
        'R8980010866046866_search_field',
        'R8980010866046866_row_select',
    )
]
data += [('f02', element_value) for element_value in ('', '', '50')]

# p_json carries the search items plus the protected/salt values read
# from the landing page
p_json_template = '{"pageItems":{"itemsToSubmit":[{"n":"P203_LASTNAME","v":"%s"},{"n":"P203_FIRSTNAME","v":""},{"n":"P203_SEARCH_CRITERIA","v":"1"}],"protected":"%s","rowVersion":"","formRegionChecksums":[]},"salt":"%s"}'
data.append(
    (
        'p_json',
        p_json_template % (
            letter,
            d_inputs['pPageItemsProtected'],
            d_inputs['pSalt'],
        ),
    )
)

# POST the payload and dump the raw response
r = requests.post(
    'https://case.occ.ok.gov/ords/wwv_flow.ajax',
    cookies=cookies,
    data=data,
)
print(r.text)
In the developer console, I see that this field appears when making the type of submission I want, even though it is blank in the main page:
How do I "retrieve" this field, which is necessary for the request to work?
Upvotes: 2
Views: 1455
Reputation: 764
This works for me:
import requests
import json
from bs4 import BeautifulSoup
# globals
users = []
letter = "A"


def _inputs_by_id(soup):
    """Return a dict mapping each <input> id in *soup* to its value.

    Inputs without an ``id`` attribute are skipped so they cannot raise
    ``KeyError``; inputs without a ``value`` map to ``''``.
    """
    return {
        tag['id']: tag.get('value', '')
        for tag in soup.select('input')
        if tag.has_attr('id')
    }


def _extract_users(soup):
    """Return one ``{'name', 'surname'}`` dict per data row found in *soup*.

    Looks for the first ``<table class="a-IRR-table">``; returns an empty
    list when the table is absent. Rows with fewer than two ``<td>`` cells
    (e.g. header rows) are ignored.
    """
    table = soup.find('table', {'class': 'a-IRR-table'})
    if table is None:
        return []
    found = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Both cells[0] and cells[1] are read, so require at least two.
        if len(cells) >= 2:
            found.append({'name': cells[0].text, 'surname': cells[1].text})
    return found


# session (keeps cookies between the requests below)
session = requests.Session()

# get page
auth = session.get('https://case.occ.ok.gov/ords/f?p=1004:203')
soup = BeautifulSoup(auth.text, 'html.parser')
d_inputs = _inputs_by_id(soup)

# create params
params = {
    'p_flow_id': d_inputs['pFlowId'],
    'p_flow_step_id': d_inputs['pFlowStepId'],
    'p_instance': d_inputs['pInstance'],
    'p_debug': '',
    'p_request': 'Search',
    'p_reload_on_submit': d_inputs['pReloadOnSubmit'],
    'p_page_submission_id': d_inputs['pPageSubmissionId'],
    'p_json': json.dumps({
        "pageItems": {
            "itemsToSubmit": [
                {"n": "P203_LASTNAME", "v": "{}".format(letter)},
                {"n": "P203_FIRSTNAME", "v": ""},
                {"n": "P203_SEARCH_CRITERIA", "v": "1"},
            ],
            "protected": d_inputs['pPageItemsProtected'],
            "rowVersion": "",
            "formRegionChecksums": [],
        },
        "salt": d_inputs['pSalt'],
    }),
}

# Send request to APEX
session.post(
    'https://case.occ.ok.gov/ords/wwv_flow.accept', data=params
)

# get page with data (first)
data_page = session.get(
    'https://case.occ.ok.gov/ords/f?p=1004:203:{}::NO:::'.format(
        d_inputs['pInstance']
    )
)
table_soup = BeautifulSoup(data_page.text, 'html.parser')

# new params
d_inputs = _inputs_by_id(table_soup)
# The page embeds a JSON argument inside an "interactiveReport(...)" call;
# cut it out of the raw HTML to read the ajaxIdentifier.
json_ajax_data = json.loads(data_page.text.split(
    'interactiveReport('
)[1].split(');})();')[0])

# get data for next pages
params_news = {
    'p_flow_id': params['p_flow_id'],
    'p_flow_step_id': params['p_flow_step_id'],
    'p_instance': params['p_instance'],
    'p_debug': '',
    'p_request': 'PLUGIN={}'.format(json_ajax_data['ajaxIdentifier']),
    'p_widget_name': 'worksheet',
    'p_widget_mod': 'ACTION',
    'p_widget_action': 'PAGE',
    'p_widget_action_mod': 'pgR_min_row=51max_rows=50rows_fetched=50',
    'p_widget_num_return': 50,
    # NOTE(review): x01 and x02 both carry the worksheet id here; confirm
    # whether x02 is expected to be a different (report) id.
    'x01': d_inputs['R8980010866046866_worksheet_id'],
    'x02': d_inputs['R8980010866046866_worksheet_id'],
    'p_json': params['p_json'],
}

# get next page data
next_page = session.post(
    'https://case.occ.ok.gov/ords/wwv_flow.ajax', data=params_news
)
next_page_soup = BeautifulSoup(next_page.text, 'html.parser')

# parse rows: first page, then the next page.
# The next-page response is parsed from next_page_soup (not table_soup), so
# the rows fetched by the AJAX call are actually collected.
# NOTE(review): assumes the AJAX response uses the same a-IRR-table markup
# as the full page; if it does not, no rows are added for it.
users.extend(_extract_users(table_soup))
users.extend(_extract_users(next_page_soup))
print(users)
[
{'name': 'ANDERSON', 'surname': 'MICHAEL L AND KAREN'},
{'name': 'ALVAREZ', 'surname': 'PETRA'},
...
]
Upvotes: 5