David Yang
David Yang

Reputation: 2141

Python Requests: How to get value of Blank Hidden Input

I'm trying to scrape this site: https://case.occ.ok.gov/ords/f?p=1004:203

The missing piece of the puzzle is figuring out how to "get" the p_request parameter in the data payload prior to making the final request. This field comes up empty when looking at the "main" page, so I cannot use that page to obtain a value to pass through to my POST request.

The code below doesn't work because I have a blank p_request parameter in the payload, although I know through testing with developer console that it will work if I am able to get the p_request field.

# Load the search page; its <input> fields seed the AJAX payload below
url = 'https://case.occ.ok.gov/ords/f?p=1004:203'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

# Carry over the two cookies returned with the first response
cookies = {
    cookie_name: r.cookies.get(cookie_name)
    for cookie_name in ('ORA_WWV_APP_1004', 'X-Oracle-BMC-LBS-Route')
}

# Map each input's id to its value ('' when the tag has no value attribute)
d_inputs = {tag['id']: tag.get('value', '') for tag in soup.select('input')}

# p_json template: the three %s slots take, in order, the last-name filter,
# the pPageItemsProtected input value and the pSalt input value
p_json_template = (
    '{"pageItems":{"itemsToSubmit":['
    '{"n":"P203_LASTNAME","v":"%s"},'
    '{"n":"P203_FIRSTNAME","v":""},'
    '{"n":"P203_SEARCH_CRITERIA","v":"1"}],'
    '"protected":"%s","rowVersion":"","formRegionChecksums":[]},'
    '"salt":"%s"}'
)

# Form fields as (name, value) pairs; a list rather than a dict because
# the f01 and f02 names are repeated
data = [
    ('p_flow_id', '1004'),
    ('p_flow_step_id', '203'),
    ('p_instance', '%s' % d_inputs['pInstance']),
    ('p_debug', ''),
    ('p_request', ''),  # sent blank: the page's inputs give no value for it
    ('p_widget_name', 'worksheet'),
    ('p_widget_mod', 'PULL'),
    ('p_widget_action', ''),
    ('p_widget_num_return', '100000'),
    ('x01', '8980043036046866'),
    ('x02', '8985720770049096'),
    ('f01', 'R8980010866046866_column_search_current_column'),
    ('f01', 'R8980010866046866_search_field'),
    ('f01', 'R8980010866046866_row_select'),
    ('f02', ''),
    ('f02', ''),
    ('f02', '50'),
    # `letter` is not defined in this snippet; it must already be in scope
    (
        'p_json',
        p_json_template % (
            letter,
            d_inputs['pPageItemsProtected'],
            d_inputs['pSalt'],
        ),
    ),
]

# POST the payload to the AJAX endpoint and print the raw response body
r = requests.post(
    'https://case.occ.ok.gov/ords/wwv_flow.ajax',
    cookies=cookies,
    data=data,
)
print(r.text)

In the developer console, I see that this field is populated when I make the type of submission I want, even though it is blank in the main page:

screenshot of dev console

How do I "retrieve" this field, which is necessary for the request to work?

Upvotes: 2

Views: 1455

Answers (1)

That works for me:

import requests
import json
from bs4 import BeautifulSoup

# globals
users = []     # collected {'name', 'surname'} records
letter = "A"   # value submitted as the P203_LASTNAME search item

# session: a single Session is used for every request so that cookies
# received on the first GET are sent with the later POST/GET calls
session = requests.Session()

# get page
auth = session.get('https://case.occ.ok.gov/ords/f?p=1004:203')
soup = BeautifulSoup(auth.text, 'html.parser')

# map each <input> id to its value ('' when the tag has no value attribute)
inputs = soup.select('input')
d_inputs = {i['id']: i.get('value', '') for i in inputs}

# create params: the form fields for the "Search" submission, copied from
# the inputs of the page that was just loaded
params = {
    'p_flow_id': d_inputs['pFlowId'],
    'p_flow_step_id': d_inputs['pFlowStepId'],
    'p_instance': d_inputs['pInstance'],
    'p_debug': '',
    'p_request': 'Search',
    'p_reload_on_submit': d_inputs['pReloadOnSubmit'],
    'p_page_submission_id': d_inputs['pPageSubmissionId'],
    'p_json': json.dumps({
        "pageItems": {
            "itemsToSubmit": [
                {"n": "P203_LASTNAME", "v": "{}".format(letter)},
                {"n": "P203_FIRSTNAME", "v": ""},
                {"n": "P203_SEARCH_CRITERIA", "v": "1"}
            ],
            "protected": d_inputs['pPageItemsProtected'],
            "rowVersion": "",
            "formRegionChecksums": []
        },
        "salt": d_inputs['pSalt']
    })
}

# Send request to APEX (the response body is not used; the results are
# read from the page fetched next)
session.post(
    'https://case.occ.ok.gov/ords/wwv_flow.accept', data=params
)

# get page with data (first)
data_page = session.get(
    'https://case.occ.ok.gov/ords/f?p=1004:203:{}::NO:::'.format(
        d_inputs['pInstance']
    )
)

table_soup = BeautifulSoup(data_page.text, 'html.parser')

# new params: re-read the inputs from the results page
inputs = table_soup.select('input')
d_inputs = {i['id']: i.get('value', '') for i in inputs}
# The page source contains an `interactiveReport({...});})();` call; the
# text between those two markers is parsed as JSON to get 'ajaxIdentifier'
json_ajax_data = json.loads(data_page.text.split(
    'interactiveReport('
)[1].split(');})();')[0])

# get data for next pages
params_news = {
    'p_flow_id': params['p_flow_id'],
    'p_flow_step_id': params['p_flow_step_id'],
    'p_instance': params['p_instance'],
    'p_debug': '',
    'p_request': 'PLUGIN={}'.format(json_ajax_data['ajaxIdentifier']),
    'p_widget_name': 'worksheet',
    'p_widget_mod': 'ACTION',
    'p_widget_action': 'PAGE',
    # asks for 50 rows starting at row 51, i.e. the second page
    'p_widget_action_mod': 'pgR_min_row=51max_rows=50rows_fetched=50',
    'p_widget_num_return': 50,
    'x01': d_inputs['R8980010866046866_worksheet_id'],
    # NOTE(review): x02 reuses the worksheet id; the browser request quoted
    # in the question sends a different value for x02 than for x01 --
    # confirm whether a separate report-id input should be used here.
    'x02': d_inputs['R8980010866046866_worksheet_id'],
    'p_json': params['p_json']
}

# get next page data
next_page = session.post(
    'https://case.occ.ok.gov/ords/wwv_flow.ajax', data=params_news
)

# Parse the AJAX response itself. Searching `table_soup` here would
# re-read the first results page instead of the page just requested.
next_page_soup = BeautifulSoup(next_page.text, 'html.parser')
next_page_table_with_data = next_page_soup.find(
    'table', {'class': 'a-IRR-table'}
)
# find() returns None when the response holds no such table; treat that as
# "no rows" rather than failing on None.find_all
if next_page_table_with_data is not None:
    next_page_rows = next_page_table_with_data.find_all('tr')
else:
    next_page_rows = []

# parse rows: rows without at least two <td> cells (e.g. header rows) are
# skipped, since both cell 0 and cell 1 are read
for row_next_page in next_page_rows:
    cells_next_page = row_next_page.find_all('td')
    if len(cells_next_page) >= 2:
        users.append(
            {
                'name': cells_next_page[0].text,
                'surname': cells_next_page[1].text
            }
        )

print(users)
[
   {'name': 'ANDERSON', 'surname': 'MICHAEL L AND KAREN'}, 
   {'name': 'ALVAREZ', 'surname': 'PETRA'},
   ...
]

Upvotes: 5

Related Questions