Reputation: 164
I've created a script in Python to scrape consultant links from this webpage based on the country filter United States, located in the left sidebar. The webpage shows 2,025 results. However, when I run the script, I always get 2,016 results, 9 fewer than expected. How can I retrieve the missing results?
import time
import json
import requests
from pprint import pprint
from bs4 import BeautifulSoup

url = 'https://appexchange.salesforce.com/consulting'
link = 'https://appexchange.salesforce.com/appxstore'

# Payload that applies the United States country filter (co=co-US)
filter_payload = {
    'AJAXREQUEST': '_viewRoot',
    'j_id0:AppxLayout:j_id1007:j_id1008:filtersForm': 'j_id0:AppxLayout:j_id1007:j_id1008:filtersForm',
    'store-certified experts-option1': 'on',
    'store-certified experts-option2': 'on',
    'store-certified experts-option3': 'on',
    'store-certified experts-option4': 'on',
    'store-certified experts-option5': 'on',
    'store-certified experts-option6': 'on',
    'store-certified experts-option7': 'on',
    'store-ratings-option1': 'on',
    'store-ratings-option2': 'on',
    'store-ratings-option3': 'on',
    'store-ratings-option4': 'on',
    'store-ratings-option5': 'on',
    'store-ratings-option6': 'on',
    'com.salesforce.visualforce.ViewState': '',
    'com.salesforce.visualforce.ViewStateVersion': '',
    'com.salesforce.visualforce.ViewStateMAC': '',
    'j_id0:AppxLayout:j_id1007:j_id1008:filtersForm:j_id1019': 'j_id0:AppxLayout:j_id1007:j_id1008:filtersForm:j_id1019',
    'isReset': 'false',
    'filtersUrl': '40,41,42,43,44,45,46,rt5,rt4,rt3,rt2,rt1,rt0,co=co-US,Choose...'
}

# Payload that requests the next page of results
payload = {
    'AJAXREQUEST': '_viewRoot',
    'j_id0:AppxLayout:actionsForm': 'j_id0:AppxLayout:actionsForm',
    'com.salesforce.visualforce.ViewState': '',
    'com.salesforce.visualforce.ViewStateVersion': '',
    'com.salesforce.visualforce.ViewStateMAC': '',
    'j_id0:AppxLayout:actionsForm:j_id5036': 'j_id0:AppxLayout:actionsForm:j_id5036',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
}

def update_viewstate(soup, target_payload):
    # Copy the current Visualforce ViewState fields into the payload for the next POST
    for key in ['ViewState', 'ViewStateVersion', 'ViewStateMAC']:
        target_payload[f'com.salesforce.visualforce.{key}'] = soup.select_one(f"[id='com.salesforce.visualforce.{key}']")['value']

with requests.Session() as session:
    session.headers.update(headers)
    res = session.get(url)
    soup = BeautifulSoup(res.text, "xml")
    update_viewstate(soup, filter_payload)
    while True:
        resp = session.post(link, data=filter_payload)
        soup_obj = BeautifulSoup(resp.text, "xml")
        if not soup_obj.select_one("a.appx-tile-consultant"):
            print("No more consultant links found.")
            break
        for item in soup_obj.select("a.appx-tile-consultant"):
            print(item.get('href'))
        update_viewstate(soup_obj, payload)
        filter_payload = payload.copy()
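For reference, this is how I tally the total; a minimal sketch, assuming the raw response bodies from the loop above are collected into a list first (e.g. pages.append(resp.text) inside the while loop):

from bs4 import BeautifulSoup

def count_unique_links(pages):
    # Tally distinct consultant hrefs across all collected response bodies
    seen = set()
    for page in pages:
        soup = BeautifulSoup(page, "xml")
        for a in soup.select("a.appx-tile-consultant"):
            seen.add(a.get('href'))
    return len(seen)

This is what consistently reports 2,016 unique links instead of the expected 2,025.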
Upvotes: 0
Views: 93
Reputation: 19
The Ajax call returns different responses in some cases, and the last response sometimes needs additional Ajax rendering that BeautifulSoup cannot perform, so I recommend using Selenium: you can easily debug what is going on during the scrape, and it is much faster to get working. Here is the code:
from seleniumwire import webdriver  # selenium-wire is required for request_interceptor
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

URL = "https://appexchange.salesforce.com/consulting"

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

service = Service("chromedriver")
driver = webdriver.Chrome(service=service, options=chrome_options)

def interceptor(request):
    # Swap the headless default User-Agent for a regular browser one
    del request.headers['User-Agent']
    request.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'

driver.request_interceptor = interceptor

try:
    driver.get(URL)
    wait = WebDriverWait(driver, 10)
    consultant_links = set()
    counter = 0

    # Apply the "United States" country filter from the left sidebar
    usOnly = driver.find_element(By.CSS_SELECTOR, "option[value='co=co-US']")
    driver.execute_script("return arguments[0].selected=true;", usOnly)
    filterButton = driver.find_element(By.CSS_SELECTOR, "span.appx-button-filter-apply-long")
    driver.execute_script("arguments[0].click();", filterButton)
    WebDriverWait(driver, 10).until(EC.staleness_of(filterButton))

    while True:
        try:
            # Keep clicking "Load More" until the button is gone
            load_more = driver.find_element(By.CSS_SELECTOR, "button.appx-load-more-button")
            driver.execute_script("arguments[0].click();", load_more)
            WebDriverWait(driver, 10).until(EC.staleness_of(load_more))
        except Exception:
            # All results are rendered; collect the unique consultant links
            consultants = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.appx-tile-consultant")))
            for consultant in consultants:
                link = consultant.get_attribute("href")
                if link not in consultant_links:
                    consultant_links.add(link)
                    counter += 1
                    print(link)
            break

    print(f"Total consultants found: {counter}")
finally:
    driver.quit()
If you like, just comment out the headless line in the options and you will see the whole scrape process.
For this script to work you also need to download the Chrome web driver and put the executable in the same directory as this script. Example download link:
https://chromedriver.storage.googleapis.com/index.html?path=2.33/
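Alternatively, if you are on Selenium 4.6 or newer, the bundled Selenium Manager can resolve a matching driver automatically, so the manual download can be skipped. A minimal sketch with plain Selenium (without the selenium-wire interceptor above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
# No Service path needed: Selenium Manager fetches a matching chromedriver on first run
driver = webdriver.Chrome(options=options)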
By the way, you will see that only 2,016 results are actually rendered, not 2,025, so that number is probably a problem on their side.
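If you want to verify that yourself, you can compare the count the page claims against the tiles actually rendered after the last Load More click. A sketch that assumes the driver from the script above is still open; the header selector here is an assumption, so inspect the page for the real one:

# hypothetical selector for the '2,025 Results' label -- adjust to the actual markup
label = driver.find_element(By.CSS_SELECTOR, ".appx-searchresults-header").text
tiles = driver.find_elements(By.CSS_SELECTOR, "a.appx-tile-consultant")
print(f"page claims: {label!r}, actually rendered: {len(tiles)}")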
Upvotes: 2