MITHU

Reputation: 164

Failed to identify the reason why my script is missing a few results while scraping a webpage

I've written a script in Python to scrape consultant links from this webpage, filtered by the country United States in the left sidebar. The webpage reports 2,025 results, but when I run the script I always get 2,016 results, 9 fewer than expected. How can I retrieve the missing results?

import time
import json
import requests
from pprint import pprint
from bs4 import BeautifulSoup

url = 'https://appexchange.salesforce.com/consulting'   # page that seeds the ViewState tokens
link = 'https://appexchange.salesforce.com/appxstore'   # Ajax endpoint that returns the result tiles

# form data for the first Ajax call: applies the United States country filter
filter_payload = {
    'AJAXREQUEST': '_viewRoot',
    'j_id0:AppxLayout:j_id1007:j_id1008:filtersForm': 'j_id0:AppxLayout:j_id1007:j_id1008:filtersForm',
    'store-certified experts-option1': 'on',
    'store-certified experts-option2': 'on',
    'store-certified experts-option3': 'on',
    'store-certified experts-option4': 'on',
    'store-certified experts-option5': 'on',
    'store-certified experts-option6': 'on',
    'store-certified experts-option7': 'on',
    'store-ratings-option1': 'on',
    'store-ratings-option2': 'on',
    'store-ratings-option3': 'on',
    'store-ratings-option4': 'on',
    'store-ratings-option5': 'on',
    'store-ratings-option6': 'on',
    'com.salesforce.visualforce.ViewState': '',
    'com.salesforce.visualforce.ViewStateVersion': '',
    'com.salesforce.visualforce.ViewStateMAC': '',
    'j_id0:AppxLayout:j_id1007:j_id1008:filtersForm:j_id1019': 'j_id0:AppxLayout:j_id1007:j_id1008:filtersForm:j_id1019',
    'isReset': 'false',
    'filtersUrl': '40,41,42,43,44,45,46,rt5,rt4,rt3,rt2,rt1,rt0,co=co-US,Choose...'
}
# form data for the follow-up "load more" Ajax calls
payload = {
    'AJAXREQUEST': '_viewRoot',
    'j_id0:AppxLayout:actionsForm': 'j_id0:AppxLayout:actionsForm',
    'com.salesforce.visualforce.ViewState': '',
    'com.salesforce.visualforce.ViewStateVersion': '',
    'com.salesforce.visualforce.ViewStateMAC': '',
    'j_id0:AppxLayout:actionsForm:j_id5036': 'j_id0:AppxLayout:actionsForm:j_id5036',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
}


def update_viewstate(soup, target_payload):
    # copy the current ViewState tokens from the response into the next request's payload
    for key in ['ViewState', 'ViewStateVersion', 'ViewStateMAC']:
        target_payload[f'com.salesforce.visualforce.{key}'] = soup.select_one(f"[id='com.salesforce.visualforce.{key}']")['value']


with requests.Session() as session:
    session.headers.update(headers)
    res = session.get(url)
    soup = BeautifulSoup(res.text, "xml")
    update_viewstate(soup, filter_payload)  # seed the ViewState tokens for the first filtered request

    while True:
        resp = session.post(link, data=filter_payload)
        soup_obj = BeautifulSoup(resp.text, "xml")
        if not soup_obj.select_one("a.appx-tile-consultant"):
            print("No more consultant links found.")
            break

        for item in soup_obj.select("a.appx-tile-consultant"):
            print(item.get('href'))

        update_viewstate(soup_obj, payload)
        filter_payload = payload.copy()  # subsequent requests post the "load more" payload with fresh ViewState tokens

Upvotes: 0

Views: 93

Answers (1)

Bako

Reputation: 19

The Ajax call returns different responses in some cases, and the last response needs additional Ajax rendering that BeautifulSoup cannot perform, so I recommend using Selenium instead. It also makes it easy to debug what is going on during the scrape, and in my case it ran faster as well. Here is the code:

from seleniumwire import webdriver  # selenium-wire (pip install selenium-wire) is needed for request_interceptor below
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

URL = "https://appexchange.salesforce.com/consulting"

chrome_options = Options()
chrome_options.add_argument("--headless")  # comment this out to watch the scrape in a visible browser
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

service = Service("./chromedriver")  # path to the chromedriver executable ("chromedriver.exe" on Windows)
driver = webdriver.Chrome(service=service, options=chrome_options)

def interceptor(request):
    # rewrite the User-Agent header on every request (a selenium-wire feature)
    del request.headers['User-Agent']
    request.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'

driver.request_interceptor = interceptor

try:
    driver.get(URL)

    wait = WebDriverWait(driver, 10)

    consultant_links = set()
    counter = 0
    
    # select the "United States" option in the country filter
    usOnly = driver.find_element(By.CSS_SELECTOR, "option[value='co=co-US']")
    driver.execute_script("return arguments[0].selected=true;", usOnly)

    # apply the filter and wait for the result list to refresh
    filterButton = driver.find_element(By.CSS_SELECTOR, "span.appx-button-filter-apply-long")
    driver.execute_script("arguments[0].click();", filterButton)
    WebDriverWait(driver, 10).until(EC.staleness_of(filterButton))

    while True:
        try:
            # keep clicking "Load more" until the button disappears
            load_more = driver.find_element(By.CSS_SELECTOR, "button.appx-load-more-button")
            driver.execute_script("arguments[0].click();", load_more)
            WebDriverWait(driver, 10).until(EC.staleness_of(load_more))
        except Exception:
            # no "Load more" button left: collect every consultant tile on the page
            consultants = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.appx-tile-consultant")))

            for consultant in consultants:
                link = consultant.get_attribute("href")
                if link not in consultant_links:
                    consultant_links.add(link)
                    counter += 1
                    print(link)
            break 

    print(f"Total consultants found: {counter}")

finally:
    driver.quit() 

If you like, just comment out the headless option line and you will see the whole scrape process.

For this script to work, you also need to download ChromeDriver and put the executable in the same directory as this script. Example download link:

https://chromedriver.storage.googleapis.com/index.html?path=2.33/
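
If you are on Selenium 4.6 or newer, Selenium Manager (bundled with Selenium itself) can usually resolve a matching ChromeDriver for you, so the manual download may not be needed. A minimal sketch, assuming plain Selenium and setting the User-Agent through a Chrome option instead of the selenium-wire interceptor above:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
# set the User-Agent directly instead of rewriting it per request
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                     "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36")

# no Service path given: Selenium Manager downloads a matching chromedriver
# automatically when none is found on PATH (Selenium >= 4.6)
driver = webdriver.Chrome(options=options)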

By the way, you will see that only 2,016 results are actually rendered, not 2,025, so that number is probably a problem on their side.
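
If you want to double-check that, you can compare the total the page advertises with the tiles that are actually rendered once "Load more" is exhausted. A minimal sketch to drop into the try block after the while loop; the div.appx-results-count selector for the results-count label is only a guess, so adjust it to whatever element displays "2,025 Results" on the page:

    import re

    tiles = driver.find_elements(By.CSS_SELECTOR, "a.appx-tile-consultant")
    # hypothetical selector for the element that shows the advertised total; adjust as needed
    label = driver.find_element(By.CSS_SELECTOR, "div.appx-results-count").text
    advertised = int(re.sub(r"\D", "", label))  # e.g. "2,025 Results" -> 2025
    print(f"advertised: {advertised}, actually rendered: {len(tiles)}")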

Upvotes: 2
