Reputation: 415
I'm trying to do a scrape where the landing page has various links (the five sub-categories at the top): https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html
Within each of these categories is a list of products: https://mcavoyguns.co.uk/contents/en-uk/d411_Browning_B725_Shotguns.html
Each product listed has a link to further details (a direct link to the product's individual page): https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html
The scrape I've put together so far gets as far as creating a list of all the individual page links required. But when I loop over each individual product link for data, I can't seem to get BeautifulSoup to parse the data from those links. It's as though it stays on the previous page (if you will).
What am I missing to allow for that second "bounce" to the "product_link" address (e.g. https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html) so I can scrape the data from there? I thought I might need to add a time.sleep(5) timer to allow everything to load, but I'm still getting nothing.
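For reference, a quick sanity check (a minimal sketch, reusing the selector from the script below) to confirm whether the element exists in the raw requests response at all:

import requests
from bs4 import BeautifulSoup

# Quick diagnostic: fetch one product page with requests alone and look for the
# same div the main script targets. If this prints None, the data is not in the
# static HTML that requests sees.
url = "https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")
print(soup.find("div", "GC75 ProductChoiceName"))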
Code:
from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas
import numpy as np
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select

os.environ["PYTHONIOENCODING"] = "utf-8"

#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
time.sleep(2)

all_Outlinks = []
all_links = []

soup = BeautifulSoup(browser.page_source, features="lxml")
submenuFind = soup.find("div", "idx2Submenu")
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub")

for submenuItem in submenuItems:
    for link in submenuItem.select('a[href]'):
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href'])
#print(all_Outlinks)

for a_link in all_Outlinks:
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'html.parser')
    pageLinkDivs = soup.find_all("div", "column full")
    for pageLinkDiv in pageLinkDivs:
        for pageLink in pageLinkDiv.select('a[href]'):
            all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])
#print(all_links)

for product_link in all_links:
    time.sleep(5)
    resSecond = requests.get(product_link)
    soup = BeautifulSoup(resSecond.text, 'html.parser')
    model = soup.find("div", "GC75 ProductChoiceName")
    print(model)
PS Apologies for the extra imports. They were copied and pasted from a previous script and will be removed once confirmed as not required.
Upvotes: 0
Views: 1539
Reputation: 84475
That info is pulled dynamically from a script tag when the page is rendered in the browser. Because you are using requests, it will not be where you are looking for it; instead, pull the info from the script tag directly.
In this case, I pull all the info related to a given model from within the script and generate a DataFrame. I convert the string inside the script tag to a Python object with ast.literal_eval, then add the product URL and product title to the DataFrame.
Each df is added to a list, which is then concatenated into a final DataFrame. As I don't know which final header names would be required, I have left some with their default names.
I have added in handling for the case(s) where there are no model options listed for the given product.
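At its core the extraction is just this (a minimal sketch using the same selector and regex as the full script below; it assumes the product page actually contains a .ProductOptions script tag):

import ast
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Example product page taken from the question.
product_url = "https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html"
soup = BeautifulSoup(requests.get(product_url).text, 'lxml')
# The model options sit as a [[...]] literal inside a script tag under .ProductOptions.
script_text = soup.select_one('.ProductOptions script').string
options = ast.literal_eval(re.search(r'(\[\[.*\]\])', script_text).group(1))
df = pd.DataFrame(options)
print(df)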
from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas as pd
import numpy as np
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select
import ast

os.environ["PYTHONIOENCODING"] = "utf-8"

#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
time.sleep(2)

all_Outlinks = []
all_links = []

soup = BeautifulSoup(browser.page_source, features="lxml")
submenuFind = soup.find("div", "idx2Submenu")
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub")

for submenuItem in submenuItems:
    for link in submenuItem.select('a[href]'):
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href'])
#print(all_Outlinks)

with requests.Session() as s:
    for a_link in all_Outlinks:
        res = requests.get(a_link)
        soup = BeautifulSoup(res.text, 'html.parser')
        pageLinkDivs = soup.find_all("div", "column full")
        for pageLinkDiv in pageLinkDivs:
            for pageLink in pageLinkDiv.select('a[href]'):
                all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])

    results = []
    for product_link in all_links:
        # print(product_link)
        resSecond = s.get(product_link)
        soup = BeautifulSoup(resSecond.text, 'html.parser')
        title = soup.select_one('.ProductTitle').text
        try:
            df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).groups(0)[0]))
            df.iloc[:, -1] = product_link
        except:
            placeholder = ['No options listed'] * 8
            placeholder.append(product_link)
            df = pd.DataFrame([placeholder])
        df.insert(0, 'title', title)
        #print(df) # add headers you care about to df or do that at end on full list
        results.append(df)

final = pd.concat(results)  # or add header here
print(final)
You could then look at speeding things up and tidying the code:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import locale
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import ast
from multiprocessing import Pool, cpu_count


def get_models_df(product_link):
    res = requests.get(product_link)
    soup = BeautifulSoup(res.text, 'lxml')
    title = soup.select_one('.ProductTitle').text
    try:
        df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).groups(0)[0]))
        df.iloc[:, -1] = product_link
    except:
        placeholder = ['No options listed'] * 8
        placeholder.append(product_link)
        df = pd.DataFrame([placeholder])
    df.insert(0, 'title', title)
    return df


def get_all_pages(a_link):
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'lxml')
    all_links = ["https://mcavoyguns.co.uk/contents/en-uk/" + i['href'] for i in soup.select('.center-content > a')]
    return all_links


if __name__ == '__main__':
    os.environ["PYTHONIOENCODING"] = "utf-8"
    #selenium requests
    browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
    browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")

    all_outlinks = [i.get_attribute('href') for i in WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".idx2Submenu a")))]
    browser.quit()

    with Pool(cpu_count() - 1) as p:
        nested_links = p.map(get_all_pages, all_outlinks)
        flat_list = [link for links in nested_links for link in links]
        results = p.map(get_models_df, flat_list)
        final = pd.concat(results)
        #print(final)
        final.to_csv('guninfo.csv', encoding='utf-8-sig', index=False)
So, as I said I would, I had a look at the other requested items and they are indeed available with just requests. A few things needed handling along the way, for example nan values in the detailed description tables (replaced with 'N/A' below). Some header naming and tidy-up is still TODO, but the following pulls the extra fields:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import locale
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import ast
from multiprocessing import Pool, cpu_count
import numpy as np
import unicodedata


def get_models_df(product_link):
    resSecond = requests.get(product_link)
    soup = BeautifulSoup(resSecond.text, 'lxml')
    title = soup.select_one('.ProductTitle').text
    try:
        df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).groups(0)[0]))
    except:
        placeholder = ['No options listed'] * 8
        df = pd.DataFrame([placeholder])
    df.insert(0, 'title', title)

    df['price'] = ' '.join([soup.select_one("[property='product:price:amount']")['content'],
                            soup.select_one("[property='product:price:currency']")['content']])
    df['weight'] = ' '.join([soup.select_one("[property='product:weight:value']")['content'],
                             soup.select_one("[property='product:weight:units']")['content']])

    output_headers = ['Action frame', 'Barrel', 'Barrel finish', 'Barrel length',
                      'Barrel length (mm-inch)', 'Buttstock', 'Calibre', 'Chokes', 'Code',
                      'Drop at comb', 'Drop at heel', 'Forearm', 'Length', 'N/A', 'Notes',
                      'Options', 'Packaging', 'Sights', 'Stock style', 'Top rib', 'Weight', 'Wood', 'Wood grade'
                      ]
    df = pd.concat([df, pd.DataFrame(columns=output_headers)])

    try:
        description_table = pd.read_html(str(soup.select_one('.ProductDetailedDescription table, table')))[0].transpose()
        description_table.dropna(axis=0, how='all', inplace=True)
        headers = list(description_table.iloc[0, :])
        headers[:] = ['N/A' if pd.isnull(np.array([header], dtype=object)) else header for header in headers]
        for number, header in enumerate(headers):
            temp = header.lower()
            value = description_table.iloc[1, number]
            if temp == 'calibre':
                df[header] = "'" + value
            elif temp == 'top rib' and 'mm' not in value:
                df[header] = value + 'mm'
            else:
                df[header] = value
    except:
        pass  # no table

    description = soup.select_one('#ProductDetailsTab [title=More]')
    if description is None:
        desc = 'N/A'
    else:
        desc = '. '.join([i.text for i in soup.select('.ProductDescription li, .ProductDescription span') if i.text != ''])
        if desc == '':
            desc = soup.select_one('.ProductIntroduction').get_text()
    df['desc'] = unicodedata.normalize('NFKD', desc)
    df['product_link'] = product_link
    return df


def get_all_pages(a_link):
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'lxml')
    all_links = ["https://mcavoyguns.co.uk/contents/en-uk/" + i['href'] for i in soup.select('.center-content > a')]
    return all_links


if __name__ == '__main__':
    #os.environ["PYTHONIOENCODING"] = "utf-8"
    #selenium requests
    browser = webdriver.Chrome()  # executable_path='C:/Users/admin/chromedriver.exe'
    browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
    all_outlinks = [i.get_attribute('href') for i in WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".idx2Submenu a")))]
    browser.quit()

    with Pool(cpu_count() - 1) as p:
        nested_links = p.map(get_all_pages, all_outlinks)
        flat_list = [link for links in nested_links for link in links]
        results = p.map(get_models_df, flat_list)
        final = pd.concat(results)
        #print(final)
        final.to_csv('guninfo.csv', encoding='utf-8-sig', index=False)
Upvotes: 2
Reputation: 415
As QHarr pointed out, Selenium was the answer. That gave me the direction to look at the problem with fresh eyes and find the solution.
I'm posting this as my answer, but the credit goes to @QHarr for the work provided previously and the ongoing assistance that led to the solution.
from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas
import numpy as np
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select

os.environ["PYTHONIOENCODING"] = "utf-8"

#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/andrew.glass/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
time.sleep(2)

all_Outlinks = []
all_links = []

soup = BeautifulSoup(browser.page_source, features="lxml")
submenuFind = soup.find("div", "idx2Submenu")
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub")

for submenuItem in submenuItems:
    for link in submenuItem.select('a[href]'):
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href'])
#print(all_Outlinks)

for a_link in all_Outlinks:
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'html.parser')
    pageLinkDivs = soup.find_all("div", "column full")
    for pageLinkDiv in pageLinkDivs:
        for pageLink in pageLinkDiv.select('a[href]'):
            all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])
#print(all_links)

for product_link in all_links:
    browser.get(product_link)
    time.sleep(5)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    model = soup.find("div", "GC65 ProductOptions")
    modelFind = soup.find('select', attrs={'name': re.compile('model')})
    modelList = [x['origvalue'][:14] for x in modelFind.find_all('option')[1:]]
    print(modelList)
The model print is still a bit messy, but I can clean it up once all the requirements are gathered.
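For example (a suggestion of mine rather than part of the accepted approach), the per-page lists could be collected into one DataFrame keyed by product link:

# Possible tidy-up: gather the model strings into a single DataFrame instead of
# printing a raw list per page. Reuses browser, all_links, re and pandas from above.
rows = []
for product_link in all_links:
    browser.get(product_link)
    time.sleep(5)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    modelFind = soup.find('select', attrs={'name': re.compile('model')})
    if modelFind is not None:
        for option in modelFind.find_all('option')[1:]:
            rows.append({'product_link': product_link, 'model': option['origvalue'][:14].strip()})

models_df = pandas.DataFrame(rows)
print(models_df)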
Upvotes: 1