avgjoe13
avgjoe13

Reputation: 115

Scrape the text below a header while matching index with header value using BeautifulSoup and pandas

I was able to match each header's index with the index of its text in the code below. What I can't figure out is how to append np.NaN when a header is missing from the soup. This is a follow-up to my previous question.

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

def getAndParseURL(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze a multi-URL scrape.

    Returns:
        A ``BeautifulSoup`` object built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    result = requests.get(url, timeout=timeout)
    return BeautifulSoup(result.text, 'html.parser')

# Pages to scrape. Each result list below is meant to end up with exactly one
# entry per URL, so that list indexes line up with urls_test.
urls_test = ['https://www.example.com/',
            'https://www.example.com/']

# One accumulator list per field of interest.
engine = []
trans = []
color = []
interior = []

for url in urls_test:
    soup = getAndParseURL(url)
    # Despite its name, `ul` holds the <li> elements matched inside the
    # breakdown list, not the <ul> element itself.
    ul   = soup.select('ul[class="list-inline lot-breakdown-list"] li', recursive=True)
    lis_e0 = []  # per <li>: first child of its <h5> (the header text)
    lis_e1 = []  # per <li>: second child of the <li> -- presumably the value after the <h5>; verify against the page markup
    if ul:
        for li in ul:
            lis0 = []
            lis1 = []
            # Not guarded by a try: an <li> without an <h5> would raise here.
            lis0.append(li.h5.contents[0])
            lis1.append(li.contents[1])
            lis_e0.extend(lis0) 
            lis_e1.extend(lis1) 
        # For each field: append the value of every <li> whose header contains
        # the keyword. The `except` branch only runs if the loop raises; when
        # no header matches, nothing is appended at all, so the list ends up
        # with fewer entries than urls_test. A page with several matching
        # headers appends several values.
        # NOTE(review): `np.NaN` is an alias of `np.nan` that NumPy 2.0 and
        # later no longer provide -- confirm against the installed version.
        try:        
            for i in range(min(len(lis_e1), len(lis_e0))):
                if 'Engine' in lis_e0[i]:
                    engine.append(lis_e1[i])   
        except:
            engine.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Trans' in x:
                    trans.append(lis_e1[i])  
        except:
            trans.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Color' in x:
                    color.append(lis_e1[i])  
        except:
            color.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Interior' in x:
                    interior.append(lis_e1[i])  
        except:
            interior.append(np.NaN)
    else:
        # The selector matched no <li> on this page: record NaN for every
        # field so this URL still gets one entry per list.
        engine.append(np.NaN)
        trans.append(np.NaN)
        color.append(np.NaN)
        interior.append(np.NaN)

# Bare names: these only display a value in an interactive session or
# notebook; run as a script they print nothing.
engine
trans
color
interior

# Each length should equal len(urls_test) if every URL contributed one entry.
print(str(len(engine)))
print(str(len(trans)))
print(str(len(color)))
print(str(len(interior)))

Out:
['383 CI']
['Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
1
1
2
2

Below is the output I'm looking for (the for loop for 'engine' is written differently but should work the same way). The length of each list has to match the number of URLs; otherwise the list indexes won't correspond to the correct URL when scraping multiple URLs. Thanks for taking the time!

['NaN', '383 CI']
['NaN', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
2
2
2
2

Upvotes: 1

Views: 167

Answers (1)

KunduK
KunduK

Reputation: 33384

Use a try..except block for each field, so that a placeholder is appended whenever the header is missing from the page.

import requests
from bs4 import BeautifulSoup

def getAndParseURL(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze a multi-URL scrape.

    Returns:
        A ``BeautifulSoup`` object built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    result = requests.get(url, timeout=timeout)
    return BeautifulSoup(result.text, 'html.parser')

# Pages to scrape; every result list below receives exactly one entry per URL.
urls_test = ['https://www.example.com/',
            'https://www.example.com/']

engine = []
trans = []
color = []
interior = []

# Map each <h5> header text to the list that collects its value, so the lookup
# is written once instead of being copy-pasted for every field.
fields = {'Engine': engine, 'Trans': trans, 'Color': color, 'Interior': interior}

for url in urls_test:
    soup = getAndParseURL(url)
    # First <li> of the breakdown list; None when the page has no such list.
    first_li = soup.select_one('ul[class="list-inline lot-breakdown-list"]>li')

    for header, values in fields.items():
        try:
            # Two steps forward in document order from the matching <h5>:
            # presumably past its own text node to the value that follows it.
            value = first_li.find_next('h5', text=header).next_element.next_element
        except AttributeError:
            # The list or this header is missing (a lookup returned None).
            # Append a placeholder so the list keeps one entry per URL.
            value = "Nan"
        values.append(value)

print(engine)
print(trans)
print(color)
print(interior)

Output:

['Nan', '383 CI']
['Nan', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']

To load the results into a DataFrame:

# pandas is not among the imports of the snippet above, so bring it in here.
import pandas as pd

# One row per URL and one column per scraped field; the four lists must all
# have the same length for this to succeed.
df = pd.DataFrame({"Engine": engine, "Trans": trans, "Color": color, "Interior": interior})
print(df)

Output:

            Color  Engine Interior      Trans
0           Green     Nan    Black        Nan
1  Curious Yellow  383 CI    Black  Automatic

Upvotes: 1

Related Questions