Reputation: 115
I was able to match the header index with the header text index in the code below. What I can't figure out is how to append np.NaN when the header is not in the soup. This is a follow-up to my previous question.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

def getAndParseURL(url):
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    return soup

urls_test = ['https://www.example.com/',
             'https://www.example.com/']

engine = []
trans = []
color = []
interior = []

for url in urls_test:
    soup = getAndParseURL(url)
    ul = soup.select('ul[class="list-inline lot-breakdown-list"] li', recursive=True)
    lis_e0 = []
    lis_e1 = []
    if ul:
        for li in ul:
            lis0 = []
            lis1 = []
            lis0.append(li.h5.contents[0])
            lis1.append(li.contents[1])
            lis_e0.extend(lis0)
            lis_e1.extend(lis1)
        try:
            for i in range(min(len(lis_e1), len(lis_e0))):
                if 'Engine' in lis_e0[i]:
                    engine.append(lis_e1[i])
        except:
            engine.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Trans' in x:
                    trans.append(lis_e1[i])
        except:
            trans.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Color' in x:
                    color.append(lis_e1[i])
        except:
            color.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Interior' in x:
                    interior.append(lis_e1[i])
        except:
            interior.append(np.NaN)
    else:
        engine.append(np.NaN)
        trans.append(np.NaN)
        color.append(np.NaN)
        interior.append(np.NaN)

engine
trans
color
interior

print(str(len(engine)))
print(str(len(trans)))
print(str(len(color)))
print(str(len(interior)))
Out:
['383 CI']
['Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
1
1
2
2
Below is the output I'm looking for (the for loop for 'engine' is different, but it should work the same way). The lengths have to match the number of URLs, otherwise the list indexes won't correspond to the correct URL when scraping multiple URLs. Thanks for taking the time!
['NaN', '383 CI']
['NaN', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
2
2
2
2
Upvotes: 1
Views: 167
Reputation: 33384
Use a try..except block. Each lookup raises an exception when the label is not on the page, and the except branch appends a placeholder, so every list keeps exactly one entry per URL.
import pandas as pd
import requests
from bs4 import BeautifulSoup

def getAndParseURL(url):
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    return soup

urls_test = ['https://www.example.com/',
             'https://www.example.com/']

engine = []
trans = []
color = []
interior = []

for url in urls_test:
    soup = getAndParseURL(url)
    # Each lookup raises if the <h5> label is missing, so the except branch appends a placeholder.
    try:
        engine.append(soup.select_one('ul[class="list-inline lot-breakdown-list"]>li').find_next('h5', text='Engine').next_element.next_element)
    except:
        engine.append("Nan")
    try:
        trans.append(soup.select_one('ul[class="list-inline lot-breakdown-list"]>li').find_next('h5', text='Trans').next_element.next_element)
    except:
        trans.append("Nan")
    try:
        color.append(soup.select_one('ul[class="list-inline lot-breakdown-list"]>li').find_next('h5', text='Color').next_element.next_element)
    except:
        color.append("Nan")
    try:
        interior.append(soup.select_one('ul[class="list-inline lot-breakdown-list"]>li').find_next('h5', text='Interior').next_element.next_element)
    except:
        interior.append("Nan")

print(engine)
print(trans)
print(color)
print(interior)
Output:
['Nan', '383 CI']
['Nan', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
To load the lists into a DataFrame:
df = pd.DataFrame({"Engine": engine, "Trans": trans, "Color": color, "Interior": interior})
print(df)
Output:
Color Engine Interior Trans
0 Green Nan Black Nan
1 Curious Yellow 383 CI Black Automatic
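A variation on the same idea, in case it is useful: build one record (dict) per URL and store a real np.nan instead of the string "Nan", so the DataFrame can later use isna()/dropna(). This is only a sketch under the same assumptions about the page markup (a lot-breakdown-list <ul> where each <li> has an <h5> label followed by the value); the FIELDS tuple and the rows list are illustrative names, and getAndParseURL and urls_test are reused from above.

import numpy as np
import pandas as pd

FIELDS = ('Engine', 'Trans', 'Color', 'Interior')   # labels expected in the <h5> tags

rows = []
for url in urls_test:                                # urls_test / getAndParseURL as defined above
    soup = getAndParseURL(url)
    lot = soup.select_one('ul[class="list-inline lot-breakdown-list"]')
    record = {}
    for field in FIELDS:
        # the <h5> holds the label; the text node right after it holds the value
        h5 = lot.find('h5', text=field) if lot else None
        record[field] = h5.next_element.next_element if h5 else np.nan
    rows.append(record)

df = pd.DataFrame(rows, columns=FIELDS)              # one row per URL, missing fields stay NaN
print(df)

Storing np.nan keeps missing entries as genuine NaN values rather than the literal string "Nan", which matters later if you want to filter or count missing rows.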
Upvotes: 1