Reputation: 115
I am trying to scrape and store some items using BeautifulSoup and pandas. The code below only partially works. As you can see, it scrapes 'Engine426/425 HP', whereas I only want the string '426/425 HP' to be stored in the 'engine' column. I would like to scrape the value that follows each of the 4 h5 labels in the HTML below (please refer to the desired output below). I hope someone can help me out, thanks!
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
# Address of the page that is downloaded and parsed below.
main_url = "https://www.example.com/"
def getAndParseURL(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    result = requests.get(url, timeout=timeout)
    # Fail here on an error status instead of parsing an error page and
    # hitting a confusing "NoneType has no attribute" later on.
    result.raise_for_status()
    return BeautifulSoup(result.text, 'html.parser')
# Fetch the page and locate the <ul> holding the lot breakdown.
soup = getAndParseURL(main_url)
engine = []
breakdown_list = soup.find("ul", class_=re.compile('list-inline lot-breakdown-list'))
# .li is the first <li> inside the list; .text joins all of its text,
# so the <h5> label text is included along with the value.
first_item_text = breakdown_list.li.text
engine.append(first_item_text)
scraped_data = pd.DataFrame({'engine': engine})
scraped_data.head()
engine
0 Engine426/425 HP
HTML
<div class="lot-breakdown">
<ul class="list-inline lot-breakdown-list">
<li>
<h5>Engine</h5>426/425 HP</li>
<li>
<h5>Trans</h5>Automatic</li>
<li>
<h5>Color</h5>Alpine White</li>
<li>
<h5>Interior</h5>Black</li>
</ul>
</div>
Desired output
# Illustration of the wanted frame: one row with one column per label.
wanted_columns = ['engine', 'trans', 'color', 'interior']
wanted_row = ['426/425 HP', 'Automatic', 'Alpine White', 'Black']
scraped_data[wanted_columns] = pd.DataFrame([wanted_row], index=scraped_data.index)
scraped_data
engine trans color interior
0 426/425 HP Automatic Alpine White Black
Upvotes: 1
Views: 310
Reputation: 1710
You can achieve that in many ways:
import pandas as pd
import requests
from bs4 import BeautifulSoup, NavigableString
# Address of the page that is downloaded and parsed below.
main_url = "https://www.example.com/"
def getAndParseURL(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    result = requests.get(url, timeout=timeout)
    # Fail here on an error status instead of parsing an error page and
    # hitting a confusing "NoneType has no attribute" later on.
    result.raise_for_status()
    return BeautifulSoup(result.text, 'html.parser')
soup = getAndParseURL(main_url)

# Every <li> in the breakdown list pairs an <h5> label with a bare text value,
# e.g. <li><h5>Engine</h5>426/425 HP</li>.  Other ways to read just the value:
#   li.find(string=True, recursive=False)  # first direct text node (older bs4: text=True)
#   li.contents[1]                         # positional; shifts if a whitespace node precedes <h5>
items = soup.select('ul[class="list-inline lot-breakdown-list"] li')

row = {}
for li in items:
    if li.h5 is None:
        continue  # no label to name the column after
    # Keep only the <li>'s own text nodes so the <h5> label text is left out.
    # Exact type check on purpose: Comment and CData subclass NavigableString.
    own_text = [
        node.strip() for node in li.contents
        if type(node) is NavigableString and node.strip()
    ]
    # The column name comes from the label itself: 'Engine' -> 'engine',
    # 'Trans' -> 'trans', and so on.
    row[li.h5.get_text(strip=True).lower()] = ' '.join(own_text)

# One page yields one row whose columns follow the labels found on the page.
scraped_data = pd.DataFrame([row])
scraped_data
Upvotes: 3