Reputation: 48
I'm trying to parse a simple page and produce a file with a similar table, but neither Google Docs, nor Excel, nor the VS Code CSV editor recognizes the image links as images. I've already tried formatting the links as <img src='URL'> tags and as bare .jpg/.webp URLs, but it had no effect.
So how do I parse the images without saving them as files?
from bs4 import BeautifulSoup
import requests
import pandas
url = 'https://www.thecycledb.com/items'
header = ['LOGO','NAME','K-MARKS VALUE','VALUE PER WEIGHT','FACTION XP','XP PER POUND','WEIGHT']
soup = BeautifulSoup(requests.get(url).text,'lxml')
items = []
for item in soup.find_all('tr', class_='hover cursor-pointer group hover:active'):
    item_name = item.find('th').text
    row = [soup.find_all('img', alt=item_name)[1]['src']]
    row.append(item_name)
    for element in item.find_all('td')[0:5]:
        row.append(element.text)
    items.append(row)
table = pandas.DataFrame(items,columns=header)
columns_titles = ['LOGO','NAME','K-MARKS VALUE','VALUE PER WEIGHT','FACTION XP','XP PER POUND','WEIGHT']
table=table.reindex(columns=columns_titles) #swaps the LOGO and NAME columns because I want to
table.to_csv('C:/Users/Arnaud_Roy_is_a_great_compositor/Desktop/VsCodeCode/test.csv',encoding='utf8')
Upvotes: 1
Views: 1096
Reputation: 471
Is this what you're looking for? I've commented my code, but the key here was to look at the overall structure of the site. Because the tables aren't "joined" as we would normally expect, we have to gather the fields in parts.
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = "https://www.thecycledb.com/items"
soup = BeautifulSoup(requests.get(url).text, "lxml")
# Split header into full list and list of item values only
header_all = ["LOGO", "NAME", "K-MARKS VALUE", "VALUE PER WEIGHT", "FACTION XP", "XP PER POUND", "WEIGHT"]
header = ["K-MARKS VALUE", "VALUE PER WEIGHT", "FACTION XP", "XP PER POUND", "WEIGHT"]
# Get table from page
table = soup.find("table")
table_body = table.find("tbody")
# Get item values
rows = table_body.find_all("tr")
row_collector: list = []
for row in rows:
    cols = row.find_all("td")
    cols = [ele.text for ele in cols]
    cols = [ele.removesuffix("MORE ITEM STATS AND INFO") for ele in cols if ele]
    cols.pop()
    row_collector.append(cols)
# Transpose the rows into columns and pair each column with its header name
transposed = list(map(list, zip(*row_collector)))
data = dict(zip(header, transposed))
# Get NAME
names = soup.find_all("h2")
data["NAME"] = [name.text for name in names]
# Get LOGO
collector: list = []
for item in soup.find_all("tr", class_="hover cursor-pointer group hover:active"):
collector.extend(img.get("src") for img in item.findAll("img") if
img.get("alt") in data["NAME"] and img.get("src").startswith("https"))
data["LOGO"] = collector[1::2]
# Make frame
table = pd.DataFrame(data)
table = table[header_all]
print(table)
table.to_csv("C:/Users/Arnaud_Roy_is_a_great_compositor/Desktop/VsCodeCode/test.csv", encoding="utf8")
def render_imgs(path: str = None) -> str:
    """Format the image URLs in <img> tags"""
    return f"""<img src="{path}" width="60" >"""

table.to_html("table.html", escape=False, formatters=dict(LOGO=render_imgs))
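If the goal is to see the images inside Google Sheets rather than in a browser, keep in mind that a CSV cell can only hold text, so the URL has to be wrapped in something the spreadsheet itself will render. Google Sheets has an IMAGE() formula and will evaluate it when the CSV is imported with formula conversion enabled (recent Excel builds have a similar IMAGE() function, if that's the target). A minimal sketch along the same lines, reusing the frame built above; the helper name and output filename are just examples:
def sheets_image_formula(path: str) -> str:
    """Wrap an image URL in a Google Sheets IMAGE() formula."""
    return f'=IMAGE("{path}")'

sheets_table = table.copy()
sheets_table["LOGO"] = sheets_table["LOGO"].map(sheets_image_formula)
sheets_table.to_csv("table_for_sheets.csv", index=False, encoding="utf8")
The CSV itself stays plain text either way; the rendering only happens once the file is opened in the spreadsheet.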
Upvotes: 1