Michael Nikitin
Michael Nikitin

Reputation: 48

Turn link to an image in Pandas Dataframe

I'm trying to parse a simple page and get a file with a similar table, but neither Google Docs, nor Excel, nor VS CSV Editor recognizes image links as images. I've already tried formatting the links as <img src='URL'> and as .jpg/.webp links, but it had no effect. So how do I parse the images without saving them as files?

from bs4 import BeautifulSoup
import requests
import pandas 


# Scrape the item table from thecycledb.com into a DataFrame and save it as CSV.
url = 'https://www.thecycledb.com/items'
header = ['LOGO', 'NAME', 'K-MARKS VALUE', 'VALUE PER WEIGHT', 'FACTION XP', 'XP PER POUND', 'WEIGHT']

# Download the page and parse it with the lxml parser.
soup = BeautifulSoup(requests.get(url).text, 'lxml')

items = []

# Every <tr> carrying this exact class string is treated as one item row.
for item in soup.find_all('tr', class_='hover cursor-pointer group hover:active'):
    item_name = item.find('th').text
    # Logo URL: the second <img> in the whole document whose alt text equals
    # the item name. (Named `row`, not `list`, so the builtin is not shadowed.)
    row = [soup.find_all('img', alt=item_name)[1]['src']]
    row.append(item_name)
    # Only the first five <td> cells are kept; they fill the remaining
    # five columns of `header`.
    row.extend(cell.text for cell in item.find_all('td')[0:5])
    items.append(row)

table = pandas.DataFrame(items, columns=header)
# NOTE: columns_titles is currently identical to `header`, so this reindex
# leaves the column order unchanged; edit the list below to reorder columns.
columns_titles = ['LOGO', 'NAME', 'K-MARKS VALUE', 'VALUE PER WEIGHT', 'FACTION XP', 'XP PER POUND', 'WEIGHT']
table = table.reindex(columns=columns_titles)

table.to_csv('C:/Users/Arnaud_Roy_is_a_great_compositor/Desktop/VsCodeCode/test.csv', encoding='utf8')

Upvotes: 1

Views: 1096

Answers (1)

Ray
Ray

Reputation: 471

Is this what you're looking for? I've added comments to my code, but the key here was to look at the overall structure of the site. Because the tables aren't "joined" as we would normally expect, we have to gather the fields in parts.

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape the item table from thecycledb.com column by column, assemble a
# DataFrame, print it and save it as CSV.
url = "https://www.thecycledb.com/items"
soup = BeautifulSoup(requests.get(url).text, "lxml")

# Split header into full list and list of item values only
header_all = ["LOGO", "NAME", "K-MARKS VALUE", "VALUE PER WEIGHT", "FACTION XP", "XP PER POUND", "WEIGHT"]
header = ["K-MARKS VALUE", "VALUE PER WEIGHT", "FACTION XP", "XP PER POUND", "WEIGHT"]

# Get table from page
# (kept under its own name so `table` only ever refers to the final DataFrame)
html_table = soup.find("table")
table_body = html_table.find("tbody")

# Get item values
rows = table_body.find_all("tr")
row_collector: list = []
for row in rows:
    cols = [cell.text for cell in row.find_all("td")]
    # Drop empty cells and strip the trailing link caption from the rest.
    cols = [text.removesuffix("MORE ITEM STATS AND INFO") for text in cols if text]
    # Discard the last remaining cell.
    # NOTE(review): presumably it is not one of the five value columns - confirm.
    cols.pop()
    row_collector.append(cols)
# Turn the list of rows into a list of columns, then label each column.
transposed = list(map(list, zip(*row_collector)))
data = dict(zip(header, transposed))

# Get NAME
names = soup.find_all("h2")
data["NAME"] = [name.text for name in names]

# Get LOGO
known_names = set(data["NAME"])  # set: constant-time membership test per <img>
collector: list = []
for item in soup.find_all("tr", class_="hover cursor-pointer group hover:active"):
    for img in item.find_all("img"):
        src = img.get("src")
        # Skip images without a src instead of failing on None.startswith().
        if src and img.get("alt") in known_names and src.startswith("https"):
            collector.append(src)
# Keep every second collected URL, starting with the second one.
data["LOGO"] = collector[1::2]

# Make frame
table = pd.DataFrame(data)
table = table[header_all]  # put the columns in display order
print(table)
table.to_csv("C:/Users/Arnaud_Roy_is_a_great_compositor/Desktop/VsCodeCode/test.csv", encoding="utf8")

Render images

def render_imgs(path: str = None) -> str:
    """Return an HTML ``<img>`` tag, 60 pixels wide, that displays *path*.

    Args:
        path: URL of the image. It is HTML-escaped before being placed in
            the ``src`` attribute, so quotes or ``&`` in the URL cannot
            break out of the attribute.

    Returns:
        The ``<img>`` tag as a string, or an empty string when *path* is
        ``None`` (avoids emitting a broken ``src="None"`` image).
    """
    import html  # local import keeps this snippet self-contained

    if path is None:
        return ""
    safe_src = html.escape(str(path), quote=True)
    return f"""<img src="{safe_src}" width="60" >"""


# Write the frame to table.html. escape=False leaves the generated <img> tags
# as raw HTML; only the LOGO column is passed through render_imgs.
table.to_html(
    "table.html",
    escape=False,
    formatters={"LOGO": render_imgs},
)

enter image description here

Upvotes: 1

Related Questions