Reputation: 306
The code below gives me the required product data as a table. It works fine for most links, however in some it stops midway giving an error NoneType
object has no attribute find_all
in table.find_all('tr)
.
I believe this is because table does not exist in some products, so I tried creating an if condition on the existence of the table, but that also doesn't seem to help. What changes should I make in the code below?
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
data = []
for i in range(1,3):
print(i)
res = requests.get(url + str(i), headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
p_links = [i["data-link"] for i in soup.find("ul", {"id":"products-list"}).find_all("li",class_=["openlink","item"])]
for prod_url in p_links:
print(prod_url)
temp = {"Product URL": prod_url}
prod_res = requests.get(prod_url, headers = headers)
prod_soup = BeautifulSoup(prod_res.text, "html.parser")
for p in prod_soup.find("div", class_="basic-information").find_all("p"):
if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
table = prod_soup.find("table",{"class":"specifications"})
for tr in table.find_all("tr"):
temp[tr.find("td", {"class":"tdLabel"}).text.strip()] = tr.find("td", {"class":"tdValue"}).text.strip()
data.append(temp)
pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
Upvotes: 0
Views: 45
Reputation: 333
you can use this:
tables = soup.select('table', attrs={"class":"specifications"})
rows = tables.findChildren(['tr'])
Upvotes: 1
Reputation: 17368
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
data = []
for i in range(1,3):
print(i)
res = requests.get(url + str(i), headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
p_links = [i["data-link"] for i in soup.find("ul", {"id":"products-list"}).find_all("li",class_=["openlink","item"])]
for prod_url in p_links:
print(prod_url)
temp = {"Product URL": prod_url}
prod_res = requests.get(prod_url, headers = headers)
prod_soup = BeautifulSoup(prod_res.text, "html.parser")
try:
for p in prod_soup.find("div", class_="basic-information").find_all("p"):
if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
table = prod_soup.find("table",{"class":"specifications"})
for tr in table.find_all("tr"):
temp[tr.find("td", {"class":"tdLabel"}).text.strip()] = tr.find("td", {"class":"tdValue"}).text.strip()
except:
print("Failed for URL {}".format(prod_url))
data.append(temp)
time.sleep(2)
pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
Put a try/except
not only to extract product specification but also to extract item/brand/sku. But in the except put a print statement to know which all urls failed so that you can try them again
Upvotes: 0
Reputation: 1769
You can use Try
and Except
(documentation):
try:
for tr in table.find_all("tr"):
temp[tr.find("td", {"class":"tdLabel"}).text.strip()] = tr.find("td", {"class":"tdValue"}).text.strip()
except:
pass
Upvotes: 2