Reputation: 389
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
r= requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
r.content
soup = BeautifulSoup(r.content)
g_data = soup.find_all("div", {"class" : "tile-content"})
g_price = soup.find_all("div",{"class" : "item-price-container"})
g_star = soup.find_all("div",{"class" : "stars stars-small tile-row"})
data=defaultdict(list)
for product_title in g_data:
a_product_title = product_title.find_all("a","js-product-title")
for text_product_title in a_product_title :
data['Product Title'].append(textroduct_title.text)
for row in g_price:
price = row.find('span', 'price price-display').text.strip()
data['Price'].append(price)
for allstar in g_star:
star=allstar.find('span','visuallyhidden').text.stp()
data['Stars'].append(star)
dd_starring = soup.find_all('dd', {"class" : "media-details-artist-dd module"})
for dd in dd_starring :
actors = dd.text
#data['Actors'].append(actors)
df = pd.DataFrame(data)
df
if I add to try to append it using the line data['Stars'].append(star)
- I get the following error--
ValueError: arrays must all be the same length
What should be done to append it and the rows that don't have a star should have NA in it.
Any suggestions? please help
Upvotes: 1
Views: 1705
Reputation: 1470
Your original issue was caused because your individual loops did not contain the same amount of each element you were looping through (i.e. - 15 stars, v. 20 prices). The best way to avoid this type of issue is to firstly have one loop and then to apply a try & except value to each item you're scaping. That way if there are any issues with the constant presence of the items you want, you can still collect what is present.
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
r= requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
r.content
soup = BeautifulSoup(r.content)
g_data = soup.find_all("div", {"class" : "tile-content"})
data=defaultdict(list)
#One loop to rule them all
for tile in g_data:
#the "tile" value in g_data contains what you are looking for...
#find the product titles
try:
title = tile.find("a","js-product-title")
data['Product Title'].append(title.text)
except:
data['Product Title'].append("")
#find the prices
try:
price = tile.find('span', 'price price-display').text.strip()
data['Price'].append(price)
except:
data['Price'].append("")
#find the stars
try:
g_star = tile.find("div",{"class" : "stars stars-small tile-row"}).find('span','visuallyhidden').text.strip()
data['Stars'].append(g_star)
except:
data['Stars'].append("")
df = pd.DataFrame(data)
Upvotes: 0
Reputation: 12801
You don't need to build individual lists of content to loop through. You can just iterate through g_data, and this means you won't have different length result sets.
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
r= requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
r.content
soup = BeautifulSoup(r.content)
g_data = soup.find_all("div", {"class" : "tile-content"})
data=defaultdict(list)
for content in g_data:
title = content.find("a","js-product-title")
data['Product Title'].append(title.text)
try:
stars =content.find("div",{"class" : "stars stars-small tile-row"}).find('span','visuallyhidden').text.strip()
data['Stars'].append(stars)
except:
data['Stars'].append(None)
price = content.find('span', 'price price-display').text.strip()
data['Price'].append(price)
#data['Actors'].append(actors)
df = pd.DataFrame(data)
df
As far as I could see the inner loops were also not necessary as each item only has one name, price and rating.
Upvotes: 2