Divya Jose

Reputation: 389

How to scrape a web page into a dictionary compatible with a Pandas DataFrame?

from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

r = requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
soup = BeautifulSoup(r.content, "html.parser")

g_data = soup.find_all("div", {"class" : "tile-content"})
g_price = soup.find_all("div",{"class" : "item-price-container"})
g_star = soup.find_all("div",{"class" : "stars stars-small tile-row"})

data=defaultdict(list)
for product_title in g_data:
    a_product_title = product_title.find_all("a", "js-product-title")
    for text_product_title in a_product_title:
        data['Product Title'].append(text_product_title.text)

for row in g_price:
    price = row.find('span', 'price price-display').text.strip()
    data['Price'].append(price)

for allstar in g_star:
    star = allstar.find('span', 'visuallyhidden').text.strip()
    data['Stars'].append(star)

dd_starring = soup.find_all('dd', {"class" : "media-details-artist-dd module"})
for dd in dd_starring:
    actors = dd.text
    #data['Actors'].append(actors)

df = pd.DataFrame(data)
df  

[Screenshot: output of stars, without the line data['Stars'].append(star)]

[Screenshot: output of appending product title and price to df]


If I try to append the stars using the line data['Stars'].append(star), I get the following error:

ValueError: arrays must all be the same length

What should be done to append the stars so that rows without a star get NA instead?
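
For reference, here is a minimal sketch (not from the original post) of why pandas raises this error, and how building the frame from pd.Series objects pads the shorter column with NaN:

import pandas as pd

# Columns of unequal length reproduce the error:
data = {'Price': ['$9.99', '$4.99'], 'Stars': ['4.5']}
# pd.DataFrame(data)  # ValueError: arrays must all be the same length

# Wrapping each list in a Series aligns on the index and pads with NaN:
df = pd.DataFrame({key: pd.Series(value) for key, value in data.items()})
print(df)
#    Price Stars
# 0  $9.99   4.5
# 1  $4.99   NaN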

Any suggestions? Please help.

Upvotes: 1

Views: 1705

Answers (2)

unique_beast

Reputation: 1470

Your original issue was caused by your individual loops not producing the same number of elements (e.g. 15 stars vs. 20 prices). The best way to avoid this kind of issue is to use a single loop and wrap each item you're scraping in a try/except. That way, if any item is not consistently present, you can still collect whatever is there.

from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

r = requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
soup = BeautifulSoup(r.content, "html.parser")

g_data = soup.find_all("div", {"class" : "tile-content"})

data=defaultdict(list)

#One loop to rule them all
for tile in g_data:
    #each "tile" in g_data contains everything you are looking for...
    #find the product title
    try:
        title = tile.find("a", "js-product-title")
        data['Product Title'].append(title.text)
    except AttributeError:
        data['Product Title'].append("")

    #find the price
    try:
        price = tile.find('span', 'price price-display').text.strip()
        data['Price'].append(price)
    except AttributeError:
        data['Price'].append("")

    #find the stars
    try:
        g_star = tile.find("div", {"class": "stars stars-small tile-row"}).find('span', 'visuallyhidden').text.strip()
        data['Stars'].append(g_star)
    except AttributeError:
        data['Stars'].append("")

df = pd.DataFrame(data)
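
Note that appending an empty string leaves "" in those cells rather than a true missing value. If you want NA there, as the question asks, one option (an addition of mine, not part of the original answer) is to convert the empty strings afterwards:

import numpy as np
df = df.replace("", np.nan)  # empty strings become missing values (NaN)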

Upvotes: 0

JAB

Reputation: 12801

You don't need to build individual lists of content to loop through. You can just iterate through g_data, and this means you won't have different length result sets.

from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

r = requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
soup = BeautifulSoup(r.content, "html.parser")

g_data = soup.find_all("div", {"class" : "tile-content"})

data=defaultdict(list)
for content in g_data:
    title = content.find("a","js-product-title")
    data['Product Title'].append(title.text)

    try:
        stars = content.find("div", {"class": "stars stars-small tile-row"}).find('span', 'visuallyhidden').text.strip()
        data['Stars'].append(stars)
    except AttributeError:
        data['Stars'].append(None)

    price = content.find('span', 'price price-display').text.strip()
    data['Price'].append(price)

df = pd.DataFrame(data)
df  

As far as I could see, the inner loops were also not necessary, as each item only has one name, price, and rating.
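
A related point: because this answer appends None for missing ratings, pandas already treats those cells as missing, so the rows without a star can be located with df['Stars'].isnull().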

Upvotes: 2
