Reputation: 35
This is the HTML from which I need to extract the title and alt attribute values of the img tag.
<div id="BVCustomerRatings" class="BVBrowserFF">
<div class="BVRRRootElement">
<div class="BVRRRatingSummary BVRRPrimaryRatingSummary">
<div class="BVRRRatingSummaryStyle2">
<div class="BVRRRatingSummaryHeader"></div>
<div class="BVRROverallRatingContainer">
<div class="BVRRRatingContainerStar">
<div class="BVRRRatingEntry BVRROdd">
<div id="BVRRRatingOverall_" class="BVRRRating BVRRRatingNormal BVRRRatingOverall">
<div class="BVRRLabel BVRRRatingNormalLabel"></div>
<div class="BVRRRatingNormalImage">
<img class="BVImgOrSprite" width="75" height="15" **title="3.4 out of 5" alt="3.4 out of 5"** src="http://walmart.ugc.bazaarvoice.com/1336/3_4/5/rating.png"></img>
</div>
<div class="BVRRRatingNormalOutOf"></div>
</div>
</div>
</div>
</div>
This is my code:
from bs4 import BeautifulSoup, Tag
import urllib2
import re
def complete_url(items_url):
    """Fetch one product page and print its rating image's title and alt text.

    items_url is a site-relative path; it is appended to
    "http://www.walmart.com" to form the page URL.

    Prints the ``title`` and ``alt`` attribute values of the first <img>
    whose ``src`` contains "bazaarvoice", or a notice when the fetched HTML
    has no such image.  Returns None.
    """
    items = "http://www.walmart.com" + items_url
    main_source = urllib2.urlopen(items).read()
    soup = BeautifulSoup(main_source)
    # Other fields, currently disabled:
    #Title=soup.find('h1',{"class":"productTitle"}).text.strip()
    #Price=soup.find('span',{"class":"bigPriceText1"}).text.strip()+soup.find('span',{"class":"smallPriceText1"}).text.strip()
    #Availability=soup.find('span',{"id":"STORE_AVAIL"}).text.strip()
    #Description=soup.find('span',{"class":"ql-details-short-desc"}).text.strip()

    # A non-dict second positional argument to find() is matched against the
    # CSS class.  The rating image's class is "BVImgOrSprite"; only its src
    # URL contains "bazaarvoice", so the filter has to name src explicitly.
    image = soup.find('img', src=re.compile("bazaarvoice"))
    if image is None:
        # NOTE(review): the rating markup may be added by client-side script
        # and therefore be missing from the raw HTML -- confirm.
        print("No bazaarvoice rating image found for %s" % items)
        return
    # Tag.get() yields the attribute value, or None when it is absent.
    print('title:%s,alt:%s' % (image.get('title'), image.get('alt')))
    #print 'Title:%s,Price:%s,Availability:%s,Description:%s,Avg_Rating:%s' %(Title,Price,Availability,Description,Avg_Rating)
def url_soup(url):
    """Walk search-result pages starting at *url* and process every item.

    On each page, the href of every ``a.ListItemLink`` anchor is passed to
    complete_url().  The "Next" link is then followed, until a page either
    contains an ``a`` with class "SPPagNoLink jump next" or has no "Next"
    link; at that point "<<<<Last Page Reached>>>>" is printed and the
    function returns None.
    """
    # Iterate instead of recursing once per page, so a long result set
    # cannot exhaust the interpreter's recursion limit.
    while True:
        source = urllib2.urlopen(url).read()
        soup = BeautifulSoup(source)

        for anchor in soup.select('a.ListItemLink'):
            complete_url(anchor['href'])

        # Pagination is a property of the page, not of each item: look it up
        # once, after the items, so it is defined even when a page lists none.
        next_link = soup.find('a', href=True, text=re.compile("Next"))
        last_page_marker = soup.find('a', class_="SPPagNoLink jump next")

        # Without a "Next" anchor there is nothing to follow; stop rather
        # than fail on subscripting None.
        if last_page_marker is not None or next_link is None:
            print("<<<<Last Page Reached>>>>")
            return

        # Remove any whitespace inside the href before appending it.
        url = ('http://www.walmart.com/search/search-ng.do'
               + re.sub(r'\s', '', next_link['href']))
# Department name -> value sent as the search_constraint query parameter.
Dept = {
    "All Departments": "0",
    "Apparel": "5438",
    "Auto": "91083",
    "Baby": "5427",
    "Beauty": "1085666",
    "Books": "3920",
    "Electronics": "3944",
    "Gifts": "1094765",
    "Grocery": "976759",
    "Health": "976760",
    "Home": "4044",
    "Home Improvement": "1072864",
    "Jewelry": "3891",
    # Original misspelled key, kept so existing callers keep working.
    "Jwelery": "3891",
    "Movies": "4096",
    "Music": "4104",
    "Party": "2637",
    "Patio": "5428",
    "Pets": "5440",
    "Pharmacy": "5431",
    "Photo Center": "5426",
    "Sports": "4125",
    "Toys": "4171",
    "Video Games": "2636",
}
def gen_url(keyword, domain):
    """Build the search URL for *keyword* within *domain* and crawl it.

    keyword -- free-text search term; it is URL-encoded before being placed
               in the query string, so spaces and reserved characters are safe.
    domain  -- a key of Dept; its value is sent as search_constraint.

    Prints the generated URL and passes it to url_soup().  When *domain* is
    not a key of Dept, prints a notice and does nothing else.  Returns None.
    """
    # Python 2 location of quote_plus, matching the file's use of urllib2.
    from urllib import quote_plus

    dept_id = Dept.get(domain)
    if dept_id is None:
        # Previously an unknown name was skipped without any output.
        print("Unknown department: %s" % domain)
        return

    main_url = ('http://www.walmart.com/search/search-ng.do?search_query=%s'
                '&ic=16_0&Find=Find&search_constraint=%s'
                ) % (quote_plus(keyword), dept_id)
    print(main_url)
    url_soup(main_url)
# Script entry: search every department for 'Laptop'.
gen_url(keyword='Laptop', domain='All Departments')
Upvotes: 1
Views: 4981
Reputation: 12092
It is pretty straightforward: use get()
to read the value of an attribute of a tag:
In [1]: from bs4 import BeautifulSoup
In [2]: html = '''
...: <div id="BVRRRatingOverall_" class="BVRRRating BVRRRatingNormal BVRRRatingOverall">
...: <div class="BVRRLabel BVRRRatingNormalLabel"></div>
...: <div class="BVRRRatingNormalImage">
...: <img class="BVImgOrSprite" width="75" height="15" title="3.4 out of 5" alt="3.4 out of 5" src="http://walmart.ugc.bazaarvoice.com/1336/3_4/5/rating.png"></img>
...: </div>
...: <div class="BVRRRatingNormalOutOf"></div>
...: </div>
...: '''
In [3]: soup = BeautifulSoup(html)
In [4]: soup.find('img').get('title')
Out[4]: '3.4 out of 5'
In [5]: soup.find('img').get('alt')
Out[5]: '3.4 out of 5'
Upvotes: 5