Reputation: 117
I am trying to use urllib to grab an HTML page, then use BeautifulSoup to extract data from it. I want to get all the numbers from comments_42.html, print the sum of them, and then display how many numbers there are. Here is my code; I am trying to use regex, but it doesn't work for me.
import urllib
from bs4 import BeautifulSoup
url = 'http://python-data.dr-chuck.net/comments_42.html'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html,"html.parser")
tags = soup('span')
for tag in tags:
print tag
Upvotes: 4
Views: 27153
Reputation: 662
import urllib.request
import re
from bs4 import BeautifulSoup
# Ask for the page to fetch and for the tag (plus an optional attribute/value
# pair) whose numbers should be summed.
url = input('Enter: ')
tag = input("input the html tag to search: ")
parameter = input("Enter the html parameter of the tag for better selection (optional): ")
p_value = input("Enter the parameter value (optional): ")

html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Filter on the attribute only when both its name and its value were given;
# otherwise match every occurrence of the tag.
if parameter and p_value:
    numbers = soup(tag, {parameter: p_value})
else:
    numbers = soup(tag)

sumation = 0
for number in numbers:
    # Search only the tag's text. str(number) also contains the markup, so
    # digits in the tag name (e.g. "h2") or in attribute values would be
    # added to the total.
    for item in re.findall(r'[0-9]+', number.get_text()):
        sumation += int(item)
print(sumation)
Tag takes the html tag to search as input.
Parameter takes html parameters like class, id etc.
p_value takes the class name or the id name as input.
Upvotes: 0
Reputation: 13
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re
total = 0

# Ignore SSL certificate errors.
# NOTE(review): this turns off hostname and certificate verification, so the
# downloaded page is not authenticated; acceptable only for throwaway scripts.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")

# Add the first number found in the text of each span.
tags = soup('span')
for tag in tags:
    # The character class must be [0-9]; the original [0-9+] also matched
    # literal '+' characters, which int() cannot always convert.
    lst = re.findall(r'[0-9]+', tag.get_text())
    # Skip spans without any digits instead of failing on lst[0].
    if lst:
        total += int(lst[0])
print(total)
Upvotes: 0
Reputation: 1
import urllib.request,urllib.parse,urllib.error
import re
from bs4 import BeautifulSoup
# Fetch the page, then add up every number found in its span tags.
url = input('Enter - ')
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, "html.parser")

tags = soup('span')
total = 0
for tag in tags:
    # re.findall needs a string, not a Tag object, so search the tag's text.
    for i in re.findall(r"[0-9]+", tag.get_text()):
        # Add the converted integer; adding the matched string itself to an
        # int raises TypeError.
        total += int(i)
print(total)
Upvotes: -1
Reputation: 1
I did this on Coursera and it gave me all the right answers. Hope it helps ;)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
# NOTE(review): this turns off hostname and certificate verification, so the
# downloaded page is not authenticated.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")

# Retrieve all of the span tags
tags = soup('span')
total = 0
count = 0
for tag in tags:
    # Convert the first child of the span to a number.
    # Assumes every span holds a single numeric text node -- TODO confirm.
    num = float(tag.contents[0])
    total = total + num
    count = count + 1
print('count:', count)
print('sum:', total)
Upvotes: 0
Reputation: 1
Do it the basic way…
# Retrieve all of the anchor tags
tags = soup('span')
sum = 0
count = 0
for tag in tags:
# Look at the parts of a tag
#print tag.contents[0]
num = float(tag.contents[0])
#print num
sum = sum + num
count = count + 1
print 'count:',count
print 'sum:',sum
Upvotes: 0
Reputation: 157
I am taking the same Coursera course as you are. Instead of going for the above solutions, would you mind trying this one? I feel it is within the scope of what we had learnt up to the above-mentioned problem. It absolutely worked for me.
import urllib
import re
from bs4 import *
url = 'http://python-data.dr-chuck.net/comments_216543.html'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html,"html.parser")
sum=0
# Retrieve all of the anchor tags
tags = soup('span')
for tag in tags:
# Look at the parts of a tag
y=str(tag)
x= re.findall("[0-9]+",y)
for i in x:
i=int(i)
sum=sum+i
print sum
Upvotes: 2
Reputation: 71
Don't forget that you have to import regular expressions in order to use them in the code.
import re
Upvotes: 0
Reputation: 13570
@Learner's solution is completely right! But if you want to do more with the names and comments, you can do this, which returns a list of names and comments:
from BeautifulSoup import BeautifulSoup
import re
import urllib
# Download the comments page (Python 2 urllib) and parse it.
url = 'http://python-data.dr-chuck.net/comments_42.html'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# NOTE(review): [0-9]{0,4} also matches zero digits, so the pattern accepts any
# text; the result shown below contains names as well as numbers.
# Presumably the tag name and class filter have no effect once text= is
# given -- confirm against the BeautifulSoup 3 documentation.
# `all` shadows the built-in of the same name.
all = soup.findAll('span',{'class':'comments'},text=re.compile(r'[0-9]{0,4}')) #regex filter on the text; does not restrict the result to numbers
# Drop bare newline strings, then skip the first four remaining entries
# (presumably page header text -- confirm against the page).
cleaned = filter(lambda x: x!=u'\n',all)[4:]
In [18]: cleaned
Out[18]:
[u'Leven',
u'100',
u'Mahdiya',
u'97',
u'Ajayraj',
u'87',
u'Lillian',
u'86',
u'Aon',
u'86',
u'Ruaraidh',
u'78',
u'Gursees',
u'75',
u'Emmanuel',
u'74',
u'Christy',
u'72',
u'Annoushka',
u'72',
u'Inara',
u'72',
u'Caite',
u'70',
u'Rosangel',
u'70',
u'Iana',
u'66',
u'Anise',
u'66',
u'Jaosha',
u'65',
u'Cadyn',
u'65',
u'Edward',
u'63',
u'Charlotte',
u'61',
u'Sammy',
u'60',
u'Zarran',
u'60',.....] #
Upvotes: 0
Reputation: 639
Use findAll() method of BeautifulSoup to extract all span tags with class 'comments', since they contain the information you need. You can then perform any operation on them depending on your requirements.
# Parse the downloaded page, select every <span class="comments"> element
# and keep the text of each one.
soup = BeautifulSoup(html, "html.parser")
data = soup.findAll("span", attrs={"class": "comments"})
numbers = []
for span in data:
    numbers.append(span.text)
Here is the output:
[u'100', u'97', u'87', u'86', u'86', u'78', u'75', u'74', u'72', u'72', u'72', u'70', u'70', u'66', u'66', u'65', u'65', u'63', u'61', u'60', u'60', u'59', u'59', u'57', u'56', u'54', u'52', u'52', u'51', u'47', u'47', u'41', u'41', u'41', u'38', u'35', u'32', u'31', u'24', u'19', u'19', u'18', u'17', u'16', u'13', u'8', u'7', u'1', u'1', u'1']
Upvotes: 8