Reputation: 214
I'm trying to select an element that looks like <li class="result first_result">
using the query soup.find_all("li", {"class" : "first_result"})
The element is definitely on the page but it's not showing up when I run my script. I've also tried soup.find_all("li", {"class" : "result first_result"})
for the record, but still nothing.
What am I doing wrong?
edit: at alecxe's request I've posted the code I have so far. I'm on 64-bit Windows 7 using Python 3.4 which I'm sure is the culprit. The specific part I made this question for is at the very bottom under ###METACRITIC STUFF###
from bs4 import BeautifulSoup
from urllib3 import poolmanager
import csv
import requests
import sys
import os
import codecs
import re
import html5lib
import math
import time
from random import randint
connectBuilder = poolmanager.PoolManager()
inputstring = sys.argv[1] #argv string MUST use double quotes
inputarray = re.split('\s+',inputstring)
##########################KAT STUFF########################
katstring = ""
for item in inputarray: katstring += (item + "+")
katstring=katstring[:-1]
#kataddress = "https://kat.cr/usearch/?q=" + katstring #ALL kat
kataddress = "https://kat.cr/usearch/" + inputstring + " category:tv/?field=seeders&sorder=desc" #JUST TV kat
#print(kataddress)
numSeedsArray = []
numLeechArray = []
r = requests.get(kataddress)
soup = BeautifulSoup(r.content, "html5lib")
totalpages = [h2.find('span') for h2 in soup.findAll('h2')][0].text #get a string that looks like 'house of cards results 1-25 from 178'
totalpages = int(totalpages[-4:]) #slice off everything but the total # of pages
totalpages = math.floor(totalpages/25)
#print("totalpages= "+str(totalpages))
iteration=0
savedpage = ""
def getdata(url):
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib")
global numSeedsArray
global numLeechArray
tds = soup.findAll("td", { "class" : "green center" })
numSeedsArray += [int(td.text) for td in tds]
tds = soup.findAll("td", { "class" : "red lasttd center"})
numLeechArray += [int(td.text) for td in tds]
#print(numSeedsArray)
def getnextpage(url):
global iteration
global savedpage
#print("url examined= "+url)
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib")
nextpagelinks = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton" })
nextpagelinks = [link.get('href') for link in nextpagelinks]
#print(nextpagelinks)
activepage = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton active" })
#print("activepage= " +activepage[0].text)
currentpagenum = activepage[0].text
#print("currentpagenum= "+currentpagenum)
if len(currentpagenum)==1 and iteration>1:
nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
#print("nextpage= "+nextpage)
nextpage = re.sub(r'(%20)', ' ', nextpage)
nextpage = re.sub(r'(%3A)', ':', nextpage)
nextpage = "https://kat.cr"+nextpage
#print(nextpage)
elif len(currentpagenum)==1 and iteration<=1:
nextpage = str(nextpagelinks[0][:-28])+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
savedpage = str(nextpagelinks[0][:-28])
#print("savedpage= "+savedpage )
nextpage = re.sub(r'(%20)', ' ', nextpage)
nextpage = re.sub(r'(%3A)', ':', nextpage)
nextpage = "https://kat.cr"+nextpage
#print(nextpage)
elif len(currentpagenum)==2:
nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
#print("nextpage= "+nextpage)
nextpage = re.sub(r'(%20)', ' ', nextpage)
nextpage = re.sub(r'(%3A)', ':', nextpage)
nextpage = "https://kat.cr"+nextpage
#print(nextpage)
return nextpage
if totalpages<2:
while iteration < totalpages-1: #should be totalpages-1 for max accuracy
getdata(kataddress)
iteration+=1
kataddress = getnextpage(kataddress)
else:
while iteration < 2: #should be totalpages-1 for max accuracy
getdata(kataddress)
iteration+=1
kataddress = getnextpage(kataddress)
# print(str(sum(numSeedsArray)))
# print(str(sum(numLeechArray)))
print(str(sum(numLeechArray)+sum(numSeedsArray)))
def getgoogdata(title):
title = re.sub(r' ', '+', title)
url = 'https://www.google.com/search?q=' +title+ '&ie=utf-8&oe=utf-8'
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib")
resultnum = soup.find("div", {"id": "resultStats"}).text[:-14]
s2 = resultnum.replace(',', '')
resultnum = re.findall(r'\b\d+\b', s2)
print(resultnum)
getgoogdata(inputstring)
####################METACRITIC STUFF#########################
metainputstring = ""
for item in inputarray:
metainputstring += item + " "
metainputstring = metainputstring[:-1]
metacriticaddress = "http://www.metacritic.com/search/tv/" + metainputstring + "/results"
print (metacriticaddress)
r = requests.get(metacriticaddress)
soup = BeautifulSoup(r.content, "html5lib")
first_result = soup.find_all("li", attrs={"class" : "first_result"})
# first_result = soup.select("li.result.first_result")
print(first_result)
Upvotes: 1
Views: 1786
Reputation: 473753
All other answers are not related to your actual problem.
You need to pretend to be a real browser to be able to see the search results:
r = requests.get(metacriticaddress, headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})
Proof (searching for Game of Thrones, of course):
>>> from bs4 import BeautifulSoup
>>>
>>> import requests
>>>
>>> metacriticaddress = "http://www.metacritic.com/search/tv/game%20of%20thrones/results"
>>> r = requests.get(metacriticaddress, headers={
... "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
... })
>>> soup = BeautifulSoup(r.content, "html5lib")
>>> first_result = soup.find_all("li", class_="first_result")
>>>
>>> print(first_result[0].find("h3", class_="product_title").get_text(strip=True))
Game of Thrones
Upvotes: 2
Reputation: 10090
Your first attempt (soup.find_all("li", {"class" : "first_result"})
) is almost correct, but you need to specify the parameter that your dictionary is being passed to (in this case the parameter name is attrs
), and call it like soup.find_all("li", attrs={"class" : "first_result"})
.
However, I would suggest doing this with a CSS selector because you're matching against multiple classes. You can do this using the .select()
method of the soup like this
results = soup.select("li.result.first_result")
Be aware that .select()
will always return a list, so if there's only one element, don't forget to access it as results[0]
.
Upvotes: 1
Reputation: 57450
It’s very useful to search for a tag that has a certain CSS class, but the name of the CSS attribute, “class”, is a reserved word in Python. Using
class
as a keyword argument will give you a syntax error. As of Beautiful Soup 4.1.2, you can search by CSS class using the keyword argumentclass_
Thus, you need to instead write: soup.find_all("li", class_="first_result")
.
If you're using a pre-4.1.2 version of BeautifulSoup, or if you're insistent upon passing a dictionary, you need to specify that the dictionary fills the attrs
parameter: soup.find_all("li", attrs={"class" : "first_result"})
.
Upvotes: 2