Austin Capobianco

Reputation: 214

Why is Beautiful Soup not finding this element with multiple classes?

I'm trying to select an element that looks like <li class="result first_result"> using the query soup.find_all("li", {"class" : "first_result"}).

The element is definitely on the page, but it's not showing up when I run my script. For the record, I've also tried soup.find_all("li", {"class" : "result first_result"}), but still nothing.

What am I doing wrong?

Edit: At alecxe's request, I've posted the code I have so far. I'm on 64-bit Windows 7 using Python 3.4, which I'm sure is the culprit. The part this question is specifically about is at the very bottom, under ###METACRITIC STUFF###.

from bs4 import BeautifulSoup
from urllib3 import poolmanager
import csv
import requests
import sys
import os
import codecs
import re
import html5lib
import math
import time
from random import randint

connectBuilder = poolmanager.PoolManager()

inputstring = sys.argv[1]  #argv string MUST use double quotes

inputarray = re.split(r'\s+', inputstring)

##########################KAT STUFF########################


katstring = ""

for item in inputarray: katstring += (item + "+")
katstring=katstring[:-1]
#kataddress = "https://kat.cr/usearch/?q=" + katstring    #ALL kat
kataddress = "https://kat.cr/usearch/" + inputstring + " category:tv/?field=seeders&sorder=desc"    #JUST TV kat
#print(kataddress)
numSeedsArray = []
numLeechArray = []


r = requests.get(kataddress)
soup = BeautifulSoup(r.content, "html5lib")
totalpages = [h2.find('span') for h2 in soup.findAll('h2')][0].text #get a string like 'house of cards results 1-25 from 178'
totalpages = int(totalpages[-4:]) #slice off everything but the total # of results
totalpages = math.floor(totalpages/25) #convert the result count to a page count (25 results per page)

#print("totalpages= "+str(totalpages))
iteration=0
savedpage = ""

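# Fetch one page of KAT results and append its seeder/leecher counts
# to the global tallies.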
def getdata(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    global numSeedsArray
    global numLeechArray


    tds = soup.findAll("td", { "class" : "green center" })
    numSeedsArray += [int(td.text) for td in tds]   
    tds = soup.findAll("td", { "class" : "red lasttd center"})
    numLeechArray += [int(td.text) for td in tds] 
    #print(numSeedsArray)



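# Work out the URL of the next results page by splicing the next page number
# into the first pagination link (the slicing offsets assume KAT's URL layout).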
def getnextpage(url):
    global iteration
    global savedpage
    #print("url examined= "+url)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    nextpagelinks = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton" })
    nextpagelinks = [link.get('href') for link in nextpagelinks]
    #print(nextpagelinks)

    activepage = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton active" })
    #print("activepage= " +activepage[0].text)
    currentpagenum = activepage[0].text
    #print("currentpagenum= "+currentpagenum)
    if len(currentpagenum)==1 and iteration>1:
        nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        #print("nextpage= "+nextpage)
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    elif len(currentpagenum)==1 and iteration<=1:
        nextpage = str(nextpagelinks[0][:-28])+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        savedpage = str(nextpagelinks[0][:-28])
        #print("savedpage= "+savedpage )
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    elif len(currentpagenum)==2:
        nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        #print("nextpage= "+nextpage)
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)

    return nextpage   



if totalpages<2:
    while iteration < totalpages-1: #should be totalpages-1 for max accuracy
        getdata(kataddress)
        iteration+=1
        kataddress = getnextpage(kataddress)
else:
    while iteration < 2: #should be totalpages-1 for max accuracy
        getdata(kataddress)
        iteration+=1
        kataddress = getnextpage(kataddress)
    # print(str(sum(numSeedsArray)))
    # print(str(sum(numLeechArray)))

print(str(sum(numLeechArray)+sum(numSeedsArray)))

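# Look up the query on Google and print the total result count
# parsed from the "resultStats" div.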
def getgoogdata(title):
    title = re.sub(r' ', '+', title)
    url = 'https://www.google.com/search?q=' +title+ '&ie=utf-8&oe=utf-8'
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    resultnum = soup.find("div", {"id": "resultStats"}).text[:-14]

    s2 = resultnum.replace(',', '')
    resultnum = re.findall(r'\b\d+\b', s2)
    print(resultnum)

getgoogdata(inputstring)



####################METACRITIC STUFF#########################
metainputstring = ""
for item in inputarray:
    metainputstring += item + " "
metainputstring = metainputstring[:-1]
metacriticaddress = "http://www.metacritic.com/search/tv/" + metainputstring + "/results"

print(metacriticaddress)

r = requests.get(metacriticaddress)
soup = BeautifulSoup(r.content, "html5lib")
first_result = soup.find_all("li", attrs={"class" : "first_result"})

# first_result = soup.select("li.result.first_result")
print(first_result)

Upvotes: 1

Views: 1786

Answers (3)

alecxe

Reputation: 473753

None of the other answers address your actual problem.

You need to pretend to be a real browser to be able to see the search results:

r = requests.get(metacriticaddress, headers={
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})

Proof (searching for Game of Thrones, of course):

>>> from bs4 import BeautifulSoup
>>> 
>>> import requests
>>> 
>>> metacriticaddress = "http://www.metacritic.com/search/tv/game%20of%20thrones/results"
>>> r = requests.get(metacriticaddress, headers={
...     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
... })
>>> soup = BeautifulSoup(r.content, "html5lib")
>>> first_result = soup.find_all("li", class_="first_result")
>>> 
>>> print(first_result[0].find("h3", class_="product_title").get_text(strip=True))
Game of Thrones
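If you're making more than one request, it can be cleaner to set the header once on a requests.Session so every request sends it. A minimal sketch (the Session approach is my addition, not part of the answer above):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
)

r = session.get("http://www.metacritic.com/search/tv/game%20of%20thrones/results")
soup = BeautifulSoup(r.content, "html5lib")
print(len(soup.find_all("li", class_="first_result")))  # 1 when the request isn't blocked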

Upvotes: 2

wpercy

Reputation: 10090

Your first attempt (soup.find_all("li", {"class" : "first_result"})) is close; to make explicit which parameter the dictionary is being passed to, name it: in this case the parameter is attrs, so the call becomes soup.find_all("li", attrs={"class" : "first_result"}).

However, I would suggest using a CSS selector here, since you're matching against multiple classes. You can do that with the soup's .select() method:

results = soup.select("li.result.first_result")

Be aware that .select() always returns a list, so even when only one element matches, you'll need to access it as results[0].
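For example, applied to the script in the question (the empty-list guard is my addition):

results = soup.select("li.result.first_result")
if results:  # .select() returns a possibly empty list
    print(results[0])
else:
    print("no match - the page may not contain the expected markup")

If you're on Beautiful Soup 4.4.0 or later, soup.select_one("li.result.first_result") does the same thing but returns the first match directly, or None if nothing matches.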

Upvotes: 1

jwodder

Reputation: 57450

Quoting the documentation:

It’s very useful to search for a tag that has a certain CSS class, but the name of the CSS attribute, “class”, is a reserved word in Python. Using class as a keyword argument will give you a syntax error. As of Beautiful Soup 4.1.2, you can search by CSS class using the keyword argument class_.

Thus, you need to instead write: soup.find_all("li", class_="first_result").

If you're using a version of Beautiful Soup older than 4.1.2, or if you insist on passing a dictionary, you need to specify that the dictionary fills the attrs parameter: soup.find_all("li", attrs={"class" : "first_result"}).
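For illustration, all three of these calls should match the same elements (the class_ form needs 4.1.2+, as noted above):

soup.find_all("li", class_="first_result")            # 4.1.2 and later
soup.find_all("li", attrs={"class": "first_result"})  # works across versions
soup.select("li.first_result")                        # CSS selector equivalent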

Upvotes: 2
