littlejiver
littlejiver

Reputation: 255

Getting the wrong text when web scraping with BeautifulSoup

I'm getting the wrong text when I scrape this url:

http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018

This is what I have:

from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd

# Year requested first; it is decremented after every fetch, so additional
# iterations would walk backwards through earlier years.
year_number = 2018

# Controls how many yearly pages are fetched (a single one as written)
i = range(0, 1)

# Result columns, filled in page by page
names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []

for element in i:

    url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + str(year_number)
    print(url)
    year_number -= 1

    # A browser-like User-Agent is sent because the plain request was being blocked
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read().decode('utf-8')

    # Parse the whole page
    html_soup = BeautifulSoup(webpage, 'html5lib')

    # Collect the container elements for every game listed on the page
    game_names = html_soup.find_all("div", class_="main_stats")
    game_metas = html_soup.find_all("a", class_="basic_stat product_score")
    game_users = html_soup.find_all("li", class_='stat product_avguserscore')
    game_releases = html_soup.find_all("ul", class_='more_stats')
    # Not scraped yet (they would use the same "more_stats" lists):
    #     game_publishers = html_soup.find_all("ul", class_='more_stats')
    #     game_ratings = html_soup.find_all("ul", class_='more_stats')
    #     game_genres = html_soup.find_all("ul", class_='more_stats')

    # For each container, take the stripped text of the first tag nested inside
    # it (.find() with no arguments returns that first nested tag).
    # NOTE: for the user-score items this first tag appears to be the
    # "User Score:" label rather than the numeric value.
    for containers, column in (
        (game_names, names),
        (game_metas, metascores),
        (game_releases, release_dates),
        (game_users, userscores),
    ):
        column.extend(tag.find().text.strip() for tag in containers)

# Disabled clean-up of the user scores:
# for i in userscores:
#     temp = str(i)
#     temp2 = temp.replace("User:\n    ", "")
#     userscoresNew.append(temp2)

# Drop the "Release Date:" label and its trailing padding from every entry
release_datesNew.extend(
    str(raw).replace("Release Date:\n                        ", "")
    for raw in release_dates
)

# Disabled export of the collected columns:
# df = pd.DataFrame({'Games:': names,
#                     'Metascore:': metascores,
#                     'Userscore:': userscoresNew})

# df.to_csv("metacritic scrape.csv")

The code above is looking for the user score, but I get the text "User Score:" repeated 100 times, when what I want is the data in the next set of tags. However, when I try to change the variable above to:

 game_users = html_soup.find_all("span", class_='data textscore textscore_favorable')

I get an error when I run the code:

AttributeError: 'NoneType' object has no attribute 'text'

Also, I don't think the second option is a good approach, because when the user score falls below a certain level the class in the HTML changes (from "data textscore textscore_favorable" to "data textscore textscore_mixed").

Any help would be appreciated.

FYI, I'm modifying code I have already written so that it grabs more details from a more detailed view.

Upvotes: 0

Views: 417

Answers (1)

Rakesh
Rakesh

Reputation: 82785

This should help.

import requests
from bs4 import BeautifulSoup
# Fetch the Metacritic chart page and print every game's user score.

# Browser-like User-Agent header sent with the request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018"
html = requests.get(url, headers=headers)
html_soup = BeautifulSoup(html.text, "html.parser")

# Each <li class="stat product_avguserscore"> holds one game's user-score entry
game_users = html_soup.find_all("li", class_='stat product_avguserscore')
for i in game_users:
    # Match on the single shared "textscore" class instead of the full
    # "data textscore textscore_favorable" string: passing one class name to
    # class_ matches any tag carrying that class, so spans whose last class is
    # e.g. "textscore_mixed" are found as well instead of being skipped.
    userScore = i.find('span', class_="textscore")
    # Entries without a score span are skipped
    if userScore is not None:
        print(userScore.text)

Output:

7.6
7.8
8.2
7.8
8.1
8.5
7.5
7.5
....
  • Use html_soup.find_all("li", class_='stat product_avguserscore') to get each game's user-score list item, then look up the score span inside it.

Upvotes: 1

Related Questions