Sennheiser
Sennheiser

Reputation: 13

ValueError when scraping Tripadvisor for reviews with BeautifulSoup

I am trying to scrape some Tripadvisor reviews as a complete newbie to this.

I'm using code from Susanli2016.

It worked (though, removing the attribute "language") for one link but it doesn't work for any more link (for example.)

I'm receiving the error:

        Traceback (most recent call last):
      File "<pyshell#37>", line 4, in <module>
        items = scrape(url)
      File "<pyshell#13>", line 11, in scrape
        items = parse(session, url + '?filterLang=' + lang)
      File "<pyshell#18>", line 15, in parse
        num_reviews = int(num_reviews) # convert text into integer
    ValueError: invalid literal for int() with base 10: '5.695'

(where 5,695 is the number of reviews in the page)

I'm attaching the code here in case someone can help me.

Thank you so much! Silvia

--

Hereby the complete code:

import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io
def display(content, filename='output.html'):
    with open(filename, 'wb') as f:
         f.write(content)
         webbrowser.open(filename)


def get_soup(session, url, show=False):
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200: # not OK
        print('[get_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')


def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''

    r = session.post(url, data=params)

    if show:
        display(r.content, 'temp.html')

    if r.status_code != 200: # not OK
        print('[post_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def scrape(url, lang='ALL'):

    # create session to keep all cookies (etc.) between requests
    session = requests.Session()

    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })


    items = parse(session, url + '?filterLang=' + lang)

    return items

 def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''

    print('[parse] url:', url)

    soup = get_soup(session, url)

    if not soup:
        print('[parse] no soup:', url)
        return

    num_reviews = soup.find('span', class_='reviews_header_count').text # get text
    num_reviews = num_reviews[1:-1] 
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews) # convert text into integer
print('[parse] num_reviews ALL:', num_reviews)

    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)

    items = []

    offset = 0

    while(True):
        subpage_url = url_template.format(offset)

        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break

        items += subpage_items

        if len(subpage_items) < 5:
            break

        offset += 5

    return items

 def get_reviews_ids(soup):

    items = soup.find_all('div', attrs={'data-reviewid': True})

    if items:
        reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
        print('[get_reviews_ids] data-reviewid:', reviews_ids)
        return reviews_ids

def get_more(session, reviews_ids):

    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'

    payload = {
        'reviews': ','.join(reviews_ids), # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR', # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX', # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }

    soup = post_soup(session, url, payload)

   return soup

 def parse_reviews(session, url):
    '''Get all reviews from one page'''

    print('[parse_reviews] url:', url)

    soup =  get_soup(session, url)

    if not soup:
        print('[parse_reviews] no soup:', url)
        return

   hotel_name = soup.find('h1', id='HEADING').text

    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return

    soup = get_more(session, reviews_ids)

    if not soup:
        print('[parse_reviews] no soup:', url)
        return

    items = []

    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):

        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'

        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''

        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]

        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'], # 'ratingDate' instead of 'relativeDate'
       }

        items.append(item)
        print('\n--- review ---\n')
        for key,val in item.items():
            print(' ', key, ':', val)

    print()

    return items

def write_in_csv(items, filename='results.csv',
                  headers=['hotel name', 'review title', 'review body',
                           'review date', 'contributions', 'helpful vote',
                           'user name' , 'user location', 'rating'],
              mode='w'):

    print('--- CSV ---')

    with io.open(filename, mode, encoding="utf-8") as csvfile:
        csv_file = csv.DictWriter(csvfile, headers)

        if mode == 'w':
            csv_file.writeheader()

        csv_file.writerows(items)


 DB_COLUMN   = 'review_body'
 DB_COLUMN1 = 'review_date'
 start_urls = [
    'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]

headers = [ 
    DB_COLUMN, 
    DB_COLUMN1, 
]

 lang = 'it'


 for url in start_urls:

    # get all reviews for 'url' and 'lang'
    items = scrape(url)

    if not items:
        print('No reviews')
    else:
        # write in CSV
        filename = url.split('Reviews-')[1][:-5]
        print('filename:', filename)
        write_in_csv(items, filename + '.csv', headers, mode='w')

Thanks to all the commenters. I realized the issue lied in the Italian and US paradigm for writing thousand separators (we use ".", whereas the americans use ",").

Upvotes: 1

Views: 735

Answers (3)

Ravi
Ravi

Reputation: 216

You cannot be parsed directly to an integer value, In this case you first convert it into float then if you want convert it as Int.

num_reviews = int(float(num_reviews))

Upvotes: 0

nickyfot
nickyfot

Reputation: 2019

The error is due to the full stop in the int you are trying to convert. To make sure it works with all typing formats, you need to filter for numerical characters only before converting to int:

num_reviews = soup.find('span', class_='reviews_header_count').text # get text
num_reviews = num_reviews[1:-1] 
num_reviews = num_reviews.replace(',', '').replace('.','')
num_reviews = int(num_reviews)

Or more in a more generic way, only include numerical chars in the string num_reviews

Upvotes: 0

Cvetan Mihaylov
Cvetan Mihaylov

Reputation: 950

You seem to have the following string for number of views 5.695 before trying to type cast it to int with num_reviews = int(num_reviews).

Probably the . in 5.695 is a thousands separator.

So remove the . like this before using int():

num_reviews = num_reviews.replace('.', '')
num_reviews = int(num_reviews)

Upvotes: 3

Related Questions