Reputation: 13
I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
It worked (though, removing the attribute "language") for one link but it doesn't work for any more link (for example.)
I'm receiving the error:
Traceback (most recent call last):
File "<pyshell#37>", line 4, in <module>
items = scrape(url)
File "<pyshell#13>", line 11, in scrape
items = parse(session, url + '?filterLang=' + lang)
File "<pyshell#18>", line 15, in parse
num_reviews = int(num_reviews) # convert text into integer
ValueError: invalid literal for int() with base 10: '5.695'
(where 5,695 is the number of reviews in the page)
I'm attaching the code here in case someone can help me.
Thank you so much! Silvia
--
Hereby the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io
def display(content, filename='output.html'):
with open(filename, 'wb') as f:
f.write(content)
webbrowser.open(filename)
def get_soup(session, url, show=False):
r = session.get(url)
if show:
display(r.content, 'temp.html')
if r.status_code != 200: # not OK
print('[get_soup] status code:', r.status_code)
else:
return BeautifulSoup(r.text, 'html.parser')
def post_soup(session, url, params, show=False):
'''Read HTML from server and convert to Soup'''
r = session.post(url, data=params)
if show:
display(r.content, 'temp.html')
if r.status_code != 200: # not OK
print('[post_soup] status code:', r.status_code)
else:
return BeautifulSoup(r.text, 'html.parser')
def scrape(url, lang='ALL'):
# create session to keep all cookies (etc.) between requests
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
})
items = parse(session, url + '?filterLang=' + lang)
return items
def parse(session, url):
'''Get number of reviews and start getting subpages with reviews'''
print('[parse] url:', url)
soup = get_soup(session, url)
if not soup:
print('[parse] no soup:', url)
return
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
num_reviews = num_reviews[1:-1]
num_reviews = num_reviews.replace(',', '')
num_reviews = int(num_reviews) # convert text into integer
print('[parse] num_reviews ALL:', num_reviews)
url_template = url.replace('.html', '-or{}.html')
print('[parse] url_template:', url_template)
items = []
offset = 0
while(True):
subpage_url = url_template.format(offset)
subpage_items = parse_reviews(session, subpage_url)
if not subpage_items:
break
items += subpage_items
if len(subpage_items) < 5:
break
offset += 5
return items
def get_reviews_ids(soup):
items = soup.find_all('div', attrs={'data-reviewid': True})
if items:
reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
print('[get_reviews_ids] data-reviewid:', reviews_ids)
return reviews_ids
def get_more(session, reviews_ids):
url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
payload = {
'reviews': ','.join(reviews_ids), # ie. "577882734,577547902,577300887",
#'contextChoice': 'DETAIL_HR', # ???
'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX', # ???
'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
'haveCsses': 'apg-Hotel_Review-in',
'Action': 'install',
}
soup = post_soup(session, url, payload)
return soup
def parse_reviews(session, url):
'''Get all reviews from one page'''
print('[parse_reviews] url:', url)
soup = get_soup(session, url)
if not soup:
print('[parse_reviews] no soup:', url)
return
hotel_name = soup.find('h1', id='HEADING').text
reviews_ids = get_reviews_ids(soup)
if not reviews_ids:
return
soup = get_more(session, reviews_ids)
if not soup:
print('[parse_reviews] no soup:', url)
return
items = []
for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
badgets = review.find_all('span', class_='badgetext')
if len(badgets) > 0:
contributions = badgets[0].text
else:
contributions = '0'
if len(badgets) > 1:
helpful_vote = badgets[1].text
else:
helpful_vote = '0'
user_loc = review.select_one('div.userLoc strong')
if user_loc:
user_loc = user_loc.text
else:
user_loc = ''
bubble_rating = review.select_one('span.ui_bubble_rating')['class']
bubble_rating = bubble_rating[1].split('_')[-1]
item = {
'review_body': review.find('p', class_='partial_entry').text,
'review_date': review.find('span', class_='ratingDate')['title'], # 'ratingDate' instead of 'relativeDate'
}
items.append(item)
print('\n--- review ---\n')
for key,val in item.items():
print(' ', key, ':', val)
print()
return items
def write_in_csv(items, filename='results.csv',
headers=['hotel name', 'review title', 'review body',
'review date', 'contributions', 'helpful vote',
'user name' , 'user location', 'rating'],
mode='w'):
print('--- CSV ---')
with io.open(filename, mode, encoding="utf-8") as csvfile:
csv_file = csv.DictWriter(csvfile, headers)
if mode == 'w':
csv_file.writeheader()
csv_file.writerows(items)
DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'
start_urls = [
'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]
headers = [
DB_COLUMN,
DB_COLUMN1,
]
lang = 'it'
for url in start_urls:
# get all reviews for 'url' and 'lang'
items = scrape(url)
if not items:
print('No reviews')
else:
# write in CSV
filename = url.split('Reviews-')[1][:-5]
print('filename:', filename)
write_in_csv(items, filename + '.csv', headers, mode='w')
Thanks to all the commenters. I realized the issue lied in the Italian and US paradigm for writing thousand separators (we use ".", whereas the americans use ",").
Upvotes: 1
Views: 735
Reputation: 216
You cannot be parsed directly to an integer value, In this case you first convert it into float then if you want convert it as Int.
num_reviews = int(float(num_reviews))
Upvotes: 0
Reputation: 2019
The error is due to the full stop in the int you are trying to convert. To make sure it works with all typing formats, you need to filter for numerical characters only before converting to int:
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
num_reviews = num_reviews[1:-1]
num_reviews = num_reviews.replace(',', '').replace('.','')
num_reviews = int(num_reviews)
Or more in a more generic way, only include numerical chars in the string num_reviews
Upvotes: 0
Reputation: 950
You seem to have the following string for number of views 5.695
before trying to type cast it to int with num_reviews = int(num_reviews)
.
Probably the .
in 5.695
is a thousands separator.
So remove the .
like this before using int()
:
num_reviews = num_reviews.replace('.', '')
num_reviews = int(num_reviews)
Upvotes: 3