Reputation: 131
I wrote a script to scrape the website Vivino, using the Beautiful Soup and Selenium libraries.
In this website, I want to store information of a certain wine's reviews.
I have to use Selenium to do dynamic scraping, since the reviews can only be accessed by pressing the "Show more reviews" button on the webpage, which appears after scrolling down to the bottom of the page.
I adapted the code for just one wine so you can see, if needed, how long it takes:
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
def scroll_to_bottom_wine_page(driver):
    """Repeatedly scroll the wine page until its document height stops growing.

    driver -- a Selenium WebDriver with the wine page already loaded.
    """
    pause = 0.01  # Tune if the page needs longer to load lazy content
    previous_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Jump to the current bottom, then give the page a moment to extend.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == previous_height:
            # Height is stable: no new content appeared, we are at the bottom.
            break
        previous_height = current_height
def scroll_to_bottom_review_page(driver, rating_count):
    """Scroll inside the review pop-up until `rating_count` reviews are loaded
    (or loading stalls), and return the parsed review cards.

    driver       -- Selenium WebDriver with the review modal already open.
    rating_count -- expected number of reviews; used as the stop target.

    Returns a list of BeautifulSoup <div> tags, one per review card.
    """
    scroll_pause = 0.8  # Give the modal time to fetch each next batch
    stuck_counter = 0   # Consecutive passes that loaded no new reviews
    previous_count = 0
    reviews = []
    time.sleep(scroll_pause)
    # Any anchor inside the modal serves as a target for the END key press.
    # Selenium 4 removed find_element_by_xpath; By is imported at file top.
    element_inside_popup = driver.find_element(
        By.XPATH, '//*[@id="baseModal"]/div/div[2]/div[3]//a')
    while True:
        time.sleep(scroll_pause)
        element_inside_popup.send_keys(Keys.END)
        page_html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(page_html, 'lxml')
        reviews = soup.findAll("div", {"class": "card__card--2R5Wh reviewCard__reviewCard--pAEnA"})
        current_count = len(reviews)
        if current_count == previous_count:
            # No progress this pass; after three stalled passes we give up so
            # we never scroll forever when fewer reviews exist than rating_count.
            stuck_counter += 1
        else:
            # BUG FIX: reset on progress — previously two stalled passes
            # anywhere in the run (e.g. isolated slow loads) accumulated and
            # aborted the scroll prematurely.
            stuck_counter = 0
        # BUG FIX: use >= instead of > — with > a wine holding exactly
        # rating_count reviews could never satisfy the count condition and
        # always burned three extra 0.8 s stalled passes before stopping.
        if current_count >= rating_count or stuck_counter > 2:
            break
        previous_count = current_count
    return reviews
def get_reviews(wine_ids, wine_urls, rating_counts):
    """Scrape review data for each wine URL and return one combined DataFrame.

    wine_ids      -- list of wine identifiers, parallel to `wine_urls`.
    wine_urls     -- list of Vivino wine-page URLs to scrape.
    rating_counts -- target number of reviews to load per wine (forwarded to
                     scroll_to_bottom_review_page as its stop target).

    Returns a pandas DataFrame with one row per scraped review.
    """
    review_info = pd.DataFrame()
    driver = webdriver.Chrome()
    try:
        # BUG FIX: iterate ids and urls together — the original stored the
        # entire wine_ids list in every row instead of the matching id.
        for wine_id, wine_url in zip(wine_ids, wine_urls):
            driver.get(wine_url)
            # The "Show more reviews" button only renders once the page has
            # been scrolled to the bottom.
            scroll_to_bottom_wine_page(driver)
            wait = WebDriverWait(driver, 40)
            wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Show more reviews')))
            # Selenium 4 removed find_element_by_link_text; use By instead.
            driver.find_element(By.LINK_TEXT, 'Show more reviews').click()
            # Load reviews inside the pop-up until the target count is reached.
            reviews = scroll_to_bottom_review_page(driver, rating_counts)
            # Build the frame from a list of dicts: robust (assigning lists
            # via .loc[:, col] on an empty DataFrame is a pandas pitfall) and
            # keeps the original column order.
            rows = [_parse_review(review, wine_id) for review in reviews]
            review_info = pd.concat([review_info, pd.DataFrame(rows)],
                                    axis=0, ignore_index=True)
    finally:
        # BUG FIX: always release the browser, even when a page fails
        # mid-scrape — previously an exception leaked the Chrome process.
        driver.quit()
    return review_info


def _parse_review(review, wine_id):
    """Extract one review card (a BeautifulSoup tag) into a flat row dict."""
    username = str(review.find(
        'a', {"class": 'anchor__anchor--3DOSm reviewCard__userName--2KnRl'}).string)
    ratings_text = review.find('div', {"class": 'reviewCard__ratingsText--1LU2T'}).text
    note = review.find('p', {"class": 'reviewCard__reviewNote--fbIdd'})
    likes = review.find("div", class_="likeButton__likeCount--82au4")
    comments = review.find("div", class_="commentsButton__commentsCount--1_Ugm")
    return {
        'wine_id': wine_id,
        'review_user_links': [a['href'] for a in review.find_all('a', href=True)][0],
        # aria-label looks like "Rated 4.5 ..." — the score is the 2nd token.
        'review_ratings': float(
            review.find("div", class_="rating__rating--ZZb_x")["aria-label"].split()[1]),
        'review_usernames': username,
        # Drop the username from the combined "<user> rated ..." text.
        'review_dates': "".join(ratings_text.rsplit(username)),
        # BUG FIX: strip once at extraction time — the original re-stripped
        # the whole accumulated list on every iteration (quadratic work, and
        # only executed in the branch where a note was present).
        'review_texts': str(note.string).strip() if note is not None else 'None',
        'review_likes_count': int(likes.text) if likes is not None else 0,
        'review_comments_count': int(comments.text) if comments is not None else 0,
    }
if __name__ == "__main__":
    # Example run for a single wine (Vivino reports 186 ratings for it).
    wine_id = ['123']
    wine_url = ['https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015&price_id=21118981']
    wine_rating_count = 186
    # BUG FIX: perf_counter() is monotonic, so the measurement cannot be
    # skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    reviews_info = get_reviews(wine_id, wine_url, wine_rating_count)
    elapsed_time = time.perf_counter() - start_time
    print('The scrape took: ', elapsed_time)  # ~38 s for this particular wine
The script I wrote does the following steps:
With a certain wine link (ie: https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015&price_id=21118981), I can access that webpage with Selenium driver.
Then, I scroll down to the bottom of the web page.
I find and click the button "Show more reviews"
After pressing this button, a pop-up page appears with the wine reviews
I scroll down in these pop-up window until it reaches a certain amount of reviews
I extract the information I need from the reviews (each review is a Beautiful Soup's soup object)
The problem is that, if I want to scrape the reviews information of thousands of wines, it would take forever. For a single wine with 99 reviews, it takes 35 seconds to do this.
Is there any way I can speed up this process?
Upvotes: 2
Views: 3109
Reputation: 1758
My advice is don't use Selenium. Selenium should be your last option for scraping a web page. Instead, learn to understand how a web page makes requests using your web browser's developer tools. For example, for the web page you posted, this is the URL where you can retrieve the comments: https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=10&page=1
They have an API!! It's very easy to scrape something like that.
You only need requests
and maybe BeautifulSoup.
# Mimic a real browser XHR so Vivino's API accepts the request.
headers = {
    "pragma": "no-cache",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}
url = "https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=10&page=1"
# Fetch one page of reviews (10 per page) and decode the JSON payload.
resp = requests.get(url, headers=headers)
resp.json()
The answer looks like:
{'reviews': [{'id': 118841527,
'rating': 5.0,
'note': 'You need to taste it!! ',
'language': 'en',
'created_at': '2019-02-16T15:33:49.000Z',
'aggregated': True,
'user': {'id': 10310349,
'seo_name': 'miguellourenco0',
'alias': 'Miguel Lourenço',
'is_featured': False,
'visibility': 'all',
'image': {'location': '//images.vivino.com/avatars/0064zilphklf01a4dd1d69f.jpg',
'variations': {'large': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_300x300.jpg',
'small_square': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_50x50.jpg'}},
'statistics': {'followers_count': 14,
'followings_count': 21,
'ratings_count': 113,
'ratings_sum': 0,
'reviews_count': 90},
'background_image': None},
Upvotes: 1
Reputation: 1285
Those reviews are from their api:
import requests

# Vivino's review API rejects the default requests UA, so present a
# desktop-Chrome User-Agent header instead.
agent = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'}
response = requests.get(
    'https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=100',
    headers=agent,
)
# The JSON body carries the review objects under the "reviews" key.
reviews = response.json()["reviews"]
print(reviews)
Upvotes: 0