Reputation: 131
I wrote a script to scrape the website Vivino, using the Beautiful Soup and Selenium libraries.
In this website, I want to store information of a certain wine's reviews.
I have to use Selenium to do dynamic scraping, since the reviews can only be accessed by pressing the "Show more reviews" button on the webpage, which appears after scrolling down to the bottom of the page.
I adapted the code for just one wine so you can see, if needed, how long it takes:
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
def scroll_to_bottom_wine_page(driver):
    """Repeatedly scroll the wine page until its document height stops growing.

    driver -- a Selenium WebDriver with the wine page already loaded.
    """
    pause = 0.01  # Tune if the page needs longer to load lazy content
    previous_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Jump to the current bottom, then give the page a moment to extend.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == previous_height:
            # Height is stable: no new content appeared, we are at the bottom.
            break
        previous_height = current_height
def scroll_to_bottom_review_page(driver, rating_count):
    """Scroll inside the review pop-up until `rating_count` reviews are loaded
    (or loading stalls), and return the parsed review cards.

    driver       -- Selenium WebDriver with the review modal already open.
    rating_count -- expected number of reviews; used as the stop target.

    Returns a list of BeautifulSoup <div> tags, one per review card.
    """
    scroll_pause = 0.8  # Give the modal time to fetch each next batch
    stuck_counter = 0   # Consecutive passes that loaded no new reviews
    previous_count = 0
    reviews = []
    time.sleep(scroll_pause)
    # Any anchor inside the modal serves as a target for the END key press.
    # Selenium 4 removed find_element_by_xpath; By is imported at file top.
    element_inside_popup = driver.find_element(
        By.XPATH, '//*[@id="baseModal"]/div/div[2]/div[3]//a')
    while True:
        time.sleep(scroll_pause)
        element_inside_popup.send_keys(Keys.END)
        page_html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(page_html, 'lxml')
        reviews = soup.findAll("div", {"class": "card__card--2R5Wh reviewCard__reviewCard--pAEnA"})
        current_count = len(reviews)
        if current_count == previous_count:
            # No progress this pass; after three stalled passes we give up so
            # we never scroll forever when fewer reviews exist than rating_count.
            stuck_counter += 1
        else:
            # BUG FIX: reset on progress — previously two stalled passes
            # anywhere in the run (e.g. isolated slow loads) accumulated and
            # aborted the scroll prematurely.
            stuck_counter = 0
        # BUG FIX: use >= instead of > — with > a wine holding exactly
        # rating_count reviews could never satisfy the count condition and
        # always burned three extra 0.8 s stalled passes before stopping.
        if current_count >= rating_count or stuck_counter > 2:
            break
        previous_count = current_count
    return reviews
def get_reviews(wine_ids, wine_urls, rating_counts):
    """Scrape review data for each wine URL and return one combined DataFrame.

    wine_ids      -- list of wine identifiers, parallel to `wine_urls`.
    wine_urls     -- list of Vivino wine-page URLs to scrape.
    rating_counts -- target number of reviews to load per wine (forwarded to
                     scroll_to_bottom_review_page as its stop target).

    Returns a pandas DataFrame with one row per scraped review.
    """
    review_info = pd.DataFrame()
    driver = webdriver.Chrome()
    try:
        # BUG FIX: iterate ids and urls together — the original stored the
        # entire wine_ids list in every row instead of the matching id.
        for wine_id, wine_url in zip(wine_ids, wine_urls):
            driver.get(wine_url)
            # The "Show more reviews" button only renders once the page has
            # been scrolled to the bottom.
            scroll_to_bottom_wine_page(driver)
            wait = WebDriverWait(driver, 40)
            wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Show more reviews')))
            # Selenium 4 removed find_element_by_link_text; use By instead.
            driver.find_element(By.LINK_TEXT, 'Show more reviews').click()
            # Load reviews inside the pop-up until the target count is reached.
            reviews = scroll_to_bottom_review_page(driver, rating_counts)
            # Build the frame from a list of dicts: robust (assigning lists
            # via .loc[:, col] on an empty DataFrame is a pandas pitfall) and
            # keeps the original column order.
            rows = [_parse_review(review, wine_id) for review in reviews]
            review_info = pd.concat([review_info, pd.DataFrame(rows)],
                                    axis=0, ignore_index=True)
    finally:
        # BUG FIX: always release the browser, even when a page fails
        # mid-scrape — previously an exception leaked the Chrome process.
        driver.quit()
    return review_info


def _parse_review(review, wine_id):
    """Extract one review card (a BeautifulSoup tag) into a flat row dict."""
    username = str(review.find(
        'a', {"class": 'anchor__anchor--3DOSm reviewCard__userName--2KnRl'}).string)
    ratings_text = review.find('div', {"class": 'reviewCard__ratingsText--1LU2T'}).text
    note = review.find('p', {"class": 'reviewCard__reviewNote--fbIdd'})
    likes = review.find("div", class_="likeButton__likeCount--82au4")
    comments = review.find("div", class_="commentsButton__commentsCount--1_Ugm")
    return {
        'wine_id': wine_id,
        'review_user_links': [a['href'] for a in review.find_all('a', href=True)][0],
        # aria-label looks like "Rated 4.5 ..." — the score is the 2nd token.
        'review_ratings': float(
            review.find("div", class_="rating__rating--ZZb_x")["aria-label"].split()[1]),
        'review_usernames': username,
        # Drop the username from the combined "<user> rated ..." text.
        'review_dates': "".join(ratings_text.rsplit(username)),
        # BUG FIX: strip once at extraction time — the original re-stripped
        # the whole accumulated list on every iteration (quadratic work, and
        # only executed in the branch where a note was present).
        'review_texts': str(note.string).strip() if note is not None else 'None',
        'review_likes_count': int(likes.text) if likes is not None else 0,
        'review_comments_count': int(comments.text) if comments is not None else 0,
    }
if __name__ == "__main__":
    # Example run for a single wine (Vivino reports 186 ratings for it).
    wine_id = ['123']
    wine_url = ['https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015&price_id=21118981']
    wine_rating_count = 186
    # BUG FIX: perf_counter() is monotonic, so the measurement cannot be
    # skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    reviews_info = get_reviews(wine_id, wine_url, wine_rating_count)
    elapsed_time = time.perf_counter() - start_time
    print('The scrape took: ', elapsed_time)  # ~38 s for this particular wine
The script I wrote does the following steps:
With a certain wine link (ie: https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015&price_id=21118981), I can access that webpage with Selenium driver.
Then, I scroll down to the bottom of the web page.
I find and click the button "Show more reviews"
After pressing this button, a pop-up page appears with the wine reviews
I scroll down in these pop-up window until it reaches a certain amount of reviews
I extract the information I need from the reviews (each review is a Beautiful Soup's soup object)
The problem is that, if I want to scrape the reviews information of thousands of wines, it would take forever. For a single wine with 99 reviews, it takes 35 seconds to do this.
Is there any way I can speed up this process?
Upvotes: 2
Views: 3109
Reputation: 1758
My advice is don't use Selenium. Selenium should be your last option for scraping a web page. Instead, learn to understand how a web page makes requests using your web browser's developer tools. For example, for the web page you posted, this is the URL where you can retrieve the comments: https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=10&page=1
They have an API!! It's very easy to scrape something like that.
You only need requests
and maybe BeautifulSoup.
# Mimic a real browser XHR so Vivino's API accepts the request.
headers = {
    "pragma": "no-cache",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}
url = "https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=10&page=1"
# Fetch one page of reviews (10 per page) and decode the JSON payload.
resp = requests.get(url, headers=headers)
resp.json()
The answer looks like:
{'reviews': [{'id': 118841527,
'rating': 5.0,
'note': 'You need to taste it!! ',
'language': 'en',
'created_at': '2019-02-16T15:33:49.000Z',
'aggregated': True,
'user': {'id': 10310349,
'seo_name': 'miguellourenco0',
'alias': 'Miguel Lourenço',
'is_featured': False,
'visibility': 'all',
'image': {'location': '//images.vivino.com/avatars/0064zilphklf01a4dd1d69f.jpg',
'variations': {'large': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_300x300.jpg',
'small_square': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_50x50.jpg'}},
'statistics': {'followers_count': 14,
'followings_count': 21,
'ratings_count': 113,
'ratings_sum': 0,
'reviews_count': 90},
'background_image': None},
Upvotes: 1
Reputation: 1285
Those reviews are from their api:
import requests

# Vivino's review API rejects the default requests UA, so present a
# desktop-Chrome User-Agent header instead.
agent = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'}
response = requests.get(
    'https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=100',
    headers=agent,
)
# The JSON body carries the review objects under the "reviews" key.
reviews = response.json()["reviews"]
print(reviews)
Upvotes: 0