toshi456
toshi456

Reputation: 223

Scraping Tripadvisor reviews of all pages of a particular hotel with Python and Selenium

I am new to scraping and am using Python and Selenium to scrape all the TripAdvisor reviews of a particular hotel. Currently it only scrapes reviews from the first 6 of 36 pages. I need to scrape the reviews from all of the hotel's pages and save them to a CSV file. The following is the code I'm using.

import csv
import time
import requests
import re
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver

# Module-level Chrome WebDriver shared by every function below.
# NOTE(review): the positional argument is presumably the path to the
# chromedriver executable (Selenium 3 style) - confirm against the installed
# Selenium version.
driver = webdriver.Chrome("./chromedriver")


def check_exists_by_xpath(xpath):
    """Return True if the current page has an element matching *xpath*.

    The lookup is done on the module-level ``driver``, i.e. against the whole
    page, not against any particular review element.

    Args:
        xpath: XPath expression handed to ``driver.find_element_by_xpath``.

    Returns:
        ``True`` when a matching element is found, ``False`` when Selenium
        raises ``NoSuchElementException``.
    """
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    # The original had a ``time.sleep(2)`` after this return; it could never
    # run, so it has been dropped.
    return True


def getHotelReviews():
    """Scrape the reviews on the page loaded in the current tab into the CSV.

    For each review container found under the ``reviews-tab`` element this
    expands the review when it has its own expand link, reads the review
    text, date and title, prints them, and writes one ``(date, title,
    review)`` row through the module-level ``csvWriter``.

    A review whose elements cannot be read is reported and skipped, so one
    malformed entry no longer drops every review after it on the page.

    Side effects: closes the current tab and switches the driver back to the
    first window handle when done.
    """
    # Local import: only this function needs Selenium's base exception class.
    from selenium.common.exceptions import WebDriverException

    more_xpath = ".//div[contains(@class,'_2f_ruteS _1bona3Pu _2uD5bLZZ')]/div[2]/div/span[1]"
    text_xpath = ".//div[contains(@class,'_2f_ruteS _1bona3Pu')]/div/q/span"

    # Find and click the More link (to load all reviews)
    driver.find_element_by_xpath("//span[@class='_33O9dg0j']").click()
    time.sleep(20)

    reviews = driver.find_elements_by_xpath("//div[@data-test-target='reviews-tab']/div")
    reviews_count = len(reviews)
    print(reviews_count)

    # Loop through the reviews found.  The first two children are skipped, as
    # in the original loop (presumably they are header blocks, not reviews).
    for container in reviews[2:]:
        try:
            # Look for the expand link inside THIS review.  A page-wide check
            # can succeed because of a different review and then make the
            # per-review lookup raise.
            more_links = container.find_elements_by_xpath(more_xpath)
            if more_links:
                more_links[0].click()
                time.sleep(20)

            # Default to an empty string so the CSV row can always be written.
            review = ""
            text_nodes = container.find_elements_by_xpath(text_xpath)
            if text_nodes:
                review = text_nodes[0].text
                print(review)

            date = container.find_element_by_xpath(".//span[contains(@class,'_34Xs-BQm')]").text
            print(date)
            title = container.find_element_by_xpath(".//div[contains(@class,'glasR4aX')]/a/span").text
            print(title)
        except WebDriverException as exc:
            # Report and move on instead of silently abandoning the page.
            print("Skipping review:", exc)
            continue

        # Save to CSV
        csvWriter.writerow((date, title, review))

    driver.close()
    driver.switch_to.window(driver.window_handles[0])


def getHotelPages(url):
    """Open *url* and append the pagination links found on it to ``URLs``.

    Only the page-number anchors present in the DOM at the time of the call
    are collected.
    NOTE(review): if the site renders only a subset of page links (e.g.
    "1 2 3 ... 36"), the remaining pages will not be discovered here - confirm
    against the live page.

    Args:
        url: Address of the hotel's first review page.
    """
    driver.get(url)
    # to maximize the driver
    driver.maximize_window()

    page_links = driver.find_elements_by_xpath("//a[contains(@class,'pageNum cx_brand_refresh_phase2 ')]")
    print(len(page_links))

    for link in page_links:
        # Read the attribute once; each lookup is a round trip to the browser.
        href = link.get_attribute("href")
        print(href)
        URLs.append(href)


# Seed list of pages to scrape; getHotelPages() appends further page URLs.
URLs = [
    'https://www.tripadvisor.com/Hotel_Review-g304141-d3895228-Reviews-The_Hideout_Sigiriya-Sigiriya_Central_Province.html#REVIEWS']

# Prepare CSV file.  The ``with`` block guarantees the file is flushed and
# closed even if scraping fails part-way through.
with open("hideoutSigiriyab_reviews1.csv", "w", newline='', encoding="utf-8") as csvFile:
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(['Date', 'Title', 'Review'])

    try:
        try:
            getHotelPages(URLs[0])
        except Exception as exc:
            # Best effort: carry on with whatever URLs are known, but say
            # why page discovery failed instead of hiding it.
            print("Error!!", exc)

        time.sleep(60)

        for url in URLs:
            # Each page is scraped in a fresh tab, which getHotelReviews()
            # closes before switching back to the first window.
            driver.execute_script("window.open('');")
            driver.switch_to.window(driver.window_handles[1])
            driver.get(url)

            getHotelReviews()
            time.sleep(20)
    finally:
        # quit() ends the whole WebDriver session (all windows), including
        # when an error left extra tabs open.
        driver.quit()

Can you help me by suggesting a method or working code to scrape the reviews from all the pages of a hotel?

Upvotes: 0

Views: 1860

Answers (1)

Arundeep Chohan
Arundeep Chohan

Reputation: 9969

Simple way to click pages 1-36.

# The text of the last link in the pagination bar is parsed as the number of
# pages (presumably the highest page number - confirm against the live page).
size=int(driver.find_element_by_css_selector('div.pageNumbers >a:nth-last-child(1)').text)

# Start at 2, assuming page 1 is the one already loaded.  The upper bound is
# size + 1 because range() excludes its end value; range(2, size) would never
# click the last page.
for i in range(2, size + 1):
    pageNums=WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.pageNumbers")))
    # The leading "." keeps the search inside the pagination element; a bare
    # "//a" would match any link in the document with the same text.
    pageNums.find_element_by_xpath(".//a[text()='{}']".format(i)).click()
    time.sleep(5)

Import

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC

Upvotes: 2

Related Questions