Reputation: 223
I am new to web scraping and am using Python and Selenium to scrape all the reviews of a particular hotel on TripAdvisor. Currently my code only scrapes the reviews from the first 6 of the hotel's 36 review pages. I need to scrape the reviews from every page and save them to a CSV file. The following is the code I am using.
import csv
import time
import requests
import re
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
# Start the Chrome session, passing "./chromedriver" as the driver executable.
# This module-level `driver` is the single browser handle that every function
# in this script reads as a global.
driver = webdriver.Chrome("./chromedriver")
def check_exists_by_xpath(xpath):
    """Tell whether the page loaded in the shared ``driver`` matches *xpath*.

    The lookup always starts from ``driver`` (the whole document), never from
    an individual element.

    Args:
        xpath: XPath expression to look up.

    Returns:
        True when the lookup succeeds, False when Selenium raises
        NoSuchElementException.
    """
    # NOTE(review): the find_element_by_* helpers were removed in newer
    # Selenium 4 releases - confirm the installed version supports them.
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        found = False
    else:
        found = True
    return found
# Pause for two seconds.
# NOTE(review): the original indentation is lost, so it is unclear whether this
# ran at module level or sat (unreachably) after the `return True` above - confirm.
time.sleep(2)
def getHotelReviews():
    """Write the reviews of the page currently loaded in ``driver`` to the CSV.

    Clicks the page-level "More" link when it is present, then walks the child
    ``div`` elements of the reviews tab (skipping the first two, as the
    original loop did) and writes one ``(date, title, review)`` row per review
    through the module-level ``csvWriter``.

    Every lookup is made relative to the review being processed, so an element
    that exists in some other review can no longer trigger a failing lookup.
    A container that lacks a date or title, or whose "More" click fails, is
    reported and skipped instead of ending the scrape of the whole page. A
    missing review text is written as an empty string.

    The current window is always closed and focus returned to the first window
    handle, even when scraping raises.
    """
    # Local import: the file only imports NoSuchElementException at the top.
    from selenium.common.exceptions import WebDriverException

    more_xpath = ".//div[contains(@class,'_2f_ruteS _1bona3Pu _2uD5bLZZ')]/div[2]/div/span[1]"
    text_xpath = ".//div[contains(@class,'_2f_ruteS _1bona3Pu')]/div/q/span"

    try:
        # Find and click the More link (to load all reviews). find_elements
        # returns an empty list instead of raising when the link is absent.
        expand_links = driver.find_elements_by_xpath("//span[@class='_33O9dg0j']")
        if expand_links:
            expand_links[0].click()
        time.sleep(20)

        reviews = driver.find_elements_by_xpath("//div[@data-test-target='reviews-tab']/div")
        reviews_count = len(reviews)
        print(reviews_count)

        # Loop through the reviews found
        for container in reviews[2:]:
            try:
                # Expand this review's own "More" link, if it has one.
                more_links = container.find_elements_by_xpath(more_xpath)
                if more_links:
                    more_links[0].click()
                    time.sleep(20)

                # Reset per review so text from the previous row is never reused.
                review = ""
                text_spans = container.find_elements_by_xpath(text_xpath)
                if text_spans:
                    review = text_spans[0].text
                    print(review)

                date = container.find_element_by_xpath(".//span[contains(@class,'_34Xs-BQm')]").text
                print(date)
                title = container.find_element_by_xpath(".//div[contains(@class,'glasR4aX')]/a/span").text
                print(title)
            except WebDriverException as exc:
                # Not a review container, or the click/lookup failed: move on
                # to the next container rather than dropping the rest of the page.
                print("Skipping review container:", type(exc).__name__)
                continue

            # Save to CSV
            csvWriter.writerow((date, title, review))
    finally:
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
def getHotelPages(url):
    """Open *url* and queue the pagination links found on it for scraping.

    Loads the page in the shared ``driver``, maximizes the window, prints the
    number of pagination anchors found and each anchor's ``href``, and appends
    every new, non-empty ``href`` to the module-level ``URLs`` list.

    NOTE(review): only anchors present in the DOM when the page is loaded are
    collected. If the pagination bar abbreviates long ranges, pages without a
    rendered link are never queued - compare the printed count with the real
    number of pages.

    Args:
        url: Address of the hotel's first review page.
    """
    driver.get(url)
    # to maximize the driver
    driver.maximize_window()
    nextPage = driver.find_elements_by_xpath("//a[contains(@class,'pageNum cx_brand_refresh_phase2 ')]")
    noOfPages = len(nextPage)
    print(noOfPages)
    for link in nextPage:
        # Read the attribute once instead of once for printing and once for storing.
        href = link.get_attribute("href")
        print(href)
        # Skip anchors without an href and pages that are already queued, so
        # the caller never navigates to None or scrapes the same page twice.
        if href and href not in URLs:
            URLs.append(href)
# Pages to scrape. Starts with the hotel's first review page; getHotelPages()
# appends the pagination links it finds.
URLs = [
    'https://www.tripadvisor.com/Hotel_Review-g304141-d3895228-Reviews-The_Hideout_Sigiriya-Sigiriya_Central_Province.html#REVIEWS']

try:
    # Prepare CSV file. The `with` block closes it even if scraping fails.
    # `csvWriter` stays a module-level name because getHotelReviews() uses it.
    with open("hideoutSigiriyab_reviews1.csv", "w", newline='', encoding="utf-8") as csvFile:
        csvWriter = csv.writer(csvFile)
        csvWriter.writerow(['Date', 'Title', 'Review'])

        try:
            getHotelPages(URLs[0])
        except Exception as exc:
            # Best effort: carry on with whatever URLs were collected, but
            # report the cause instead of hiding it.
            print("Error!!", exc)

        # NOTE(review): the purpose of this 60 s pause is not evident from the
        # code - confirm it is still needed.
        time.sleep(60)

        for url in URLs:
            # Scrape each page in a fresh tab; getHotelReviews() closes the tab
            # and switches back to the first window when it is done.
            driver.execute_script("window.open('');")
            driver.switch_to.window(driver.window_handles[1])
            driver.get(url)
            getHotelReviews()
            time.sleep(20)
finally:
    # quit() ends the whole browser session; close() only closes the current
    # window and was skipped entirely when an error occurred.
    driver.quit()
Can you help me by suggesting a method, or working code, to scrape the reviews from all the pages of a hotel?
Upvotes: 0
Views: 1860
Reputation: 9969
A simple way to click through pages 1-36:
# Read the highest page number from the last link in the pagination bar.
size = int(driver.find_element_by_css_selector('div.pageNumbers >a:nth-last-child(1)').text)
# Page 1 is already displayed, so click pages 2..size. The upper bound is
# size + 1 because range() excludes its end value; range(2, size) never
# reached the last page.
for i in range(2, size + 1):
    # Look the pagination bar up again on every pass and wait for it to be present.
    pageNums = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.pageNumbers")))
    # The leading "." keeps the search inside the pagination bar; an XPath
    # starting with "//" searches the whole document even when called on an element.
    pageNums.find_element_by_xpath(".//a[text()='{}']".format(i)).click()
    time.sleep(5)
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Upvotes: 2