Reputation: 71
Here are the museum's reviews I would like to scrape: url
Here's my script so far :
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import time

# Path to the ChromeDriver binary (must match the installed Chrome version).
PATH = "chromedriver.exe"

# --- Browser setup: reduce the obvious automation fingerprints so Google
# serves the normal page instead of a bot challenge.
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("start-maximized")
# 'enable-logging' only has an effect as an *excluded switch* (it silences
# the "DevTools listening" console noise); add_argument('enable-logging')
# in the original was a no-op.
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(options=options, executable_path=PATH)
# Hide navigator.webdriver and spoof a regular desktop Chrome user agent.
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})

url = 'https://www.google.com/maps/place/Mus%C3%A9e+Gr%C3%A9vin/@48.8718378,2.3400264,17z/data=!3m1!4b1!4m5!3m4!1s0x47e66e3e9be04a55:0x7def1a3ff98df458!8m2!3d48.8718157!4d2.3422113'
driver.get(url)

# Open the reviews panel. find_element_by_xpath() was removed in Selenium 4;
# use the find_element(By.XPATH, ...) form instead.
driver.find_element(By.XPATH, '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/span[1]/span/span/span[2]/span[1]/button').click()

# Give the reviews panel time to render before reading the page source.
time.sleep(3)

soup = BeautifulSoup(driver.page_source, 'html.parser')
# Each review's text lives in a <span class="wiI7pd"> element.
review = [container.text.strip() for container in soup.find_all('span', class_='wiI7pd')]

data = pd.DataFrame({
'reviews' : review
})
# utf-8-sig BOM keeps accented characters readable when opened in Excel.
data.to_csv("test_map.csv", sep=';', index=False, encoding='utf-8-sig')
driver.quit()
I read in this post that you don't need to scroll because all the web elements are already in the HTML source: How to scroll down google maps using selenium python
But my output contained only 10 reviews.
So I would like to scroll the left panel, but I can't figure out how to do so. I have tried many approaches but could not make any of them work.
Like this one :
# Non-working attempt kept for reference: window.scrollBy() scrolls the
# *page*, but the reviews live inside an inner scrollable panel, so the
# window itself never moves and no new reviews are loaded.
review_box= lambda: self.browser.find_element_by_xpath("xpath to div")
review_box().click()
for i in range(19000):
driver.execute_script("window.scrollBy(0, 250)")
time.sleep(1)
Usually scrollBy has always worked for me, but here it doesn't.
EDIT: This is what worked for me in the end:
from selenium.webdriver.common.by import By

SCROLL_PAUSE_TIME = 5

# XPath of the scrollable left-hand reviews panel; hoisted so the two
# lookups below cannot drift apart.
PANEL_XPATH = '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]'

# Scroll the panel until its scrollHeight stops growing, i.e. until no
# more reviews are lazily loaded.
last_height = driver.execute_script("return document.body.scrollHeight")
number = 0
while True:
    number = number + 1
    # Re-locate the panel on every pass: Maps re-renders the DOM while
    # loading, so a cached element reference can go stale.
    # (find_element_by_xpath was removed in Selenium 4 — use find_element.)
    ele = driver.find_element(By.XPATH, PANEL_XPATH)
    # Scroll the *element*, not the window.
    driver.execute_script('arguments[0].scrollBy(0, 5000);', ele)
    # Wait for the newly revealed reviews to load.
    time.sleep(SCROLL_PAUSE_TIME)
    # Compare the panel's new scroll height with the previous one.
    print(f'last height: {last_height}')
    ele = driver.find_element(By.XPATH, PANEL_XPATH)
    new_height = driver.execute_script("return arguments[0].scrollHeight", ele)
    print(f'new height: {new_height}')
    if new_height == last_height:
        break
    print('cont')
    last_height = new_height
Upvotes: 0
Views: 1224
Reputation: 120
You have to scroll the element, not the window. I did it like this:
// Execute a JavaScript snippet against the current driver, exposing
// *object* to the script as arguments[0], and return the script's result.
public Object execJavaScript(String jsValue, Object object) {
JavascriptExecutor js = (JavascriptExecutor) getDriver();
return js.executeScript(jsValue, object);
}
// Scroll the #channels element itself (not the window) in 200px steps
// until its full scrollable height has been covered.
private void scrollThroughChannels() {
WebElement channels =selenium.findbyCssSelector("div#channels");
// Total scrollable height of the element, read once up front.
long heigh = (long) selenium.execJavaScript("return arguments[0].scrollHeight", channels);
for(int x = 0; x < heigh; x = x + 200) {
// Set the element's own scroll offset rather than scrolling the page.
execJavaScript("arguments[0].scrollTop = "+x, channels);
....
}
}
Upvotes: 1