Reputation: 18705
I'm trying to scrape some data from a flight search page.
This page works this way:
You fill in a form and then click the search button - this part works. When you click the button, you are redirected to the results page, and this is where the problem is. The page keeps adding results continuously, for example for one minute, which is not a big deal in itself - the problem is getting all of these results. In a real browser you have to scroll down the page for these results to appear, so I've tried to scroll down using Selenium. It probably scrolls to the bottom of the page so fast (or jumps instead of scrolling) that the page doesn't load any new results.
When you scroll down slowly, it loads more results, but if you do it very quickly, it stops loading.
I'm not sure if my code helps to understand that so I'm attaching it.
# NOTE(review): this listing has lost its indentation; the structure described
# in the comments below is inferred from the syntax, not from columns.
# URL template that prepare_get fills in with %-formatting ("URL" is a placeholder here).
SEARCH_STRING = """URL"""
# Scraper that loads a search-results page in a Firefox WebDriver session and
# then parses the flight boxes out of the returned HTML.
class spider():
# Start the Firefox WebDriver session that find_flights_html uses.
def __init__(self):
self.driver = webdriver.Firefox()
# Build the search URL by %-formatting SEARCH_STRING with six values, in the order
# dep_airport, arr_airport, arr_airport, dep_airport, dep_date, arr_date.
@staticmethod
def prepare_get(dep_airport,arr_airport,dep_date,arr_date):
string = SEARCH_STRING%(dep_airport,arr_airport,arr_airport,dep_airport,dep_date,arr_date)
return string
# Open the results page, wait for the two loading indicators to disappear,
# jump to the bottom of the page and return the page source.
def find_flights_html(self,dep_airport, arr_airport, dep_date, arr_date):
# A list of departure airports is joined into one string with the literal '%20'.
if isinstance(dep_airport, list):
airports_string = str(r'%20').join(dep_airport)
dep_airport = airports_string
wait = WebDriverWait(self.driver, 60) # wait for results
self.driver.get(spider.prepare_get(dep_airport, arr_airport, dep_date, arr_date))
# Wait (up to the 60 s configured above) until the <img> whose src contains "loading" is no longer visible.
wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[contains(@src, "loading")]')))
# Then wait until the <img> preceding the div with the Slovak "still searching for more flights" message is no longer visible.
wait.until(EC.invisibility_of_element_located((By.XPATH, u'//div[. = "Poprosíme o trpezlivosť, hľadáme pre Vás ešte viac letov"]/preceding-sibling::img')))
# Go to the bottom in a single jump: first via JavaScript scrollTo(document.body.scrollHeight),
# then by sending Ctrl+End to <body>. There is no incremental scrolling here.
self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
self.driver.find_element_by_xpath('//body').send_keys(Keys.CONTROL+Keys.END)
return self.driver.page_source
# Print the departure and arrival date/airport found in one result div.
# Nothing is returned; the output goes to stdout only.
@staticmethod
def get_info_from_borderbox(div):
# NOTE(review): this `arrival` value is overwritten further down before it is ever read.
arrival = div.find('div',class_='departure').text
# NOTE(review): `price` is looked up here but not used anywhere else in this method.
price = div.find('div',class_='pricebox').find('div',class_=re.compile('price'))
# Children of the second div with class 'departure'; the fixed indexes 1 and 5
# presumably match the page's markup - verify against the real HTML.
departure = div.find_all('div',class_='departure')[1].contents
date_departure = departure[1].text
airport_departure = departure[5].text
# Children of the first div with class 'arrival'; the airport text drops its first character.
arrival = div.find_all('div', class_= 'arrival')[0].contents
date_arrival = arrival[1].text
airport_arrival = arrival[3].text[1:]
print 'DEPARTURE: '
print date_departure,airport_departure
print 'ARRIVAL: '
print date_arrival,airport_arrival
# Parse the results HTML, print the details of every flight box and then the number of boxes.
# There is no return statement in this method.
@staticmethod
def get_flights_from_result_page(html):
# True for a <div> whose class attribute contains every name in `classes`.
def match_tag(tag, classes):
return (tag.name == 'div'
and 'class' in tag.attrs
and all([c in tag['class'] for c in classes]))
# mLib.getSoup_html is defined elsewhere; presumably it returns a BeautifulSoup object - TODO confirm.
soup = mLib.getSoup_html(html)
divs = soup.find_all(lambda t: match_tag(t, ['borderbox', 'flightbox', 'p2']))
for div in divs:
spider.get_info_from_borderbox(div)
print len(divs)
# Driver code: fetch the results page for three departure airports and parse it.
spider_inst = spider()
# NOTE(review): get_flights_from_result_page has no return statement, so this prints None after the parsed output.
print spider.get_flights_from_result_page(spider_inst.find_flights_html(['BTS','BRU','PAR'], 'MAD', '2015-07-15', '2015-08-15'))
So the main problem is in my opinion that it scrolls too fast to trigger new loading of the results.
Have you any idea how to make it work?
Upvotes: 12
Views: 28364
Reputation: 1
Found a very easy (maybe too easy) solution for my project:
# JavaScript that brings the element passed as its first argument into the viewport.
scroll_into_view_js = "arguments[0].scrollIntoView();"

# Every anchor on the page that carries an href attribute.
links = driver.find_elements("xpath", "//a[@href]")
for link in links:
    # Scroll the current link into view before moving on to the next one.
    driver.execute_script(scroll_into_view_js, link)
Insert this into the loop you use to get through the page (link in links in my code), and it will scroll the current div into view as you loop on down.
Upvotes: 0
Reputation: 161
I needed it for the same issue: I needed to scrape a social media website.
# Scroll down in 50 steps of 1000 pixels, pausing one second after each step.
y = 1000
for _ in range(50):
    driver.execute_script("window.scrollTo(0, %d)" % y)
    # Pause so the page can react to the new scroll position.
    time.sleep(1)
    y += 1000
The sleep after every 1000 pixels is to allow the new content to load.
Upvotes: 12
Reputation: 636
After some experiments, finally I found a good solution:
def __scroll_down_page(self, speed=8):
    """Scroll the page down in small steps until the bottom has been passed.

    The scroll offset grows by ``speed`` pixels per step. After every step the
    current ``document.body.scrollHeight`` is read again, so the loop keeps
    going if the page grows while it is being scrolled.

    Args:
        speed: Number of pixels to advance per step; must be positive.

    Raises:
        ValueError: If ``speed`` is zero or negative (the loop would
            otherwise never terminate).
    """
    if speed <= 0:
        raise ValueError("speed must be a positive number of pixels")

    # Start with a height of 1 so the loop body runs at least once.
    current_scroll_position, new_height = 0, 1
    while current_scroll_position <= new_height:
        current_scroll_position += speed
        self.__driver.execute_script(
            "window.scrollTo(0, {});".format(current_scroll_position))
        # Re-read the height: it may have changed since the previous step.
        new_height = self.__driver.execute_script(
            "return document.body.scrollHeight")
Upvotes: 11
Reputation: 169
time.sleep() makes the program slower, which is not good for production.
This is a more efficient and controlled way to scroll down to the bottom.
Use the function below, which I wrote.
If you increase the increment (5 in the code below), the scroll speed becomes higher, and vice versa (warning: the increment should not be more than new_height).
def pageBottom(driver, step=5):
    """Scroll ``driver``'s page to the bottom in fixed-size steps.

    Before every step the current ``document.body.scrollHeight`` is read, the
    window is scrolled to the current offset, and the loop stops once that
    offset is greater than the height just read.

    Args:
        driver: Object exposing ``execute_script(script)``, e.g. a Selenium
            WebDriver.
        step: Pixels to advance per iteration (default 5). A larger value
            scrolls faster.

    Raises:
        ValueError: If ``step`` is zero or negative (the loop would otherwise
            never terminate).
    """
    if step <= 0:
        raise ValueError("step must be a positive number of pixels")

    a = 0
    while True:
        # Read the height on every pass because it may change while scrolling.
        new_height = driver.execute_script("return document.body.scrollHeight")
        driver.execute_script(f"window.scrollTo(0, {a});")
        if a > new_height:
            break
        a += step
Example Usage:
# Location of the ChromeDriver executable and the page used for the demo.
chromedriver_path = "chromedriver.exe"
tutorial_url = "https://docs.scrapy.org/en/latest/intro/tutorial.html"

service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)
driver.get(tutorial_url)
# Scroll the loaded page down to the bottom.
pageBottom(driver)
Upvotes: 3
Reputation: 49
Assuming the page is lazy-loaded, jumping iteratively to fixed points with window.scrollTo() could skip elements that fall outside the visited range. Also, if the height of the page changes as content is loaded, items that haven't been loaded yet could be left out. So I opted for window.scrollBy() instead:
# Pixels advanced per step.
scroll_step = 20
# Total distance requested so far.
scrolled = 0
height = driver.execute_script("return document.body.scrollHeight")
# A while loop re-checks `height` on every pass. A `for ... in range(height)`
# loop fixes its bound once, so a later change to `height` would be ignored.
while scrolled < height:
    driver.execute_script('window.scrollBy(0,{})'.format(scroll_step))  # scroll by 20 on each iteration
    scrolled += scroll_step
    # Pick up the new height after scroll-triggered elements have been loaded.
    height = driver.execute_script("return document.body.scrollHeight")
In my case, a new batch of elements was loaded after scrolling to the 30th, giving the page a new height I needed to account for.
Upvotes: 1
Reputation: 688
/**
 * Looks up the submit button by name, scrolls the window down gradually in
 * 7-pixel steps (offsets 0 up to, but not including, 800) and then clicks
 * the button, logging a line before and after the click.
 */
@Test
public void clickMeButton() {
    final WebElement submitButton = driver.findElement(By.name("et_builder_submit_button"));

    // Scroll gently towards the element group instead of jumping there.
    final JavascriptExecutor scroller = (JavascriptExecutor) driver;
    int offset = 0;
    while (offset < 800) {
        final String script = "window.scrollTo(0, " + offset + ")";
        scroller.executeScript(script);
        offset += 7;
    }

    System.out.println("selected button");
    submitButton.click();
    System.out.println("clicked");
}
Upvotes: 1
Reputation: 21201
In Python Selenium, get Y position of your element, and then slowly scroll down.
# getBoundingClientRect() is relative to the viewport, while window.scrollTo()
# takes document coordinates. Adding window.scrollY converts the element's
# position to a document offset, so the result is also correct when the page
# has already been scrolled.
y = driver.execute_script("return document.querySelector('YOUR-CSS-SELECTOR').getBoundingClientRect()['y'] + window.scrollY")
# Walk down from the top of the document towards the element in 100-pixel steps.
for x in range(0, int(y), 100):
    driver.execute_script("window.scrollTo(0, "+str(x)+");")
Upvotes: 0
Reputation: 19
from selenium import webdriver
import time
# Open the page in Chrome.
driver = webdriver.Chrome()
driver.get("https://en.wikipedia.org")
# Use the same `driver` object for the scripts below; the name `browser` was
# never defined and would raise NameError.
height = driver.execute_script("return document.body.scrollHeight")
# Scroll down in 100-pixel steps with a short pause after each one.
for scrol in range(100, height, 100):
    driver.execute_script(f"window.scrollTo(0,{scrol})")
    time.sleep(0.1)
It worked for me. If you want to scroll the page to the end so that all page elements become visible, this may be valuable for you. If you want to increase the scroll speed, just change 100 to 200.
Upvotes: 0
Reputation: 13047
You can do smooth scrolling using Selenium like below:
# Height of the document as reported by the browser, converted to an int.
raw_height = driver.execute_script("return document.body.scrollHeight")
total_height = int(raw_height)

# Builds the scrollTo script for a given vertical offset.
scroll_to = "window.scrollTo(0, {});".format

# Visit every fifth pixel offset from 1 up to (not including) the total height.
for offset in range(1, total_height, 5):
    driver.execute_script(scroll_to(offset))
Upvotes: 8
Reputation: 473763
Here is a different approach that worked for me involving scrolling into view of the last search result and waiting for additional elements to load before scrolling again:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class wait_for_more_than_n_elements(object):
    """Expected condition that holds once MORE than ``count`` elements match.

    Intended for ``WebDriverWait.until``: calling the instance with a driver
    returns True when the number of elements found for ``locator`` is strictly
    greater than ``count``, and False otherwise.

    Args:
        locator: ``(by, value)`` tuple, e.g. ``(By.CSS_SELECTOR, 'div.x')``.
        count: Number of matching elements that must be exceeded.
    """

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            # Use the public driver API instead of the private
            # expected_conditions helper.
            found = driver.find_elements(*self.locator)
        except StaleElementReferenceException:
            # Treat a stale lookup as "not yet" so the wait polls again.
            return False
        # Strictly greater: an equal count means nothing new has loaded yet.
        return len(found) > self.count
driver = webdriver.Firefox()

# Search parameters.
dep_airport = ['BTS', 'BRU', 'PAR']
arr_airport = 'MAD'
dep_date = '2015-07-15'
arr_date = '2015-08-15'

# Several departure airports are joined with the literal '%20' for the URL.
airports_string = str(r'%20').join(dep_airport)
dep_airport = airports_string

url = "https://www.pelikan.sk/sk/flights/list?dfc=C%s&dtc=C%s&rfc=C%s&rtc=C%s&dd=%s&rd=%s&px=1000&ns=0&prc=&rng=1&rbd=0&ct=0" % (dep_airport, arr_airport, arr_airport, dep_airport, dep_date, arr_date)
driver.maximize_window()
driver.get(url)

# Wait up to 60 seconds for each loading indicator to disappear.
wait = WebDriverWait(driver, 60)
wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[contains(@src, "loading")]')))
wait.until(EC.invisibility_of_element_located((By.XPATH,
                                               u'//div[. = "Poprosíme o trpezlivosť, hľadáme pre Vás ešte viac letov"]/preceding-sibling::img')))

results_locator = (By.CSS_SELECTOR, 'div.flightbox')
while True:
    results = driver.find_elements(*results_locator)
    if not results:
        # Nothing to scroll to; results[-1] would raise IndexError.
        break
    print("Results count: %d" % len(results))

    # scroll to the last element
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])

    # wait for more results to load
    try:
        wait.until(wait_for_more_than_n_elements(results_locator, len(results)))
    except TimeoutException:
        # wait.until raises TimeoutException when the condition is not met
        # within the timeout; treat that as "no more results" and stop.
        break
Notes:
- the stop condition for the loop could be, for example, reaching a particular len(results) value
- wait_for_more_than_n_elements is a custom Expected Condition which helps to identify when the next portion is loaded and we can scroll again
Upvotes: 2