Reputation: 779
I am trying to scrape multiple pages with Selenium, but my code only scrapes the first page. What mistake am I making, and is there a way to fix it? This is the page link: https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina=1
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
# Chrome launch options shared by the whole script.
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
# Chrome parses this switch as WIDTH,HEIGHT (comma-separated); the
# "1920x1080" spelling is not a valid size value for it.
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-extensions")

# Module-level browser session; webdriver_manager resolves the chromedriver
# binary path that is handed to the Selenium 4 Service object.
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
# Scraper as posted in the question: loads a search-results page, visits each
# lawyer profile linked from it, and collects title / advocaten / address /
# email / website into a list of dicts that is finally printed as a DataFrame.
# NOTE(review): the paste lost its indentation, so the exact nesting of the
# loops, of `data.append(wev)` and of the pagination check cannot be
# recovered from this text; the comments below describe statement order only.
def supplyvan_scraper():
# The module-level `chrome_driver` is used as a context manager here.
with chrome_driver as driver:
driver.implicitly_wait(15)
URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?qvrtqca=&filters%5Brechtsgebieden%5D=%5B%5D&ypb=&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D=%7B%22lat%22%3A%2252.132633%22%2C%22lng%22%3A%225.291266%22%7D&locatie%5Bstraal%5D=56&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Bhash%5D='
driver.get(URL)
time.sleep(3)
page=1
# The profile links are read once, from the page loaded above; no later
# statement in this function reads them again.
page_links = [element.get_attribute('href') for element in
driver.find_elements(By.XPATH, "//span[@class='h4 no-margin-bottom']//a")]
data=[]
for link in page_links:
wev={}
driver.get(link)
time.sleep(2)
# Each lookup below swallows every exception and then assigns the variable
# unconditionally: after a failed lookup the name is either unbound
# (NameError) or still holds the value from a previous profile.
try:
title = driver.find_element(By.CSS_SELECTOR, '.title h3').text
except:
pass
wev['title']=title
try:
advocaten=driver.find_element(By.CSS_SELECTOR,".secondary").text
except:
pass
wev['advocaten']=advocaten
details=driver.find_elements(By.XPATH,"//section[@class='lawyer-info']")
for detail in details:
# NOTE(review): `find_element_by_xpath` is the legacy Selenium 3 spelling
# (the rest of the script uses `find_element(By..., ...)`); on releases
# where it no longer exists the AttributeError is hidden by the bare except.
# The XPath expressions in this loop start with `//`, which selects from the
# document root rather than from `detail`.
try:
address=detail.find_element_by_xpath("//div[@class='column medium-6']").text.strip()
except:
pass
wev['address']=address
try:
email=detail.find_element(By.XPATH, "//div[@class='row'][3]//div[@class='column small-9']//a").get_attribute('href')
except:
pass
wev['email']=email
try:
website=detail.find_element(By.XPATH, "//div[@class='row'][4]//div[@class='column small-9']//a").get_attribute('href')
except:
pass
wev['website']=website
data.append(wev)
# Pagination attempt: when a "next" button is present, load results page
# number `page` and bump the counter; stop once the counter exceeds 5 or when
# no "next" button is found. Because `page` starts at 1 and is formatted into
# the URL before being incremented, the first URL requested here is pagina=1,
# and `page_links` is not re-read after this `driver.get(url)`.
# NOTE(review): this is presumably why only one page of results is scraped;
# confirm against the original indentation.
if len(driver.find_elements_by_xpath("//a[@class='button next']")) > 0:
url = "https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={}".format(page)
driver.get(url)
page += 1
if int(page)>5:
break
else:
break
# Tabulate whatever was collected and print it.
df=pd.DataFrame(data)
print(df)
Upvotes: 1
Views: 414
Reputation: 16187
You can handle the pagination by putting the page number into the starting URL and iterating over it with a `for` loop,
as follows:
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
# Chrome launch options shared by the whole script.
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
# Chrome parses this switch as WIDTH,HEIGHT (comma-separated); the
# "1920x1080" spelling is not a valid size value for it.
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-extensions")

# Module-level browser session; webdriver_manager resolves the chromedriver
# binary path that is handed to the Selenium 4 Service object.
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)

# Accumulates one dict per scraped record; filled by supplyvan_scraper().
data = []
def supplyvan_scraper(max_pages=10):
    """Scrape lawyer profiles from the paginated search results into ``data``.

    For every results page from 1 to ``max_pages`` the profile links are
    collected first, then each profile is opened and one record per
    ``lawyer-info`` section is appended to the module-level ``data`` list.
    A profile without any such section contributes no record.

    Args:
        max_pages: Highest results-page number to request (inclusive).
            Defaults to 10, the range the original loop covered.

    Returns:
        None. Records are dicts with the keys ``title``, ``advocaten``,
        ``address``, ``email`` and ``website``; a field whose element is
        not found on the page is stored as ``None``.

    Note:
        The module-level ``chrome_driver`` is used as a context manager, so
        the browser session is closed when this function returns.
    """
    # Imported here so the block does not rely on a new top-of-file import.
    from selenium.common.exceptions import NoSuchElementException

    def lookup(context, by, selector, attribute=None):
        """Return the first match's text (or ``attribute``), or None if absent."""
        try:
            element = context.find_element(by, selector)
        except NoSuchElementException:
            # A missing field must not reuse the previous profile's value.
            return None
        if attribute is None:
            return element.text
        return element.get_attribute(attribute)

    URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'

    with chrome_driver as driver:
        # With a 15 s implicit wait, every lookup that finds nothing costs up
        # to 15 s before NoSuchElementException is raised.
        driver.implicitly_wait(15)

        for page in range(1, max_pages + 1):
            driver.get(URL.format(page=page))
            time.sleep(3)

            # Collect the hrefs up front: navigating to a profile invalidates
            # the element handles of the results page.
            page_links = [
                anchor.get_attribute('href')
                for anchor in driver.find_elements(
                    By.XPATH, "//span[@class='h4 no-margin-bottom']//a"
                )
            ]
            if not page_links:
                # No result links on this page: treat it as the end of the
                # result set instead of requesting further empty pages.
                break

            for link in page_links:
                driver.get(link)
                time.sleep(2)

                profile = {
                    'title': lookup(driver, By.CSS_SELECTOR, '.title h3'),
                    'advocaten': lookup(driver, By.CSS_SELECTOR, ".secondary"),
                }

                sections = driver.find_elements(
                    By.XPATH, "//section[@class='lawyer-info']"
                )
                for detail in sections:
                    # NOTE(review): these XPaths start with `//`, so they are
                    # evaluated from the document root, not relative to
                    # `detail`; kept as posted. Prefix with `.` to scope them
                    # to the section once the page markup is confirmed.
                    address = lookup(
                        detail, By.XPATH, "//div[@class='column medium-6']"
                    )
                    # A fresh dict per section avoids appending the same
                    # object to `data` more than once.
                    data.append({
                        **profile,
                        'address': address.strip() if address is not None else None,
                        'email': lookup(
                            detail,
                            By.XPATH,
                            "//div[@class='row'][3]//div[@class='column small-9']//a",
                            attribute='href',
                        ),
                        'website': lookup(
                            detail,
                            By.XPATH,
                            "//div[@class='row'][4]//div[@class='column small-9']//a",
                            attribute='href',
                        ),
                    })
# Run the scraper first: it is what fills the module-level `data` list, and
# without this call the DataFrame below would always be empty.
supplyvan_scraper()

# Tabulate the collected records and print them.
df = pd.DataFrame(data)
print(df)
You can also try:
# Alternative pagination sketch: the results URL is a template whose `pagina`
# query parameter is filled in for pages 1 through 10, and each resulting URL
# is loaded in turn. `driver` is expected to be an already-open WebDriver.
URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'

for page in range(1, 11):
    # Build this page's address, then navigate to it.
    url = URL.format(page=page)
    driver.get(url)
Upvotes: 1