Reputation: 23
I was trying to scrape data from a web page. This is the LINK. I have successfully filled in the selection form with Selenium using this data and clicked the button marked with a red line (shown in the picture).
After that, I got another page like this.
I also successfully clicked the first link in the list! But then comes the problem: the next page is generated, and I tried to scrape the company name, address, and email, but the page data can't be fetched with Selenium!
The page looks like this, and I am trying to scrape the data marked in red.
Here is my code. Can anyone please tell me what my mistake was?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import csv
import time
# Scrape company contact details from the FFB directory: fill in the search
# form, open every result row in turn, print its details, then go back to the
# result list.

# Raw string: in a normal string literal "\P" and "\c" are invalid escape
# sequences (they only work by accident and trigger a warning).
PATH = r"C:\Program Files (x86)\chromedriver.exe"
URL = 'https://www.outils.ffbatiment.fr/federation-francaise-du-batiment/laffb/annuaire.html'

# Every element id used below shares this prefix.
ID_PREFIX = "ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_"
GRID_BODY = (By.XPATH, f'//*[@id="{ID_PREFIX}RadGrid1_ctl00"]/tbody')

# Locators for the fields on the detail page.
COMPANY = (By.ID, ID_PREFIX + "FormView1_Ent_NomLabel")
ADDRESS = (By.XPATH, f'//*[@id="{ID_PREFIX}PanelDetails"]/div[1]/div[2]/div/div[1]/p[3]')
POSTAL = (By.ID, ID_PREFIX + "FormView1_Ent_CpLabel")
CITY = (By.ID, ID_PREFIX + "FormView1_Ent_VilleLabel")
PHONE = (By.ID, ID_PREFIX + "FormView1_Ent_TelHyperLink")
EMAIL = (By.ID, ID_PREFIX + "FormView1_Ent_EmailHyperLink")
BACK_BUTTON = (By.ID, ID_PREFIX + "FormView1_LButtonRetour")

# (dropdown id suffix, option value) pairs, selected in this order.
FORM_CHOICES = (
    ("DDL_SectAct", "1"),
    ("DDL_Act", "704"),
    ("DDL_Departement", "85"),
)

driver = webdriver.Chrome(PATH)
# Explicit waits poll until a condition holds. `implicitly_wait` is not a
# sleep: it only sets a lookup timeout, so calling it between steps did nothing.
wait = WebDriverWait(driver, 10)


def _text_of(locator):
    """Return the stripped text of the first element matching *locator*.

    Looks the element up on the driver (never on a previously found row,
    which goes stale after navigation). Returns '' when nothing matches.
    """
    matches = driver.find_elements(*locator)
    return matches[0].text.strip() if matches else ""


try:
    driver.get(URL)

    for suffix, value in FORM_CHOICES:
        select_id = ID_PREFIX + suffix
        # Wait for the wanted option itself, in case the options are only
        # filled in after the previous dropdown was changed.
        wait.until(EC.presence_of_element_located(
            (By.XPATH, f'//*[@id="{select_id}"]//option[@value="{value}"]')))
        Select(driver.find_element(By.ID, select_id)).select_by_value(value)

    wait.until(EC.element_to_be_clickable((By.ID, ID_PREFIX + "Button1"))).click()

    grid_body = wait.until(EC.presence_of_element_located(GRID_BODY))
    row_count = len(grid_body.find_elements(By.TAG_NAME, "tr"))
    print("size ", row_count)

    # One pass over the row indices. Row elements are never kept across a
    # navigation; each link is looked up again by its row id.
    for i in range(row_count):
        wait.until(EC.presence_of_element_located(GRID_BODY))
        links = driver.find_elements(
            By.XPATH, f'//*[@id="{ID_PREFIX}RadGrid1_ctl00__{i}"]/td[1]/a')
        if not links:
            # This <tr> has no detail link under the expected row id.
            continue
        links[0].send_keys(Keys.RETURN)

        # The detail page is ready once the company name is visible.
        wait.until(EC.visibility_of_element_located(COMPANY))
        company = _text_of(COMPANY)
        address = _text_of(ADDRESS)
        postal = _text_of(POSTAL)
        city = _text_of(CITY)
        phone = _text_of(PHONE)
        email = _text_of(EMAIL)
        # Print the text, not the WebElement objects.
        print(company, " ", address, " ", postal, " ", city, " ", phone, " ", email)

        # Wait for a single element: find_elements returns a list, which has
        # no send_keys method.
        wait.until(EC.element_to_be_clickable(BACK_BUTTON)).send_keys(Keys.RETURN)

    # Pause before closing the browser, as the original script did.
    time.sleep(10)
finally:
    # Close the browser even if a step above raised.
    driver.quit()
Upvotes: 0
Views: 108
Reputation: 9969
# Assumes `driver` is an already-created WebDriver; it is not defined in
# this snippet.
wait = WebDriverWait(driver, 20)
driver.get('https://www.outils.ffbatiment.fr/federation-francaise-du-batiment/laffb/annuaire.html')

# Click the cookie banner's accept button if it appears (best effort).
try:
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div#cookiescript_accept"))).click()
except Exception:
    # Not a bare `except:`, so KeyboardInterrupt/SystemExit still propagate.
    # A missing or unclickable banner is not an error here; carry on.
    pass

# Fill in the three search dropdowns.
sector_select = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_DDL_SectAct"]'))
sector_select.select_by_value('1')
activity_select = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_DDL_Act"]'))
activity_select.select_by_value('704')
department_select = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_DDL_Departement"]'))
department_select.select_by_value('85')

driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_Button1"]').click()

# Wait for the results grid instead of looking it up immediately after the click.
grid_body = wait.until(EC.presence_of_element_located(
    (By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_RadGrid1_ctl00"]/tbody')))
rows = grid_body.find_elements(By.TAG_NAME, "tr")
print("size ", len(rows))

# XPath positions are 1-based, hence range(1, n + 1).
for i in range(1, len(rows) + 1):
    # Look the link up again on every pass: the page is reloaded at the end
    # of each iteration, so earlier element references would be stale.
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, f"(//tr[@class='rgRow']//a)[{i}]"))).click()
    address_parts = [
        part.text
        for part in driver.find_elements(
            By.XPATH,
            '//*[starts-with(@id,"ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_FormView1_Ent_AdresseLabel")]',
        )
    ]
    # `.text` is already a str, so no map(str, ...) is needed. The parts are
    # joined with no separator, exactly as before.
    address = ''.join(address_parts)
    print(address)
    # Return to the result list and reload it before the next row.
    driver.back()
    driver.refresh()
I didn't do them all, but this is a good start for you to figure out how to handle elements when going through pages. Remember to handle the case where an element can't be found.
Outputs:
size 20
Z.A. Sud42, rue du Commerce
ZA LES NOUETTES
RUE DES FRERESZA DE BRECHARD
ZI des Plesses118 rue des PlessesCHATEAU D OLONNE
11 IMPASSE DU VIGNAUDLA VERRIE
2 RUE JOSEPH CUGNOTZONE DES ROCHES
Z.A. Sud42, rue du Commerce
704 ROUTE DES ABOIRESLE MOTTEAU
60 avenue Villebois MareuilMONTAIGU
PARC D'ACTIVITE DE LA BLOIRE6 RUE FRANCOIS MANSART
PARC D'ACTIVITE DE LA BLOIRE6 RUE FRANCOIS MANSART
704 ROUTE DES ABOIRESLE MOTTEAU
9 RUE DE LA CHAPELLEFONTAINES
ZA la promenade200 allée du pré chacun
ZA DU CHARFAIT
RUE DU MOULIN GROSMONTAIGU
3 ZA DES CINQ MOULINS
ROUTE DU COMMERCE
ROUTE DE ST GILLESZA ESPACE OCEANE
Upvotes: 1