python webscraping with selenium repeating certain process

Question

I am trying to scrape the contact data of companies from this website:

https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=4

I can do this with the following code:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Scrape the contact data of two companies from result page 1 of the search.
# NOTE: the table rows (tr[2], tr[3]) and the absolute XPaths are hard-coded,
# so the click / read / back sequence is written out once per company.
company_list= [] # collects the contact text of every visited company

driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe') # start Chrome, passing the path of the local chromedriver

driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1') # open result page 1 (p=1) of the company search

driver.find_element_by_id("cookiesNotificationConfirm").click(); # accept the cookie notification

driver.find_element_by_xpath("//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[2]/td[1]/a").click(); # open the first company: name link in table row tr[2]

contact_data = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div"))) # wait up to 20 s until the contact elements of the opened company are visible

for cn in contact_data:
    company_list.append(cn.text) # store the visible text of each contact element

driver.back() # return to the result list

time.sleep(5) # fixed pause: give the website's pop-up time to appear

driver.find_element_by_xpath("/html/body/div[15]/div[3]/div[3]/div[1]/button[1]").click(), # deny the pop-up (the trailing comma only wraps the result in a tuple and has no effect)

time.sleep(5) # fixed pause: give the pop-up time to vanish

driver.find_element_by_xpath("//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[3]/td[1]/a").click(); # open the next company: name link in table row tr[3]

contact_data2 = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div"))) # same wait as above, now for the second company's contact elements

for cn in contact_data2:
    company_list.append(cn.text) # store the visible text of each contact element

print(company_list) # show everything collected so far

My Output is this:

['GUTex GmbH
Gerhard-Unland-Str. 1
26683
Saterland
Deutschland', 'Robert Bosch GmbH
Robert-Bosch-Platz 1
70839
Gerlingen
Deutschland']

Problem:

I want my code to do this for the whole list on page 1, then go on to the next page and do it again. This should continue until I have, for example, 100 addresses in the list. I would do this with a "while loop", but my XPaths for finding the address are too specific, so it would always loop over the same companies.

Thanks a lot in advance.

Dilip Meghwal · Accepted Answer

Try the code below to extract the data from one page. Extend it to iterate over the records on the following pages.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Scrape the contact box of every company listed on one result page.
#
# For each company link in the result table: open the company page, read the
# contact box, store "name : contact data" and navigate back to the list.

SEARCH_URL = 'https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'
# One locator is used for waiting, counting and indexing, so the loop bound
# always matches the list that is indexed.
COMPANY_LINK_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'

company_list = []  # one "name : contact data" string per company

driver = webdriver.Chrome()  # start the browser session

try:
    wait = WebDriverWait(driver, 20)  # every explicit wait below times out after 20 s

    driver.get(SEARCH_URL)  # open the result page

    # The cookie banner is not always shown; find_elements returns an empty
    # list instead of raising when it is absent.
    cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
    if cookie_buttons:
        cookie_buttons[0].click()  # accept cookies

    wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINK_XPATH)))
    link_count = len(driver.find_elements(By.XPATH, COMPANY_LINK_XPATH))

    for i in range(link_count):
        # Element references are looked up again on every pass because the
        # browser navigates away from and back to the list in each iteration.
        # NOTE(review): the script in the question dismisses a pop-up after
        # driver.back(); if it covers the table here, close it before clicking.
        wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINK_XPATH)))
        links = driver.find_elements(By.XPATH, COMPANY_LINK_XPATH)
        if i >= len(links):
            break  # the list got shorter after navigating back; nothing left to visit

        company_name = links[i].text
        links[i].click()  # open the company page

        # Read the text from the very element the wait returned.
        contact_box = wait.until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH)))
        contact_data = driver.execute_script("return arguments[0].innerText", contact_box)

        company_list.append(company_name + " : " + contact_data)
        driver.back()  # return to the result list
finally:
    driver.quit()  # always close the browser, even when a wait times out

print(company_list)

python webscraping with selenium repeating certain process

Answers (2)

Related Questions