Yankzz
Yankzz

Reputation: 41

python webscraping with selenium repeating certain process

I am trying to scrape the contact data of companies from this website:

https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=4

I can do this for two companies with the following code:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Scrape the address block of the first two companies on result page 1.
# Every locator below is an absolute path that matches exactly one spot on
# the page, so each company needs its own hand-written XPath.
SEARCH_URL = 'https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'
FIRST_COMPANY_LINK = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[2]/td[1]/a"
SECOND_COMPANY_LINK = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[3]/td[1]/a"
CONTACT_BOX = "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div"
POPUP_DENY_BUTTON = "/html/body/div[15]/div[3]/div[3]/div[1]/button[1]"

company_list = []  # one text entry per contact element found

driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe')
wait = WebDriverWait(driver, 20)  # shared 20 s explicit wait for the detail pages

# Open the search results and accept the cookie banner.
driver.get(SEARCH_URL)
driver.find_element_by_id("cookiesNotificationConfirm").click()

# First company: open its detail page and collect the visible contact text.
driver.find_element_by_xpath(FIRST_COMPANY_LINK).click()
first_contact = wait.until(EC.visibility_of_all_elements_located((By.XPATH, CONTACT_BOX)))
company_list.extend(box.text for box in first_contact)

# Return to the result list and dismiss the pop-up that appears there.
driver.back()
time.sleep(5)  # wait for the pop-up window to appear
driver.find_element_by_xpath(POPUP_DENY_BUTTON).click()
time.sleep(5)  # wait for the pop-up to vanish

# Second company: same steps, different hard-coded table row.
driver.find_element_by_xpath(SECOND_COMPANY_LINK).click()
second_contact = wait.until(EC.visibility_of_all_elements_located((By.XPATH, CONTACT_BOX)))
company_list.extend(box.text for box in second_contact)

print(company_list)  # show the list

My output is this:

['GUTex GmbH\nGerhard-Unland-Str. 1\n26683\nSaterland\nDeutschland', 'Robert Bosch GmbH\nRobert-Bosch-Platz 1\n70839\nGerlingen\nDeutschland']

Problem:

I want my code to do this for the whole list on page 1, then go on to the next page and do it again. This should continue until I have, for example, 100 addresses in the list. I would do this with a "while loop", but my XPaths for finding the address are too specific, so it would always loop over the same companies.

Thanks a lot in advance.

Upvotes: 0

Views: 184

Answers (2)

Yankzz
Yankzz

Reputation: 41

Thanks to Dilip Meghwal's comment above, I was able to finish my code:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Walk the company search results page by page, open every company's detail
# page, collect its contact box text, and export everything to Excel.

TARGET_COUNT = 1000  # stop once this many contact blocks have been collected
SEARCH_URL = 'https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'
COMPANY_LINKS = '//table[@class="zebraTable zebraTable--companies"]//td[1]/a'
CONTACT_BOX = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'
NEXT_PAGE_BUTTON = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/div[2]/div/button[2]"

company_list = []  # one contact-box string per visited company

chrome_options = webdriver.ChromeOptions()
# Content-setting value 2 is meant to block browser notification prompts so
# they cannot cover the page while scraping.
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe', chrome_options=chrome_options)
try:
    wait = WebDriverWait(driver, 20)
    driver.get(SEARCH_URL)

    # The cookie banner is optional; scraping must run whether or not it shows.
    cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
    if cookie_buttons:
        cookie_buttons[0].click()

    while len(company_list) < TARGET_COUNT:
        wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINKS)))
        # Count the links themselves (not the table cells) so the index used
        # below always refers to an existing link.
        link_count = len(driver.find_elements(By.XPATH, COMPANY_LINKS))

        for i in range(link_count):
            if len(company_list) >= TARGET_COUNT:
                break
            # Look the links up again on every pass: the list page is reloaded
            # by driver.back(), so earlier element references are not reused.
            wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINKS)))
            links = driver.find_elements(By.XPATH, COMPANY_LINKS)
            links[i].click()
            wait.until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX)))
            contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
            company_list.append(contact_data)
            driver.back()  # return to the result list

        if len(company_list) >= TARGET_COUNT:
            break

        # Fixed pause kept from the original before paging (presumably lets the
        # list page settle after navigating back).
        time.sleep(5)

        next_buttons = driver.find_elements(By.XPATH, NEXT_PAGE_BUTTON)
        if not next_buttons:
            break  # no further result page: keep what was collected so far
        next_buttons[0].click()
finally:
    driver.quit()  # always release the browser, even if scraping failed

# Flatten each multi-line address into a single comma-separated line.
company_list = [w.replace('\n', ', ') for w in company_list]

print(company_list)

df_company_name = pd.DataFrame(company_list, columns=['Name'])

df_company_name.to_excel("company_name.xlsx")

Upvotes: 0

Dilip Meghwal
Dilip Meghwal

Reputation: 632

Try the code below to extract the data from one page. Extend it to iterate over the records on the following pages.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Collect "name : contact text" for every company listed on the first
# result page by opening each detail page in turn.

SEARCH_URL = 'https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'
COMPANY_LINKS = '//table[@class="zebraTable zebraTable--companies"]//td[1]/a'
CONTACT_BOX = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'

company_list = []  # one "name : contact text" string per company

driver = webdriver.Chrome()
try:
    wait = WebDriverWait(driver, 20)
    driver.get(SEARCH_URL)

    # The cookie banner is optional; click it only when it is present.
    cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
    if cookie_buttons:
        cookie_buttons[0].click()

    wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINKS)))
    # Count the links themselves (not the table cells) so the index used
    # below always refers to an existing link.
    link_count = len(driver.find_elements(By.XPATH, COMPANY_LINKS))

    # Visit each company by position: click its name, capture the contact box
    # on the detail page, then come back to the list and repeat.
    for i in range(link_count):
        # Look the links up again on every pass: the list page is reloaded by
        # driver.back(), so earlier element references are not reused.
        wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINKS)))
        link = driver.find_elements(By.XPATH, COMPANY_LINKS)[i]
        company_name = link.text  # read the name before leaving the list page
        link.click()
        wait.until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX)))
        contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
        company_list.append(company_name + " : " + contact_data)
        driver.back()  # return to the result list
finally:
    driver.quit()  # always release the browser, even if scraping failed

print(company_list)

Upvotes: 1

Related Questions