3laa mamoon
3laa mamoon

Reputation: 7

Want to extract data from a site using selenium-webdriver

I want to extract the name, website, phone, and email of every company on the site, but the code keeps printing the first company name on the page over and over, and it crashes if I try to find the website, phone, and email.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Asker's original script: opens the directory page in Firefox, then tries to
# print name / website / phone / email for every listed company, page by page.
url='https://www.dmcc.ae/business-search?directory=1&submissionGuid=2c8df029-a92e-4b5d-a014-7ef9948e664b'
driver = webdriver.Firefox()
driver.get(url)
# Shared explicit wait; each wait.until() below gives up after 50 seconds.
wait=WebDriverWait(driver,50)

# Click the "#hs-eu-confirmation-button" element once it is clickable
# (presumably the cookie-consent banner -- TODO confirm on the live page).
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#hs-eu-confirmation-button"))).click()
# The content is looked up inside an iframe, so switch the driver into it first.
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,'#pym-0 > iframe')))
# All visible elements with class "searched-list" on the current page.
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
# Positional XPath (13th <li>); presumably the "next page" link -- it is
# clicked once per loop iteration below. Located once and reused for every page.
button = wait.until(EC.element_to_be_clickable((By.XPATH,'./html/body/div[5]/div/ul/li[13]/a')))



# Hard-coded number of pages to visit.
numOfPages=1161
# Seconds to sleep before each click on `button`.
counter=4

for i in range(numOfPages):

    driver.execute_script("arguments[0].scrollIntoView();", button)

    for e in list:
        # NOTE: every XPath below starts with "/", which makes it an absolute
        # path evaluated from the document root, not relative to `e`. Each
        # iteration therefore resolves to the same node regardless of `e`,
        # which matches the reported "first company name over and over".
        name = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[1]/h4').text
        print(name)
        website = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[1]/td[2]/a').text
        print(website)
        phone = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[2]/td[2]/a').text
        print(phone)
        email = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[3]/td[2]/a').text
        print(email)
 
    time.sleep(counter)  
    button.click()
    # Re-locate the result elements after the click.
    list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
    # i % 40 == 0 holds for i = 0, 40, 80, ..., so the pause grows by one
    # second on the first page and on every 40th page after it.
    if i%40==0:
        counter+=1

My problem is with these lines of code:

# Excerpt of the loop above. The XPaths passed to e.find_element_by_xpath()
# start with "/", so they are absolute (document-root) paths rather than
# paths relative to `e`.
list=wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME,'searched-list ')))
 for e in list:
        name = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[1]/h4').text
        print(name)
        website = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[1]/td[2]/a').text
        print(website)
        phone = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[2]/td[2]/a').text
        print(phone)
        email = e.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div/div[9]/div/div[2]/div[2]/table/tbody/tr[3]/td[2]/a').text
        print(email)

Upvotes: 0

Views: 89

Answers (1)

shiny
shiny

Reputation: 668

I would suggest using other element-finding functions to make your code more readable. I've made a couple of changes to your code; I hope they help you get the data:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time


# Session setup: open the directory page in Firefox, click the confirmation
# button, enter the iframe and locate the first page of results plus the
# "next page" link.
url = "https://www.dmcc.ae/business-search?directory=1&submissionGuid=2c8df029-a92e-4b5d-a014-7ef9948e664b"
driver = webdriver.Firefox()
driver.get(url)
wait = WebDriverWait(driver, 50)  # each wait.until() below times out after 50 s

counter = 4  # seconds to sleep after each click on the "next page" link

# Locators, named once so the wait calls below read as a sequence of steps.
banner_button_locator = (By.CSS_SELECTOR, "#hs-eu-confirmation-button")
frame_locator = (By.CSS_SELECTOR, "#pym-0 > iframe")
items_locator = (By.CLASS_NAME, "list-title ")
next_link_locator = (
    By.CSS_SELECTOR,
    "a[ng-click='setPage(pager.currentPage + 1)']",
)

# Order matters: click the banner button first, then switch into the iframe
# before looking up anything inside it.
wait.until(EC.element_to_be_clickable(banner_button_locator)).click()
wait.until(EC.frame_to_be_available_and_switch_to_it(frame_locator))
list = wait.until(EC.visibility_of_all_elements_located(items_locator))
button = wait.until(EC.element_to_be_clickable(next_link_locator))


def getText(element):
    """Return ``element.text``, or the placeholder ``"---"`` when it is empty.

    Any falsy ``text`` value (empty string or ``None``) yields the placeholder.
    """
    return element.text or "---"


def getContactInfo(parent):
    """Return the ``contact-info`` child of *parent*, or ``None`` if lookup fails.

    The lookup is deliberately best-effort: an entry without a contact block
    should not abort the scrape, so any ordinary error raised by
    ``parent.find_element_by_class_name`` results in ``None``.

    Args:
        parent: An object exposing ``find_element_by_class_name`` (a Selenium
            element in the surrounding script).

    Returns:
        The element found under the class name ``"contact-info"``, or ``None``.
    """
    try:
        return parent.find_element_by_class_name("contact-info")
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and a long scrape
        # can be stopped with Ctrl-C while inside this lookup.
        return None

# Print name / website / phone / email for every company, page by page.
#
# The "last page?" check runs AFTER the current page has been printed. Checking
# it in the loop condition would leave the loop as soon as the last page is
# reached, without ever printing that page's entries.
page_items = list  # first page, located by the setup code (avoid rebinding `list`)
try:
    while True:
        driver.execute_script("arguments[0].scrollIntoView();", button)

        for entry in page_items:
            name = entry.find_element_by_tag_name("h4")
            print(getText(name))
            account_info = entry.find_element_by_css_selector(
                "div.account-Info.large-12.columns.ng-scope"
            )
            contact_info = getContactInfo(account_info)

            if contact_info:
                website = contact_info.find_element_by_css_selector(
                    "a.website.ng-binding.ng-scope"
                )
                print(getText(website))
                phone = contact_info.find_element_by_css_selector(
                    "a.telephone.ng-binding"
                )
                print(getText(phone))
                email = contact_info.find_element_by_css_selector(
                    "a.emailid.ng-binding"
                )
                print(getText(email))

            print("*******\n")

        # On the last page the pager's <li> carries the "disabled" class.
        # get_attribute() may return None when the attribute is absent,
        # hence the `or ""`.
        pager_item = driver.find_element_by_css_selector(
            "li[ng-class='{disabled:pager.currentPage === pager.totalPages}']"
        )
        pager_classes = (pager_item.get_attribute("class") or "").split()
        if "disabled" in pager_classes:
            break

        button.click()
        time.sleep(counter)
        page_items = wait.until(
            EC.visibility_of_all_elements_located((By.CLASS_NAME, "list-title "))
        )
finally:
    # Always close the browser, even if a lookup or wait above raises.
    driver.quit()

Upvotes: 1

Related Questions