truee
truee

Reputation: 51

How to iterate through web table with Selenium?

I've been trying to iterate through this EV company table on Crunchbase, but for some reason the code only pulls the first row. Any idea why? Thanks! :)

#imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Paths: location of the ChromeDriver executable handed to webdriver.Chrome.
# NOTE(review): "\c" is not a recognised escape sequence, so the backslash is
# kept literally here; a raw string would state that intent explicitly.
PATH = "C:/Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)

# Open the Crunchbase electric-vehicle search page, maximise the window,
# pause for a fixed 5 seconds, then print the page title.
driver.get("https://www.crunchbase.com/search/organizations/field/organization.companies/categories/electric-vehicle")
driver.maximize_window()
time.sleep(5)
print(driver.title)

# Wait up to 20 seconds until the element at this absolute XPath
# (grid-row[1] / grid-cell[2]) is visible.
WebDriverWait(driver, 20).until(
        EC.visibility_of_element_located(
          (By.XPATH, ('/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row[1]/grid-cell[2]/div/field-formatter/identifier-formatter/a/div/div')
        )))

# Every element matching the CSS selector "div.identifier-label".
companies = driver.find_elements_by_css_selector("div.identifier-label")

# Build one dict per matched element and collect them in company_list.
company_list = []

for company in companies:
    # NOTE(review): all three XPaths below are absolute (they begin with
    # "/html") and hard-code grid-row[1], so the path does not change from
    # one iteration to the next. This is presumably why every iteration
    # yields the first row's values -- confirm against the rendered page.
    name = company.find_element_by_xpath('/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row[1]/grid-cell[2]/div/field-formatter/identifier-formatter/a/div/div').text
    industry = company.find_element_by_xpath('/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row[1]/grid-cell[3]/div/field-formatter/identifier-multi-formatter/span').text
    hq = company.find_element_by_xpath('/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row[1]/grid-cell[4]/div/field-formatter/identifier-multi-formatter/span').text
    # One record per loop iteration, keyed by output column name.
    cblist = {
        'name': name,
        'industry': industry,
        'hq': hq
    }
    company_list.append(cblist)
# Create a DataFrame from the collected records and print it.
df = pd.DataFrame(company_list)
print(df)

Upvotes: 1

Views: 2066

Answers (2)

furas
furas

Reputation: 142641

First, select all grid-row elements to get every row in the table; then use a relative XPath (one starting with .) so that each lookup searches only within the selected row.

# Collect every <grid-row> element; each one is a single row of the table.
# Uses the generic find_elements(By..., ...) form because the
# find_elements_by_* helpers are deprecated and absent from newer Selenium
# releases. `By` is selenium.webdriver.common.by.By.
all_rows = driver.find_elements(By.CSS_SELECTOR, "grid-row")

all_companies = []

for row in all_rows:
    # Each XPath starts with "." so the lookup is scoped to the current row
    # instead of being evaluated from the document root.
    company = {
        'name':     row.find_element(By.XPATH, './/*[@class="identifier-label"]').text.strip(),
        'industry': row.find_element(By.XPATH, './/*[@data-columnid="categories"]//span').text.strip(),
        'hq':       row.find_element(By.XPATH, './/*[@data-columnid="location_identifiers"]//span').text.strip(),
        'cb rank':  row.find_element(By.XPATH, './/*[@data-columnid="rank_org"]').text.strip(),
    }
    all_companies.append(company)

You should also learn to locate elements by class, id, and any other unique attribute values, e.g. data-columnid.


Full working code

#imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Paths
# Raw string so the backslash before "chromedriver" is kept literally; in a
# plain literal "\c" is an invalid escape sequence that newer Python versions
# warn about. The resulting path value is unchanged.
PATH = r"C:/Program Files (x86)\chromedriver.exe"
# NOTE: recent Selenium releases may not accept the driver path as a
# positional argument; if this call is rejected, use `webdriver.Chrome()`
# with chromedriver available on the system PATH instead.
driver = webdriver.Chrome(PATH)

url = "https://www.crunchbase.com/search/organizations/field/organization.companies/categories/electric-vehicle"

# Output column name -> XPath relative to one <grid-row>. The leading "."
# scopes each lookup to the row it is called on rather than the whole page.
column_xpaths = {
    'name':     './/*[@class="identifier-label"]',
    'industry': './/*[@data-columnid="categories"]//span',
    'hq':       './/*[@data-columnid="location_identifiers"]//span',
    'cb rank':  './/*[@data-columnid="rank_org"]',
}

all_companies = []

try:
    driver.get(url)
    driver.maximize_window()

    # Explicit wait (up to 20 s) for the first company name cell to become
    # visible; this replaces a fixed sleep and returns as soon as the grid
    # has rendered.
    WebDriverWait(driver, 20).until(
        EC.visibility_of_element_located(
            (By.XPATH, '//grid-body//identifier-formatter/a/div/div')
        )
    )

    print('title:', driver.title)

    # One <grid-row> element per table row. find_elements(By..., ...) is
    # used because the find_elements_by_* helpers are deprecated and absent
    # from newer Selenium releases.
    all_rows = driver.find_elements(By.CSS_SELECTOR, "grid-row")

    for row in all_rows:
        # Read every configured column from within this row only.
        company = {
            column: row.find_element(By.XPATH, xpath).text.strip()
            for column, xpath in column_xpaths.items()
        }
        all_companies.append(company)
finally:
    # Always shut the browser down, even if the wait times out or a cell is
    # missing, so no Chrome/chromedriver process is left running.
    driver.quit()

# Create dataframe
df = pd.DataFrame(all_companies)
print(df)

Upvotes: 2

Ashok Kakade
Ashok Kakade

Reputation: 61

Increment the grid-row index on each iteration of the for loop and use it in the XPath of every identifier, like this:

# Advance to the next table row (row_index must be initialised before the loop).
row_index += 1

# Splice the current row number into the absolute XPath so that each
# iteration addresses a different grid-row.
name_xpath = (
    '/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row['
    + str(row_index)
    + ']/grid-cell[2]/div/field-formatter/identifier-formatter/a/div/div'
)
name = company.find_element_by_xpath(name_xpath).text

Upvotes: 1

Related Questions