Reputation: 51
I've been trying to iterate through this EV company table on Crunchbase, but for some reason the code only pulls the first row (repeated once per company). Any idea why? Thanks! :)
#imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
#paths
# Location of the chromedriver executable that is handed to Selenium.
PATH = "C:/Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://www.crunchbase.com/search/organizations/field/organization.companies/categories/electric-vehicle")
driver.maximize_window()
time.sleep(5)
print(driver.title)
# Explicit wait (timeout 20) until the name cell of the first grid row is visible.
WebDriverWait(driver, 20).until(
EC.visibility_of_element_located(
(By.XPATH, ('/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row[1]/grid-cell[2]/div/field-formatter/identifier-formatter/a/div/div')
)))
# Every element matching div.identifier-label on the page; the loop below runs once per match.
companies = driver.find_elements_by_css_selector("div.identifier-label")
#create company dictionary and iterate through Crunchbase EV company table
company_list = []
for company in companies:
# NOTE(review): the three XPaths below start with "/html", i.e. they are
# absolute. They are resolved from the document root rather than relative to
# `company`, and each one hard-codes grid-row[1], so every iteration reads the
# first table row again. This is the cause of the behaviour described in the
# question.
name = company.find_element_by_xpath('/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row[1]/grid-cell[2]/div/field-formatter/identifier-formatter/a/div/div').text
industry = company.find_element_by_xpath('/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row[1]/grid-cell[3]/div/field-formatter/identifier-multi-formatter/span').text
hq = company.find_element_by_xpath('/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row[1]/grid-cell[4]/div/field-formatter/identifier-multi-formatter/span').text
# One record per loop iteration; the keys become the DataFrame columns.
cblist = {
'name': name,
'industry': industry,
'hq': hq
}
company_list.append(cblist)
#create dataframe
# Build a table from the collected records and print it.
df = pd.DataFrame(company_list)
print(df)
Upvotes: 1
Views: 2066
Reputation: 142641
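First you should select all `grid-row` elements to get every row in the table, and then use a relative XPath (starting with `.`) to search only inside the selected row. An absolute XPath (starting with `/html`) is always resolved from the document root, even when it is called on an element, so your loop kept reading `grid-row[1]` on every iteration.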
# Collect one element per table row. find_elements(By..., ...) is used instead
# of the find_elements_by_* helpers, which were removed in Selenium 4.3; this
# form works on both Selenium 3 and Selenium 4. `By` is the locator class from
# selenium.webdriver.common.by.
all_rows = driver.find_elements(By.CSS_SELECTOR, "grid-row")
all_companies = []
for row in all_rows:
    # Each XPath starts with "." so the lookup is confined to the current
    # row instead of being resolved from the document root.
    company = {
        'name': row.find_element(By.XPATH, './/*[@class="identifier-label"]').text.strip(),
        'industry': row.find_element(By.XPATH, './/*[@data-columnid="categories"]//span').text.strip(),
        'hq': row.find_element(By.XPATH, './/*[@data-columnid="location_identifiers"]//span').text.strip(),
        'cb rank': row.find_element(By.XPATH, './/*[@data-columnid="rank_org"]').text.strip(),
    }
    all_companies.append(company)
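You should also learn to locate elements by `class`, `id` and any other unique attribute values, e.g. `data-columnid`, instead of long absolute paths, which break as soon as the page layout changes.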
Full working code
#imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
#paths
# Raw string: the backslash is kept literally. "\c" is not a valid escape
# sequence, and newer Python versions warn about it in a normal string.
PATH = r"C:/Program Files (x86)\chromedriver.exe"
# NOTE: recent Selenium 4 releases no longer accept the driver path as a
# positional argument; there, call webdriver.Chrome() (driver on the system
# PATH) or pass a Service object instead.
driver = webdriver.Chrome(PATH)
url = "https://www.crunchbase.com/search/organizations/field/organization.companies/categories/electric-vehicle"
all_companies = []
try:
    driver.get(url)
    driver.maximize_window()
    time.sleep(5)
    print('title:', driver.title)
    # Block (up to the 20 timeout) until at least one company name cell is
    # visible, so the rows exist before they are queried.
    WebDriverWait(driver, 20).until(
        EC.visibility_of_element_located(
            (By.XPATH, '//grid-body//identifier-formatter/a/div/div')
        )
    )
    # find_element(s)(By..., ...) replaces the find_element(s)_by_* helpers,
    # which were removed in Selenium 4.3; this form also works on Selenium 3.
    all_rows = driver.find_elements(By.CSS_SELECTOR, "grid-row")
    for row in all_rows:
        # Relative XPaths (leading ".") search only inside the current row.
        company = {
            'name': row.find_element(By.XPATH, './/*[@class="identifier-label"]').text.strip(),
            'industry': row.find_element(By.XPATH, './/*[@data-columnid="categories"]//span').text.strip(),
            'hq': row.find_element(By.XPATH, './/*[@data-columnid="location_identifiers"]//span').text.strip(),
            'cb rank': row.find_element(By.XPATH, './/*[@data-columnid="rank_org"]').text.strip(),
        }
        all_companies.append(company)
finally:
    # Always release the browser and the chromedriver process, even when a
    # lookup above raises.
    driver.quit()
#create dataframe
# One DataFrame row per scraped table row; columns come from the dict keys.
df = pd.DataFrame(all_companies)
print(df)
Upvotes: 2
Reputation: 61
Increase the `grid-row` index on each iteration of the `for` loop, and use it in all three XPath expressions (name, industry and hq), like this:
# Fragment of the loop body; the loop header is not shown here.
# NOTE(review): row_index is not initialised in this fragment. Presumably it
# should be set to 0 before the loop, so that the first pass addresses
# grid-row[1] (XPath positions are 1-based) - confirm against the full script.
row_index = row_index + 1
# The path is still absolute (starts with "/html"); the row is selected solely
# by the interpolated index, not by `company`.
name = company.find_element_by_xpath(
'/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/search/page-layout/div/div/form/div[2]/results/div/div/div[3]/sheet-grid/div/div/grid-body/div/grid-row['+str(row_index)+']/grid-cell[2]/div/field-formatter/identifier-formatter/a/div/div').text
Upvotes: 1