Reputation: 29
I am working on a project where I am scraping indeed search results. I am trying to print the data that I find, but the data from the first page is duplicating. It prints the first page's data twice, and the rest of the page's data once (which is what I want). Please let me know how to prevent my first page's data twice. Thank you!
from selenium import webdriver
from time import sleep
page = 0
# SearchTerm = input("Search Term: ")
SearchTerm = "EHS"
# LocationSearch = input("Location: ")
LocationSearch = "Arizona"
NumPages = 4
Data = []
def removeduplicates(listofelements):
# Create an empty list to store unique elements
uniquelist = []
# Iterate over the original list and for each element
# add it to uniqueList, if its not already there.
for elem in listofelements:
if elem not in uniquelist:
uniquelist.append(elem)
# Return the list of unique elements
return uniquelist
Data = []
url = ('https://www.indeed.com/jobs?q=' + SearchTerm + '&l=' + LocationSearch + '&start=0')
driver = webdriver.Chrome("/Users/nzalle/Downloads/chromedriver")
driver.get(url)
for x in range(NumPages + 1):
driver.get(url)
url = ('https://www.indeed.com/jobs?q=' + SearchTerm + '&l=' + LocationSearch + '&start=' + str(page))
page += 10
# scrape code
Titles = driver.find_elements_by_xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "jobtitle", " " ))]')
TitleText = [x.text for x in Titles]
Data.extend([*TitleText, ","])
CompanyName = driver.find_elements_by_xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "company", " " ))] | //*[contains(concat( " ", @class, " " ), concat( " ", "company", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "turnstileLink", " " ))]')
CompanyNameText = [x.text for x in CompanyName]
Data.extend([*CompanyNameText, ",", "\n"])
sleep(3)
driver.get(url)
print(*Data)
Upvotes: 0
Views: 44
Reputation: 33384
I have made some changes in the code and also find_elements.Induce WebDriverWait
() and wait for visibility_of_all_elements_located
()
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome("/Users/nzalle/Downloads/chromedriver")
page = 0
# SearchTerm = input("Search Term: ")
SearchTerm = "EHS"
# LocationSearch = input("Location: ")
LocationSearch = "Arizona"
NumPages = 4
Data = []
for x in range(NumPages + 1):
url = ('https://www.indeed.com/jobs?q=' + SearchTerm + '&l=' + LocationSearch + '&start=' + str(page))
driver.get(url)
time.sleep(1) #slowdown the loop
page += 10
for jobs in WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,"div[data-tn-component='organicJob']"))):
title=jobs.find_element_by_xpath("./h2/a").text.strip()
try:
name=jobs.find_element_by_xpath(".//span[@class='company']/a").text.strip()
except:
name = jobs.find_element_by_xpath(".//span[@class='company']").text.strip()
Data.append(title)
Data.append(name)
print(Data)
Output:
['EHS and Facilities Manager', 'Howmet Aerospace', 'EHS Coordinator', 'Basalite® Concrete Products LLC', 'EHS Leader', 'Ball Corporation', 'Site EHS Manager (Gatorade)', 'PepsiCo', 'Manager, EHS', 'Bristol-Myers Squibb', 'Safety Manager', 'Allen Industries Inc', 'Senior Site EHS Manager', 'Amazon.com Services LLC', 'TRAFFICE SAFETY ENGINEER SUPERVISOR', 'State of Arizona', 'Environmental, Safety, & Security Manager', 'Holsum Bakery, Inc (0127)', 'Environmental Health & Safety Manager', 'Mark Anthony Brewing Inc.', 'Safety Manager - F-5 Adversary Program - Yuma, AZ', 'Vertex Aerospace LLC', 'Risk and Patient Safety Coordinator Full Time Day Shift Cent...', 'Abrazo Central Campus', 'TRAFFIC SAFETY ENGINEER SUPERVISOR', 'State of Arizona', 'Food Safety and Environmental Health Program Manager', 'State of Arizona', 'EHS Technical Writer', 'Safety Services Company', 'Environmental Health and Safety (EHS) Coordinator', 'Westerwood Global', 'Safety Manager', 'Pioneer Landscape Centers', 'HSE/HSD Instructor', 'Odle Management Group LLC', 'Safety Manager', 'PAC Worldwide', 'Safety Trainer - Heavy Equipment', 'Pioneer Landscape Centers', 'Safety Coordinator Southwest Steel/ SME Industries Inc.', 'SME Steel Industries', 'Risk and Patient Safety Coordinator Full Time Day Shift Cent...', 'Abrazo Central Campus', 'TRAFFIC SAFETY ENGINEER SUPERVISOR', 'State of Arizona', 'Food Safety and Environmental Health Program Manager', 'State of Arizona', 'EHS Technical Writer', 'Safety Services Company', 'Environmental Health and Safety (EHS) Coordinator', 'Westerwood Global', 'Safety Manager', 'Pioneer Landscape Centers', 'HSE/HSD Instructor', 'Odle Management Group LLC', 'Safety Manager', 'PAC Worldwide', 'Safety Trainer - Heavy Equipment', 'Pioneer Landscape Centers', 'Safety Coordinator Southwest Steel/ SME Industries Inc.', 'SME Steel Industries', 'Risk and Patient Safety Coordinator Full Time Day Shift Cent...', 'Abrazo Central Campus', 'TRAFFIC SAFETY ENGINEER SUPERVISOR', 'State of Arizona', 'Food Safety and Environmental Health Program Manager', 'State of Arizona', 'EHS Technical Writer', 'Safety Services Company', 'Environmental Health and Safety (EHS) Coordinator', 'Westerwood Global', 'Safety Manager', 'Pioneer Landscape Centers', 'HSE/HSD Instructor', 'Odle Management Group LLC', 'Safety Manager', 'PAC Worldwide', 'Safety Trainer - Heavy Equipment', 'Pioneer Landscape Centers', 'Safety Coordinator Southwest Steel/ SME Industries Inc.', 'SME Steel Industries']
Upvotes: 1