Reputation: 149
I am learning Python and decided to do a web-scraping project using BeautifulSoup and Selenium.
Site: https://careers.amgen.com/ListJobs?
Goal: retrieve all the variables related to a job ad. Variables identified: ID, job title, URL, city, state, zip, country, and the date the job was posted.
Problem: I managed to extract the data from the first page of the table, but I cannot extract the data from the other pages, even though I use the option to go to the next page.
Any help would be much appreciated.
Please find my code below.
```
import re
import os
import selenium
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from bs4 import BeautifulSoup

#driver = webdriver.Chrome(ChromeDriverManager().install())
browser = webdriver.Chrome("")  # path needed to execute chromedriver. Check your own path
browser.get('https://careers.amgen.com/ListJobs?')
browser.implicitly_wait(100)

soup = BeautifulSoup(browser.page_source, 'html.parser')
code_soup = soup.find_all('tr', attrs={'role': 'row'})

# creating data set
df = pd.DataFrame({'id': [],
                   'jobs': [],
                   'url': [],
                   'city': [],
                   'state': [],
                   'zip': [],
                   'country': [],
                   'added': []
                   })

d = code_soup
next_page = browser.find_element_by_xpath('//*[@id="jobGrid0"]/div[2]/a[3]/span')
for i in range(2, 12):  # catch error, out of bounds?
    df = df.append({'id': d[i].find_all("td", {"class": "DisplayJobId-cell"}),
                    "jobs": d[i].find_all("td", {"class": "JobTitle-cell"}),
                    "url": d[i].find("a").attrs['href'],
                    "city": d[i].find_all("td", {"class": "City-cell"}),
                    "state": d[i].find_all("td", {"class": "State-cell"}),
                    "zip": d[i].find_all("td", {"class": "Zip-cell"}),
                    "country": d[i].find_all("td", {"class": "Country-cell"}),
                    "added": d[i].find_all("td", {"class": "AddedOn-cell"})}, ignore_index=True)

df['url'] = 'https://careers.amgen.com/' + df['url'].astype(str)
df["company"] = "Amgen"
df

# iterate through the pages
next_page = browser.find_element_by_xpath('//*[@id="jobGrid0"]/div[2]/a[3]/span')
for p in range(1, 7):  # go from page 1 to 6
    next_page.click()
    browser.implicitly_wait(20)
    print(p)
```
I tried multiple things; this is my latest attempt, which also did not work:
```
p = 0
next_page = browser.find_element_by_xpath('//*[@id="jobGrid0"]/div[2]/a[3]/span')
for p in range(1, 7):
    for i in range(2, 12):
        df1 = df.append({'id': d[i].find_all("td", {"class": "DisplayJobId-cell"}),
                         "jobs": d[i].find_all("td", {"class": "JobTitle-cell"}),
                         "url": d[i].find("a").attrs['href'],
                         "city": d[i].find_all("td", {"class": "City-cell"}),
                         "state": d[i].find_all("td", {"class": "State-cell"}),
                         "zip": d[i].find_all("td", {"class": "Zip-cell"}),
                         "country": d[i].find_all("td", {"class": "Country-cell"}),
                         "added": d[i].find_all("td", {"class": "AddedOn-cell"})}, ignore_index=True)
    p += 1
    next_page.click()
    print(p)
```
Upvotes: 0
Views: 281
Reputation: 305
Changing one line of your code will do the work for you. Instead of the XPath you are currently using to select the 'next' arrow that switches the table page, use the following XPath:
```
next_page = browser.find_element_by_xpath('//a[@class="k-link k-pager-nav"]//following::a[@class="k-link k-pager-nav"]')
```
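For completeness, here is a minimal sketch of how that locator could slot into the pagination loop from the question, assuming the pager markup is the same on every page. The key points are re-parsing `page_source` and re-locating the 'next' link on each iteration, since a previously found element reference can go stale after a click:

```
# Sketch only, not tested against the live site.
from bs4 import BeautifulSoup

all_rows = []
for p in range(1, 7):  # pages 1..6, as in the question
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    all_rows.extend(soup.find_all('tr', attrs={'role': 'row'}))  # rows of the current page
    # re-locate the "next" arrow every time before clicking it
    next_page = browser.find_element_by_xpath(
        '//a[@class="k-link k-pager-nav"]//following::a[@class="k-link k-pager-nav"]')
    next_page.click()
    browser.implicitly_wait(20)
```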
Upvotes: 0
Reputation: 11505
```
import requests
import re
import pandas as pd

params = {
    'sort': 'AddedOn-desc',
    'page': '1',
    'pageSize': '1000',
    'group': '',
    'filter': '',
    'fields': 'JobTitle,DisplayJobId,City,State,Zip,Country,AddedOn,UrlJobTitle'
}

headers = {
    "Origin": 'https://careers.amgen.com'
}


def main(url):
    r = requests.get(url)
    api = re.search('JobsApiUrl="(.*?)\"', r.text).group(1)
    r = requests.get(api, params=params, headers=headers).json()
    df = pd.DataFrame(r['Data'])
    print(df)
    df.to_csv("data.csv", index=False)


main("https://careers.amgen.com/ListJobs")
```
Upvotes: 1