Reputation: 13
I am trying to build a web scraper that goes through a website's pages and downloads the Excel file offered in a dropdown menu at the bottom of each page.
The site only lets me download the 50 locations displayed on the current page, so I cannot download all of them at once.
I am able to download the first page's Excel file, but the following pages yield nothing.
Running the code provided below gives me this output:
Skipped a page
No more pages.
If I remove the lines that trigger the download, the script pages through every results page to the end without a problem.
The code below shows what I am trying to accomplish.
I would appreciate any help and advice! Thank you!
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
state = 'oklahoma'
rent_to_own = 'rent to own'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx')
industry = driver.find_element(By.ID, "txtKeyword")
industry.send_keys(rent_to_own)
location = driver.find_element(By.ID, "txtLocation")
location.send_keys(state)
driver.find_element(By.ID, "btnSubmit").click()
driver.implicitly_wait(3)
def web_scrape():
    more_drawer = driver.find_element(By.XPATH, "//div[@class='more-drawer']//a[@href='/toolkit/jobs/find-businesses.aspx?keyword="+rent_to_own+"&ajax=0&location="+state+"&lang=en&Desfillall=y#Des']")
    more_drawer.click()
    driver.implicitly_wait(5)
    get_50 = Select(driver.find_element(By.ID, 'ViewPerPage'))
    get_50.select_by_value('50')
    driver.implicitly_wait(5)
    filter_description = driver.find_element(By.XPATH, "//ul[@class='filters-list']//a[@href='/toolkit/jobs/find-businesses.aspx?keyword="+rent_to_own+"&ajax=0&location="+state+"&lang=en&Desfillall=y&pagesize=50&currentpage=1&descfilter=Furniture~B~Renting ~F~ Leasing']")
    filter_description.click()
    while True:
        try:
            download_excel = Select(driver.find_element(By.ID, 'ResultsDownload'))
            download_excel.select_by_value('Excel')
            driver.implicitly_wait(20)
            first_50 = driver.find_element(By.XPATH, "//div[@id='relatedOccupations']//a[@onclick='hideMoreRelatedOccupations()']")
            first_50.click()
            driver.implicitly_wait(20)
            next_page = driver.find_element(By.XPATH, "//div[@class='pagination-wrap']//div//a[@class='next-page']")
            next_page.click()
            driver.implicitly_wait(20)
            print("Skipped a page.")
        except:
            print("No more pages.")
            return

web_scrape()
Upvotes: 0
Views: 372
Reputation: 457
Below is something that works, although the approach could probably be improved. I stuck with Selenium, but you don't actually need to open the webpage at all: you can scrape the results directly by passing the right URL parameters and parsing with Beautiful Soup (a rough sketch of that idea is at the end of this answer). Writing every item into Excel one cell at a time is also probably not the fastest way; a better approach would be to collect everything with pandas and create the workbook once at the end. But it works, and if you have any questions let me know.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import openpyxl as xl
import os
import math

cwd = os.getcwd()  # Or whatever dir you want
filename = '\\test123.xlsx'  # escape the backslash so '\t' is not read as a tab
location = 'oklahoma'
keyword = 'rent to own'

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx?keyword=' + keyword + '&ajax=0&location=' + location + '&radius=50&pagesize=50&currentpage=1&lang=en')
driver.implicitly_wait(3)

wb = xl.Workbook()
ws = wb.worksheets[0]

# Get the number of pages from the total record count (50 results per page).
ret = driver.find_element(By.ID, 'recordNumber')
lp = math.ceil(float(ret.text) / 50)

r = 1
for i in range(1, lp + 1):  # +1 so the last page is included
    print(i)
    driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx?keyword=' + keyword + '&ajax=0&location=' + location + '&radius=50&pagesize=50&currentpage=' + str(i) + '&lang=en')
    table_id = driver.find_elements(By.CLASS_NAME, 'res-table')[0]
    rows = table_id.find_elements(By.TAG_NAME, "tr")
    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        refs = row.find_elements(By.TAG_NAME, "a")
        # First column: write the link as an Excel HYPERLINK formula.
        for c, ref in enumerate(refs, start=1):
            ws.cell(row=r, column=c).value = '=HYPERLINK("{}", "{}")'.format(ref.get_attribute("href"), ref.text)
        # Remaining columns: plain text values.
        for c, col in enumerate(cols, start=1):
            if c > 1:
                ws.cell(row=r, column=c).value = col.text
        r += 1

wb.save(cwd + filename)
print('done')
This returns an Excel file with 750+ rows of data, links included.
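For reference, here is a rough sketch of the no-Selenium idea mentioned above: request each results page directly with the same URL parameters, parse the res-table with Beautiful Soup, collect the rows, and write the workbook once at the end with pandas. I haven't verified the markup assumptions (the recordNumber element and res-table class are taken from the Selenium code above, and parse_page is just an illustrative helper), so treat it as a starting point rather than a drop-in replacement:

import math
import requests
import pandas as pd
from bs4 import BeautifulSoup

BASE = 'https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx'
params = {'keyword': 'rent to own', 'location': 'oklahoma', 'radius': '50',
          'pagesize': '50', 'currentpage': 1, 'ajax': '0', 'lang': 'en'}

def parse_page(html):
    # Pull the link plus the text of each cell out of every row of the results table.
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find(class_='res-table')  # assumes the same class the Selenium code targets
    rows = []
    if table is None:
        return rows
    for tr in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        link = tr.find('a')
        if cells:
            rows.append([link['href'] if link else ''] + cells)
    return rows

first = requests.get(BASE, params=params)
soup = BeautifulSoup(first.text, 'html.parser')
record_count = soup.find(id='recordNumber')  # same element the Selenium version reads
last_page = math.ceil(float(record_count.get_text(strip=True)) / 50)

all_rows = parse_page(first.text)
for page in range(2, last_page + 1):
    params['currentpage'] = page
    all_rows.extend(parse_page(requests.get(BASE, params=params).text))

# One write at the end instead of cell-by-cell.
pd.DataFrame(all_rows).to_excel('test123.xlsx', index=False, header=False)

The caveat is that if the site builds the results table with JavaScript, the plain-requests version won't see it and you'd need to stay with Selenium.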
Upvotes: 1