Reputation: 63
I'm trying to extract data from a website. The webpage has more than a single page, so I'm trying to use a loop to iterate through the different pages. The issue with this however is that I can't get the href from the next button.
Can anyone please explain how I can possibly fix this?
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
def get_url(position, location):
    """Build an Indeed (Mexico) job-search URL for a position and location.

    Both query values are URL-encoded so multi-word input such as
    'data analyst' yields a valid query string; single-word input is
    passed through unchanged, so existing callers see identical URLs.
    """
    from urllib.parse import quote_plus

    template = 'https://mx.indeed.com/jobs?q={}&l={}'
    url = template.format(quote_plus(position), quote_plus(location))
    return url
def get_record(card):
    """Extract one job posting from a result card anchor.

    Returns a flat tuple:
    (title, company, location, post_date, scrape_date, summary, url).
    """
    title = card.h2.span.get('title')
    link = 'https://www.indeed.com' + card.get('href')
    employer = card.find('span', 'companyName').text
    where = card.find('div', 'companyLocation').text
    snippet = card.find('div', 'job-snippet').text.strip()
    posted = card.find('span', 'date').text
    # Date the record was scraped, for later freshness filtering.
    scraped_on = datetime.today().strftime('%Y-%m-%d')
    return (title, employer, where, posted, scraped_on, snippet, link)
def main(position, location):
    """Scrape every result page for a job search and write the records to CSV.

    Follows the "Siguiente" (next) pagination link until no further page
    exists, then dumps all collected records to results_Indeed.csv.
    """
    records = []
    url = get_url(position, location)

    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        for card in soup.find_all('a', 'tapItem'):
            records.append(get_record(card))
        # BUG FIX: the live markup's aria-label is not exactly 'Siguiente »',
        # so find('a', {'aria-label': 'Siguiente »'}) returned None and the
        # loop stopped after one page. Use a CSS attribute-substring selector
        # to match any element whose aria-label *contains* 'Siguiente'.
        next_link = soup.select_one('[aria-label*="Siguiente"]')
        if next_link is None:
            break
        url = 'https://mx.indeed.com' + next_link['href']

    with open('results_Indeed.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # 'Company' was misspelled 'Comany' in the original header row.
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate',
                         'Date', 'Summary', 'URL'])
        writer.writerows(records)
Upvotes: 0
Views: 109
Reputation: 550
The buttons at the bottom of the page are ordered as a list, and the href
is a child of each list item, as you can see in the screenshot below.
However, I would suggest another approach: try using mechanize
. It's a very simple-to-use library that lets you manipulate web pages as if you were using a browser. With this library you could just simulate a click on the next
button without having to grab its link address.
Upvotes: 0
Reputation: 195408
Try CSS selector [aria-label*="Siguiente"]
for searching for next URL:
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
def get_url(position, location):
    """Return the mx.indeed.com search URL for *position* in *location*."""
    base = "https://mx.indeed.com/jobs?q={}&l={}"
    return base.format(position, location)
def get_record(card):
    """Collect the fields of a single job card.

    Returns a tuple in the order:
    (title, company, location, post_date, scrape_date, summary, url).
    """
    fields = {
        "title": card.h2.span.get("title"),
        "url": "https://www.indeed.com" + card.get("href"),
        "company": card.find("span", "companyName").text,
        "location": card.find("div", "companyLocation").text,
        "summary": card.find("div", "job-snippet").text.strip(),
        "posted": card.find("span", "date").text,
        # Date the record was scraped.
        "scraped": datetime.today().strftime("%Y-%m-%d"),
    }
    return (
        fields["title"],
        fields["company"],
        fields["location"],
        fields["posted"],
        fields["scraped"],
        fields["summary"],
        fields["url"],
    )
def main(position, location):
    """Scrape every result page for the search and dump the records to data.csv."""
    records = []
    url = get_url(position, location)
    while True:
        print(url)
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        records.extend(get_record(c) for c in soup.find_all("a", "tapItem"))
        # Follow the "Siguiente" (next) link; stop when there is none.
        nxt = soup.select_one('[aria-label*="Siguiente"]')
        if not nxt:
            break
        url = "https://mx.indeed.com" + nxt["href"]
    columns = ["JobTitle", "Comany", "Location", "PostDate", "Date", "Summary", "URL"]
    df = pd.DataFrame(records, columns=columns)
    print(df)
    df.to_csv("data.csv", index=False)
main("Python", "Monterrey")
Creates data.csv
(screenshot from LibreOffice):
Upvotes: 1