Benjamin Lopez
Benjamin Lopez

Reputation: 63

Iterating through pages in Python using beautifulsoup

I'm trying to extract data from a website. The webpage has more than a single page, so I'm trying to use a loop to iterate through the different pages. The issue with this, however, is that I can't get the href from the next button.

Can anyone please explain how I can possibly fix this?

import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

def get_url(position, location):
    """Build the Indeed MX search URL for a job title and a location."""
    base = 'https://mx.indeed.com/jobs?q={}&l={}'
    return base.format(position, location)

def get_record(card):
    """Pull one job posting out of a result card.

    Returns a tuple:
    (title, company, location, post date, scrape date, summary, url).
    """
    # The visible title sits in the title attribute of the h2's span.
    job_title = card.h2.span.get('title')
    # The card itself is an <a>, so its href is the job link.
    job_url = 'https://www.indeed.com' + card.get('href')
    company = card.find('span', 'companyName').text
    job_location = card.find('div', 'companyLocation').text
    job_summary = card.find('div', 'job-snippet').text.strip()
    post_date = card.find('span', 'date').text
    scraped_on = datetime.today().strftime('%Y-%m-%d')

    return (job_title, company, job_location, post_date,
            scraped_on, job_summary, job_url)

def main(position, location):
    """Scrape every result page for the search and write results_Indeed.csv.

    BUG FIX: in the original, the next-page lookup and its `break` were
    nested INSIDE the `for card` loop.  `break` therefore only exited the
    inner for-loop; the `while True` kept re-requesting the same URL
    forever on the last page and the CSV block was never reached.
    The pagination step now runs once per page, after all cards.
    """
    records = []
    url = get_url(position, location)

    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('a', 'tapItem')

        for card in cards:
            records.append(get_record(card))

        # Advance to the next page, or stop when there is no next button.
        # 'Siguiente »' is the Spanish-locale "Next" label on mx.indeed.com.
        next_link = soup.find('a', {'aria-label': 'Siguiente »'})
        if next_link is None:
            break
        url = 'https://mx.indeed.com' + next_link.get('href')

    with open('results_Indeed.csv', 'w', newline = '', encoding = 'utf-8') as f:
        writer = csv.writer(f)
        # Header typo fixed: 'Comany' -> 'Company'.
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'Date', 'Summary', 'URL'])
        writer.writerows(records)

Upvotes: 0

Views: 109

Answers (2)

Nir H.
Nir H.

Reputation: 550

The buttons at the bottom of the page are ordered as a list, and the href is a child of the list item, as you can see in the screen shot below.

enter image description here

However, I would suggest another approach: try using mechanize. It's a very simple-to-use library that lets you manipulate web pages as if you were using a browser. With this library you could just simulate a click on the next button without having to grab its link address.

Upvotes: 0

Andrej Kesely
Andrej Kesely

Reputation: 195408

Try CSS selector [aria-label*="Siguiente"] for searching for next URL:

import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup


def get_url(position, location):
    """Return the Indeed MX search URL for *position* in *location*."""
    return "https://mx.indeed.com/jobs?q={}&l={}".format(position, location)


def get_record(card):
    """Extract a single job posting from one result card.

    Returns the 7-tuple
    (title, company, location, post date, scrape date, summary, url).
    """
    title = card.h2.span.get("title")        # visible title lives in the span's title attr
    link = "https://www.indeed.com" + card.get("href")
    company = card.find("span", "companyName").text
    location = card.find("div", "companyLocation").text
    summary = card.find("div", "job-snippet").text.strip()
    posted = card.find("span", "date").text
    scraped = datetime.today().strftime("%Y-%m-%d")

    return (title, company, location, posted, scraped, summary, link)


def main(position, location):
    """Scrape every result page for the search and write data.csv.

    Follows pagination via the CSS selector [aria-label*="Siguiente"]
    ("Siguiente" is the Spanish-locale "Next" button on mx.indeed.com).
    Fixes: column-name typo 'Comany' -> 'Company'; the next-page Tag is
    bound to its own name instead of being stuffed into `url`.
    """
    records = []
    url = get_url(position, location)

    while True:
        print(url)  # progress: show which page is being fetched
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all("a", "tapItem")

        for card in cards:
            records.append(get_record(card))

        # Stop when there is no next-page link; otherwise follow it.
        next_link = soup.select_one('[aria-label*="Siguiente"]')
        if not next_link:
            break
        url = "https://mx.indeed.com" + next_link["href"]

    df = pd.DataFrame(
        records,
        columns=[
            "JobTitle",
            "Company",
            "Location",
            "PostDate",
            "Date",
            "Summary",
            "URL",
        ],
    )
    print(df)
    df.to_csv("data.csv", index=False)


main("Python", "Monterrey")

Creates data.csv (screenshot from LibreOffice):

enter image description here

Upvotes: 1

Related Questions