AlSub

Reputation: 1155

How to index into a table tag in order to return a pandas df for a list of links?

I am trying to get the second table element for each link in a list and store the results as a pandas DataFrame. To accomplish this task I defined a function getCitySalaryTable():

from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd

job_title_urls=['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510',
                 'https://www.salario.com.br/profissao/abade-cbo-263105']

def getCitySalaryTable(job_title_urls, city_salary_df):

    for url in job_title_urls:

        original_url = url
        url = requests.get(url)
        soup = BeautifulSoup(url.text, 'lxml')

        tables = soup.find_all('table', attrs={'class': 'listas'})

        # I suspect the problem is here #
        city_salary_table = tables[1]

        #################################

        # extracting column names
        heads = city_salary_table.find('thead').find('tr').find_all('th')
        colnames = [hdr.text for hdr in heads]

        # extracting rows
        data = {k: [] for k in colnames}
        rows = city_salary_table.find('tbody').find_all('tr')
        for rw in rows:
            for col in colnames:
                cell = rw.find('td', attrs={'data-label': '{}'.format(col)})
                data[col].append(cell.text)
                #print(data)

        # Constructing a pandas dataframe using the data just parsed,
        # adding keys: cbo, job_title
        cbo = original_url.split('/')[-1].split('-')[-1]
        job_title = original_url.split('/')[-1].split('-')[0]

        df = pd.DataFrame.from_dict(data)

        df.insert(0, 'cbo', '')
        df['cbo'] = cbo

        df.insert(1, 'job_title', '')
        df['job_title'] = job_title

        city_salary_df = pd.concat([city_salary_df, df], ignore_index=True)

        return city_salary_df

However, when applied:

city_salary_df = pd.DataFrame()

city_salary_df = getCitySalaryTable(job_title_urls, city_salary_df)

It returns a dataframe for the first link only; I suspect that the index used in the function (city_salary_table = tables[1]) is not correct for the other links.

#      cbo      job_title  ... Salário/Hora Total
#0  612510  abacaxicultor  ...         6,16    29
#1  612510  abacaxicultor  ...         5,96     6
#2  612510  abacaxicultor  ...         6,03     4
#3  612510  abacaxicultor  ...        16,02     4
#4  612510  abacaxicultor  ...         4,75     3
#5  612510  abacaxicultor  ...         5,13     3

#[6 rows x 9 columns]

How can I properly tell the function to return the second table for all links?

Upvotes: 0

Views: 48

Answers (1)

QHarr

Reputation: 84465

Use :nth-of-type if it is truly the second table:

soup.select_one('table:nth-of-type(2)')

Though a class selector is faster than a type selector:

soup.select_one('.listas:nth-of-type(2)')

import requests
from bs4 import BeautifulSoup as bs

soup = bs(requests.get('https://www.salario.com.br/profissao/abacaxicultor-cbo-612510').text, 'lxml')
soup.select_one('.listas:nth-of-type(2)')
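As a quick sanity check (a minimal sketch using the question's two example URLs), you can print whether each page actually contains a matching table:

import requests
from bs4 import BeautifulSoup

# check which pages actually contain a second .listas table
for url in ['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510',
            'https://www.salario.com.br/profissao/abade-cbo-263105']:
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    table = soup.select_one('.listas:nth-of-type(2)')
    print(url, '->', 'found' if table is not None else 'missing')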

Your last link doesn't have that table, so add a check on whether city_salary_table is None:

from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd

job_title_urls=['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510',
                'https://www.salario.com.br/profissao/abade-cbo-263105',
                'https://www.salario.com.br/profissao/abadessa-cbo-263105',
                'https://www.salario.com.br/profissao/abanador-na-agricultura-cbo-622020']

def getCitySalaryTable(job_title_urls, city_salary_df):

    for url in job_title_urls:

        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'lxml')

        # I suspect the problem is here #
        city_salary_table = soup.select_one('.listas:nth-of-type(2)')

        #################################
        if city_salary_table is not None:
            # extracting column names
            heads= city_salary_table.find('thead').find('tr').find_all('th')
            colnames = [hdr.text for hdr in heads]

            # extracting rows 

            data = {k:[] for k in colnames}
            rows = city_salary_table.find('tbody').find_all('tr')
            for rw in rows:
                for col in colnames:
                    cell = rw.find('td', attrs={'data-label':'{}'.format(col)})
                    data[col].append(cell.text)
                    #print(data)

            # Constructing a pandas dataframe using the data just parsed,
            # adding keys: cbo, job_title
            cbo = url.split('/')[-1].split('-')[-1]
            job_title = url.split('/')[-1].split('-')[0]

            df = pd.DataFrame.from_dict(data)

            df.insert(0,'cbo','')
            df['cbo'] = cbo

            df.insert(1, 'job_title', '')
            df['job_title'] = job_title

            city_salary_df = pd.concat([city_salary_df, df], ignore_index=True)

    return city_salary_df
    
city_salary_df = pd.DataFrame()

city_salary_df = getCitySalaryTable(job_title_urls, city_salary_df)
print(city_salary_df)

Google Colab:

I think Google Colab is using an ancient version of soupsieve, so the NotImplementedError for :nth-of-type is not being surfaced. Instead, you can use city_salary_table = soup.select_one('table + table')
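For completeness, a minimal sketch of that fallback (same example page as above), selecting a table that immediately follows another table:

import requests
from bs4 import BeautifulSoup

url = 'https://www.salario.com.br/profissao/abacaxicultor-cbo-612510'
soup = BeautifulSoup(requests.get(url).text, 'lxml')

# 'table + table' is the adjacent-sibling combinator: it matches a <table>
# directly preceded by another <table>, and works on older soupsieve versions
city_salary_table = soup.select_one('table + table')
print(city_salary_table is not None)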

Upvotes: 1
