Reputation: 1155
I am trying to get the second table's elements for a list of links and store them as a pandas dataframe. To accomplish this task I defined a function getCitySalaryTable():
# Third-party libraries: HTTP fetching (requests), HTML parsing
# (BeautifulSoup backed by lxml) and tabular data handling (pandas).
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
# Profession pages to scrape; the final path segment is a slug of the
# form "<job-title>-cbo-<code>" (CBO = Brazilian occupation code).
job_title_urls=['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510',
'https://www.salario.com.br/profissao/abade-cbo-263105']
def getCitySalaryTable(job_title_urls, city_salary_df):
    """Scrape the per-city salary table from each profession page.

    Parameters
    ----------
    job_title_urls : list[str]
        Profession page URLs whose last path segment is a slug of the
        form "<job-title>-cbo-<code>".
    city_salary_df : pandas.DataFrame
        Accumulator frame; scraped rows are concatenated onto it.

    Returns
    -------
    pandas.DataFrame
        The accumulator with one row per city entry, with 'cbo' and
        'job_title' columns prepended to identify the profession.
    """
    for url in job_title_urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        tables = soup.find_all('table', attrs={'class': 'listas'})
        # Some profession pages have no city-salary table at all;
        # skip them instead of crashing on tables[1].
        if len(tables) < 2:
            continue
        city_salary_table = tables[1]
        # Column names come from the header row.
        heads = city_salary_table.find('thead').find('tr').find_all('th')
        colnames = [hdr.text for hdr in heads]
        # Each <td> carries a data-label attribute naming its column.
        data = {k: [] for k in colnames}
        rows = city_salary_table.find('tbody').find_all('tr')
        for rw in rows:
            for col in colnames:
                cell = rw.find('td', attrs={'data-label': '{}'.format(col)})
                data[col].append(cell.text)
        # Split on the literal "-cbo-" marker: the job-title part of the
        # slug may itself contain hyphens (e.g.
        # "abanador-na-agricultura-cbo-622020"), so splitting on every
        # '-' and taking [0] would truncate the title.
        slug = url.split('/')[-1]
        job_title, _, cbo = slug.rpartition('-cbo-')
        df = pd.DataFrame.from_dict(data)
        df.insert(0, 'cbo', cbo)
        df.insert(1, 'job_title', job_title)
        city_salary_df = pd.concat([city_salary_df, df], ignore_index=True)
    # Return AFTER the loop so every URL contributes, not just the first.
    return city_salary_df
However when applied:
# Start from an empty accumulator and collect tables for every URL.
city_salary_df = pd.DataFrame()
city_salary_df = getCitySalaryTable(job_title_urls, city_salary_df)
It returns a dataframe just for the first link. I suspect that the index used in the function (city_salary_table = tables[1]) is not correct for the other links.
# cbo job_title ... Salário/Hora Total
#0 612510 abacaxicultor ... 6,16 29
#1 612510 abacaxicultor ... 5,96 6
#2 612510 abacaxicultor ... 6,03 4
#3 612510 abacaxicultor ... 16,02 4
#4 612510 abacaxicultor ... 4,75 3
#5 612510 abacaxicultor ... 5,13 3
#[6 rows x 9 columns]
How could I properly tell the function to return me just the second table for all links?
Upvotes: 0
Views: 48
Reputation: 84465
Use nth-of-type if it is truly the 2nd table
soup.select_one('table:nth-of-type(2)')
Though a class selector is faster than a type selector:
soup.select_one('.listas:nth-of-type(2)')
# Minimal demonstration: fetch one page and select the second table.
import requests  # fixed: was "import request" (no such module; the code below uses requests)
from bs4 import BeautifulSoup as bs
soup = bs(requests.get('https://www.salario.com.br/profissao/abacaxicultor-cbo-612510').text, 'lxml')
soup.select_one('.listas:nth-of-type(2)')
Your last link doesn't have that table, so add a check on whether city_salary_table is None:
# Third-party libraries: HTTP fetching, HTML parsing and dataframes.
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
# Extended URL list: some of these pages lack the city-salary table,
# which is why the None check inside the function is needed.
job_title_urls=['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510',
'https://www.salario.com.br/profissao/abade-cbo-263105',
'https://www.salario.com.br/profissao/abadessa-cbo-263105',
'https://www.salario.com.br/profissao/abanador-na-agricultura-cbo-622020']
def getCitySalaryTable(job_title_urls, city_salary_df):
    """Scrape the per-city salary table (the 2nd 'listas' table) from
    each profession page and append its rows to the accumulator.

    Parameters
    ----------
    job_title_urls : list[str]
        Profession page URLs whose last path segment is a slug of the
        form "<job-title>-cbo-<code>".
    city_salary_df : pandas.DataFrame
        Accumulator frame; scraped rows are concatenated onto it.

    Returns
    -------
    pandas.DataFrame
        The accumulator with 'cbo' and 'job_title' columns prepended.
    """
    for url in job_title_urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        # The city table is the 2nd table of its type; pages without
        # one make select_one return None, which we skip below.
        city_salary_table = soup.select_one('.listas:nth-of-type(2)')
        if city_salary_table is not None:
            # Column names come from the header row.
            heads = city_salary_table.find('thead').find('tr').find_all('th')
            colnames = [hdr.text for hdr in heads]
            # Each <td> carries a data-label attribute naming its column.
            data = {k: [] for k in colnames}
            rows = city_salary_table.find('tbody').find_all('tr')
            for rw in rows:
                for col in colnames:
                    cell = rw.find('td', attrs={'data-label': '{}'.format(col)})
                    data[col].append(cell.text)
            # Split on the literal "-cbo-" marker so multi-word slugs
            # like "abanador-na-agricultura-cbo-622020" keep the full
            # job title instead of only the first hyphenated word.
            slug = url.split('/')[-1]
            job_title, _, cbo = slug.rpartition('-cbo-')
            df = pd.DataFrame.from_dict(data)
            df.insert(0, 'cbo', cbo)
            df.insert(1, 'job_title', job_title)
            city_salary_df = pd.concat([city_salary_df, df], ignore_index=True)
    return city_salary_df
# Build the combined dataframe across all URLs and display it.
city_salary_df = pd.DataFrame()
city_salary_df = getCitySalaryTable(job_title_urls, city_salary_df)
print(city_salary_df)
Google colab:
I think that Google Colab is using an ancient version of soupsieve and we are not seeing the not implemented error being reported for nth-of-type. Instead, you can use city_salary_table = soup.select_one('table + table')
Upvotes: 1