Reputation: 690
This one is an odd one. I ran this code this morning and it worked just fine on the HTML from the page. Now when I run it, the tables variable comes back with 0 items, so the for loop never runs and no data is collected or DataFrame created.
from bs4 import BeautifulSoup
import pandas as pd

def parseForclosure(pagesource):
    data = []
    soup = BeautifulSoup(pagesource, 'html.parser')
    tables = soup.find_all('table', attrs={'class': 'ad_tab'})
    print(len(tables))
    df2 = pd.DataFrame()
    for i in range(len(tables)):
        print(i)
        table_body = tables[i].find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele])
        data2 = {'AuctionType': [data[0]],
                 'CaseNo': [data[1]],
                 'FinalJudgmentAmount': [data[2]],
                 'ParcelID': [data[3]],
                 'PropertyAddress1': [data[4]],
                 'PropertyAddress2': [data[5]],
                 'AssessedValue': [data[6]],
                 'PlaintiffMaxBid': [data[7]]}
        df = pd.DataFrame(data2, columns=['AuctionType', 'CaseNo', 'FinalJudgmentAmount',
                                          'ParcelID', 'PropertyAddress1', 'PropertyAddress2',
                                          'AssessedValue', 'PlaintiffMaxBid'])
        df2 = df2.append(df)  # note: DataFrame.append was removed in pandas 2.0
        print(df)
    return df2
Here is the call:
df = parseForclosure(source)
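As a quick sanity check before parsing, I can confirm whether the expected markup is still in the raw source at all; if the site now renders the tables with JavaScript, or my session expired, the class will simply not be there. A minimal check (assuming source is the raw HTML string that was fetched):

# Sanity check: is the expected table markup present in the raw HTML at all?
if 'ad_tab' not in source:
    print('No ad_tab tables found; the page may be rendered by JavaScript '
          'or the session may have expired.')
else:
    print(source.count('ad_tab'), 'occurrences of ad_tab in the source')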
Here is a snippet of what the HTML looks like:
<table class="ad_tab" tabindex="0"><tbody><tr><th class="AD_LBL" scope="row">Auction Type:</th><td class="AD_DTA">FORECLOSURE</td></tr><tr><th aria-label="Case Number" class="AD_LBL" scope="row">Case #:</th><td class="AD_DTA"><a href="/index.cfm?zaction=auction&zmethod=details&AID=103757&bypassPage=1">07009032CA01</a></td></tr><tr><th class="AD_LBL" scope="row">Final Judgment Amount:</th><td class="AD_DTA">$323,248.61</td></tr><tr><th class="AD_LBL" scope="row">Parcel ID:</th><td class="AD_DTA">30-6901-001-2470</td></tr><tr><th class="AD_LBL" scope="row">Property Address:</th><td class="AD_DTA">12260 SW 191 ST</td></tr><tr><th class="AD_LBL" scope="row"></th><td class="AD_DTA">MIAMI, FL- 33177</td></tr> <tr><th class="AD_LBL" scope="row">Assessed Value:</th><td class="AD_DTA">$184,791.00</td></tr><tr><th class="AD_LBL" scope="row">Plaintiff Max Bid:</th><td class="AD_DTA ASTAT_MSGPB">Hidden</td></tr></tbody></table>
You can see a sample of all the tables at the link below.
https://projectcodesamples.s3.amazonaws.com/AuctionSample.html
My objective is to place the data points into a DataFrame.
Sample file with missing data points:
This is a sample file with all data points:
Sample_file_with_no_missing_data_points
Ideally, I should be able to extract from both without the shape of the DataFrame changing.
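One way to keep the shape fixed is to build each record as a dict and reindex it against the full column list, so missing fields come back empty instead of shifting everything. A minimal sketch (the record below is illustrative):

import pandas as pd

columns = ['AuctionType', 'CaseNo', 'FinalJudgmentAmount', 'ParcelID',
           'PropertyAddress1', 'PropertyAddress2', 'AssessedValue', 'PlaintiffMaxBid']

# A record scraped from a table that happened to lack several fields
record = {'AuctionType': 'FORECLOSURE', 'CaseNo': '07009032CA01'}

# reindex keeps all eight columns, filling the absent ones with ''
df = pd.DataFrame([record]).reindex(columns=columns, fill_value='')
print(df.shape)  # (1, 8) regardless of which fields were present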
Upvotes: 0
Views: 94
Reputation: 1610
Let's say you have three HTML files containing the data you have provided since first posting your question:
I used the following updated code to combine all the data into one DataFrame:
import io
import csv
from bs4 import BeautifulSoup
import pandas as pd

input_files_names = [
    'Source.html',
    'Source2.html',
    'Source3.html'
]

# Maps each output column to the visible row label in the HTML tables.
field_labels = {
    'AuctionType': 'Auction Type:',
    'CaseNo': 'Case #:',
    'JudgementAmount': 'Final Judgment Amount:',
    'ParcelID': 'Parcel ID:',
    'AssessedValue': 'Assessed Value:',
    'PlaintiffMaxBid': 'Plaintiff Max Bid:'
}

column_names = (
    'AuctionType',
    'CaseNo',
    'JudgementAmount',
    'ParcelID',
    'PropertyAddress1',
    'PropertyAddress2',
    'AssessedValue',
    'PlaintiffMaxBid'
)

def extract_data(soup):
    for current_table in soup.find_all('table', class_='ad_tab'):
        current_auction = {}
        # Look up each labelled field; skip it if the row is absent in this table.
        for (current_field, current_label) in field_labels.items():
            current_field_cell = current_table.tbody.find('th', string=current_label)
            if current_field_cell is not None:
                current_data_cell = current_field_cell.next_sibling
                current_auction[current_field] = current_data_cell.get_text()
        # The address spans two rows; the second row has an empty <th>.
        address_row = current_table.tbody.find('th', string='Property Address:')
        if address_row is not None:
            current_auction['PropertyAddress1'] = address_row.find_next_sibling('td').get_text()
            address2_row = address_row.parent.next_sibling.td
            if address2_row is not None:
                current_auction['PropertyAddress2'] = address2_row.get_text()
        # Missing fields default to '' so every record has the same eight columns.
        yield tuple(current_auction.get(current_field, '')
                    for current_field in column_names)

def setup_dataframes(files_names):
    for current_file_name in files_names:
        with open(current_file_name) as source_file:
            soup = BeautifulSoup(source_file, 'html.parser')
        # Stage the extracted rows in an in-memory CSV, then let pandas parse it.
        with io.StringIO() as intermediate_data:
            intermediate_csv = csv.writer(intermediate_data)
            intermediate_csv.writerows(extract_data(soup))
            intermediate_data.seek(0, 0)
            df = pd.read_csv(intermediate_data, header=None, names=column_names)
        yield df

df_composite = pd.concat(setup_dataframes(input_files_names), ignore_index=True)
print(df_composite)
What has been done here is:

- field_labels maps each output column to the visible row label, so a field that is missing from a table is simply skipped instead of shifting the remaining data.
- extract_data yields one tuple per table, filling absent fields with an empty string so every record has the same eight columns.
- The two address rows are handled separately, since the second one has an empty label cell.
- Each file's records are staged in an in-memory CSV, read back into a per-file DataFrame, and the DataFrames are combined with pd.concat.
If you are processing a lot of data, you may consider writing to a real file instead of using an in-memory one.
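For example, the io.StringIO stage could be swapped for a temporary file on disk with only small changes; a sketch of just that intermediate step (extract_data, soup and column_names as defined above):

import csv
import tempfile
import pandas as pd

# Stage the intermediate CSV on disk instead of in memory
with tempfile.NamedTemporaryFile('w+', newline='', suffix='.csv') as intermediate:
    csv.writer(intermediate).writerows(extract_data(soup))
    intermediate.seek(0)
    df = pd.read_csv(intermediate, header=None, names=column_names)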
Upvotes: 2