Bun
Bun

Reputation: 3137

How to parse this html structure using BeautifulSoup?

I would like to parse this TABLE row by row and save it to a CSV file. What I have tried so far produces nothing usable in the CSV file:

Django:

`data_scrapper` makes a request to Yahoo Finance and returns the HTML shown below.

def button_clicked(request):
    """Return the scraped table as a CSV attachment in an HttpResponse.

    The markup comes from ``data_scrapper(symbol)``; the text of every
    ``<th>`` is gathered into ``headers`` and the text of every ``<td>``
    into ``rows``, and both are then written through ``csv.writer``.

    NOTE(review): ``symbol`` is not assigned anywhere in this function and
    ``request`` is never read -- presumably ``symbol`` is a module-level
    name; confirm against the rest of the file.
    """
    headers = []
    rows = []
    gen_table = data_scrapper(symbol)
    # No parser is named here, so BeautifulSoup chooses one itself.
    soup = BeautifulSoup(gen_table)
    # This result is never used: the loop below rebinds ``table``.
    table = soup.find_all('table')
    # find_all() searches all descendants, so when one table is nested
    # inside another the inner <th> cells are collected once per
    # enclosing table (i.e. duplicated).
    for table in soup.find_all('table'):
        headers.extend([header.text for header in table.find_all('th')])
    # extend() appends each cell's text as a separate element, so ``rows``
    # ends up as one flat list of strings rather than a list of rows.
    for row in soup.find_all('tr'):
        rows.extend([val.text for val in row.find_all('td')])

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename= "{}.csv"'.format(symbol)

    writer = csv.writer(response)
    writer.writerow(headers)
    # Each element of ``rows`` is a string, and csv.writer iterates a
    # string character by character, so every character becomes its own
    # field. ``if row`` only drops empty strings.
    writer.writerows(row for row in rows if row)

    return response

html:

<TABLE class="yfnc_tabledata1" width="100%" cellpadding="0" cellspacing="0" border="0">
  <TR>
    <TD>
      <TABLE width="100%" cellpadding="2" cellspacing="0" border="0">
        <TR class="yfnc_modtitle1" style="border-top:none;">
          <td colspan="2" style="border-top:2px solid #000;">
            <small>
              <span class="yfi-module-title">Period Ending</span>
            </small>
          </td>
          <th scope="col" style="border-top:2px solid #000;text-align:right; font-weight:bold">Dec 31, 2014</th>
          <th scope="col" style="border-top:2px solid #000;text-align:right; font-weight:bold">Dec 31, 2013</th>
          <th scope="col" style="border-top:2px solid #000;text-align:right; font-weight:bold">Dec 31, 2012</th>
        </TR>
        <tr>
          <td colspan="2">
            <strong>
              Total Revenue
            </strong>
          </td>
          <td align="right">
            <strong>
              4,479,648&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              3,777,068&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              3,209,782&nbsp;&nbsp;
            </strong>
          </td>
        </tr>
        <tr>
          <td colspan="2">Cost of Revenue</td>
          <td align="right">3,160,470&nbsp;&nbsp;</td>
          <td align="right">2,656,189&nbsp;&nbsp;</td>
          <td align="right">2,284,485&nbsp;&nbsp;</td>
        </tr>
        <tr>
          <td colspan="5" style="height:0;padding:0; border-top:3px solid #333;">
            <span style="display:block; width:5px; height:1px;"></span>
          </td>
        </tr>
        <tr>
          <td colspan="2">
            <strong>
              Gross Profit
            </strong>
          </td>
          <td align="right">
            <strong>
              1,319,178&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              1,120,879&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              925,297&nbsp;&nbsp;
            </strong>
          </td>
        </tr>
        <tr>
          <td colspan="5" style="height:0;padding:0; ">
            <span style="display:block; width:5px; height:10px;"></span>
          </td>
        </tr>
        <tr>
          <td>
            <spacer type="block" height="1" width="1" />
          </td>
          <td class="yfnc_d" colspan="4">Operating Expenses</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Research Development</td>
          <td align="right">148,458&nbsp;&nbsp;</td>
          <td align="right">139,193&nbsp;&nbsp;</td>
          <td align="right">127,361&nbsp;&nbsp;</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Selling General and Administrative</td>
          <td align="right">456,030&nbsp;&nbsp;</td>
          <td align="right">403,772&nbsp;&nbsp;</td>
          <td align="right">319,511&nbsp;&nbsp;</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Non Recurring</td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Others</td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
        </tr>
        <tr>
          <td>
            <spacer type="block" height="1" width="1" />
          </td>
          <td colspan="5" style="height:0; padding:0; " class="yfnc_d">
            <span style="display:block; width:5px; height:1px;"></span>
          </td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Total Operating Expenses</td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
        </tr>
        <tr>
          <td colspan="5" style="height:0;padding:0; ">
            <span style="display:block; width:5px; height:10px;"></span>
          </td>
        </tr>
        <tr>
          <td colspan="5" style="height:0;padding:0; border-top:3px solid #333;">
            <span style="display:block; width:5px; height:1px;"></span>
          </td>
        </tr>
        <tr>
          <td colspan="2">
            <strong>
              Operating Income or Loss
            </strong>
          </td>
          <td align="right">
            <strong>
              714,690&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              577,914&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              478,425&nbsp;&nbsp;
            </strong>
          </td>
        </tr>
        <tr>
          <td colspan="5" style="height:0;padding:0; ">
            <span style="display:block; width:5px; height:10px;"></span>
          </td>
        </tr>
        <tr>
          <td>
            <spacer type="block" height="1" width="1" />
          </td>
          <td class="yfnc_d" colspan="4">Income from Continuing Operations</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Total Other Income/Expenses Net</td>
          <td align="right">(10)</td>
          <td align="right">5,139&nbsp;&nbsp;</td>
          <td align="right">7,529&nbsp;&nbsp;</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Earnings Before Interest And Taxes</td>
          <td align="right">710,556&nbsp;&nbsp;</td>
          <td align="right">580,639&nbsp;&nbsp;</td>
          <td align="right">485,775&nbsp;&nbsp;</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Interest Expense</td>
          <td align="right">11,239&nbsp;&nbsp;</td>
          <td align="right">6,210&nbsp;&nbsp;</td>
          <td align="right">5,932&nbsp;&nbsp;</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Income Before Tax</td>
          <td align="right">699,317&nbsp;&nbsp;</td>
          <td align="right">574,429&nbsp;&nbsp;</td>
          <td align="right">479,843&nbsp;&nbsp;</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Income Tax Expense</td>
          <td align="right">245,288&nbsp;&nbsp;</td>
          <td align="right">193,360&nbsp;&nbsp;</td>
          <td align="right">167,533&nbsp;&nbsp;</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Minority Interest</td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
        </tr>
        <tr>
          <td>
            <spacer type="block" height="1" width="1" />
          </td>
          <td colspan="5" style="height:0; padding:0; " class="yfnc_d">
            <span style="display:block; width:5px; height:1px;"></span>
          </td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Net Income From Continuing Ops</td>
          <td align="right">454,029&nbsp;&nbsp;</td>
          <td align="right">381,069&nbsp;&nbsp;</td>
          <td align="right">312,310&nbsp;&nbsp;</td>
        </tr>
        <tr>
          <td colspan="5" style="height:0;padding:0; ">
            <span style="display:block; width:5px; height:10px;"></span>
          </td>
        </tr>
        <tr>
          <td>
            <spacer type="block" height="1" width="1" />
          </td>
          <td class="yfnc_d" colspan="4">Non-recurring Events</td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Discontinued Operations</td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">(3,777)</td>
          <td align="right">
            - &nbsp;
          </td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Extraordinary Items</td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Effect Of Accounting Changes</td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
        </tr>
        <tr>
          <td width="30" class="yfnc_tabledata1">
            <spacer type="block" width="30" height="1" />
          </td>
          <td>Other Items</td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
        </tr>
        <tr>
          <td colspan="5" style="height:0;padding:0; ">
            <span style="display:block; width:5px; height:10px;"></span>
          </td>
        </tr>
        <tr>
          <td colspan="5" style="height:0;padding:0; border-top:3px solid #333;">
            <span style="display:block; width:5px; height:1px;"></span>
          </td>
        </tr>
        <tr>
          <td colspan="2">
            <strong>
              Net Income
            </strong>
          </td>
          <td align="right">
            <strong>
              454,029&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              377,292&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              312,310&nbsp;&nbsp;
            </strong>
          </td>
        </tr>
        <tr>
          <td colspan="2">Preferred Stock And Other Adjustments</td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
          <td align="right">
            - &nbsp;
          </td>
        </tr>
        <tr>
          <td colspan="5" style="height:0;padding:0; border-top:3px solid #333;">
            <span style="display:block; width:5px; height:1px;"></span>
          </td>
        </tr>
        <tr>
          <td colspan="2">
            <strong>
              Net Income Applicable To Common Shares
            </strong>
          </td>
          <td align="right">
            <strong>
              454,029&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              377,292&nbsp;&nbsp;
            </strong>
          </td>
          <td align="right">
            <strong>
              312,310&nbsp;&nbsp;
            </strong>
          </td>
        </tr>
      </TABLE>
    </TD>
  </TR>
</TABLE>

Upvotes: 1

Views: 130

Answers (1)

cameron-f
cameron-f

Reputation: 431

Here's some code that makes a CSV that looks like the table. The CSVs I usually work with have each row as a complete record, so all the values in column one would be the CSV header. It's just something to think about; it might be helpful.

Python 3.4

from bs4 import BeautifulSoup
import re
import csv

def button_clicked(request, filename):
    """Write the rows of the HTML table in *request* to a CSV file.

    Args:
        request: HTML markup (string) containing the table. When the first
            ``<table>`` wraps a second one, the nested table is used;
            otherwise the first table itself is used.
        filename: Path of the CSV file to create or overwrite.

    Raises:
        ValueError: If the markup contains no ``<table>`` element.

    Each ``<tr>`` becomes one CSV record built from its ``<td>``/``<th>``
    cells. Whitespace inside a cell (including non-breaking spaces) is
    collapsed, empty spacer cells are dropped, and rows left with no
    content are skipped.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(request, 'html.parser')

    outer = soup.find('table')
    if outer is None:
        raise ValueError('no <table> element found in the supplied markup')
    inner = outer.find('table')
    table = outer if inner is None else inner

    # newline='' is required by the csv module; without it, extra blank
    # lines appear between records on Windows.
    with open(filename, 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)

        for t_row in table.find_all('tr'):
            # Read the cells directly instead of joining the row text on a
            # sentinel character and splitting it again, which breaks when
            # a cell contains that character or an internal newline.
            # str.split() with no arguments also treats '\xa0' (&nbsp;)
            # as whitespace, so the padding after the numbers disappears.
            cell_texts = (' '.join(cell.get_text().split())
                          for cell in t_row.find_all(['td', 'th']))
            record = [text for text in cell_texts if text]
            if record:
                spamwriter.writerow(record)

Creates a file that looks like:

Period Ending,"Dec 31, 2014","Dec 31, 2013","Dec 31, 2012"
Total Revenue,"4,479,648","3,777,068","3,209,782"
Cost of Revenue,"3,160,470","2,656,189","2,284,485"
Gross Profit,"1,319,178","1,120,879","925,297"
Operating Expenses
Research Development,"148,458","139,193","127,361"
Selling General and Administrative,"456,030","403,772","319,511"
Non Recurring,-,-,-
Others,-,-,-
Total Operating Expenses,-,-,-
Operating Income or Loss,"714,690","577,914","478,425"
Income from Continuing Operations
Total Other Income/Expenses Net,(10),"5,139","7,529"
Earnings Before Interest And Taxes,"710,556","580,639","485,775"
Interest Expense,"11,239","6,210","5,932"
Income Before Tax,"699,317","574,429","479,843"
Income Tax Expense,"245,288","193,360","167,533"
Minority Interest,-,-,-
Net Income From Continuing Ops,"454,029","381,069","312,310"
Non-recurring Events
Discontinued Operations,-,"(3,777)",-
Extraordinary Items,-,-,-
Effect Of Accounting Changes,-,-,-
Other Items,-,-,-
Net Income,"454,029","377,292","312,310"
Preferred Stock And Other Adjustments,-,-,-
Net Income Applicable To Common Shares,"454,029","377,292","312,310"

Upvotes: 1

Related Questions