Pavan Suvarna
Pavan Suvarna

Reputation: 501

how to web scrape HTML table from URL page using Beautiful soup and write it into CSV

I am trying to scrape an HTML table from a page, but I am not getting the proper output. Here is my sample code:

import datetime, time
import os
from bs4 import BeautifulSoup
import requests 

URL = r'https://www.data.bsee.gov/Production/OCSProduction/Default.aspx'

# Fetch the page and parse it with the lxml backend, then look for the
# production table by its CSS class.
response = requests.get(URL)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find('table', {'class': 'dxsplControl'})

# One inner list of cell texts per <tr> row of the table.
rows = [
    [cell.text for cell in row.findAll(["td"])]
    for row in table.findAll('tr')
]

# Print each row's cells separated by single spaces.
for cells in rows:
    print(' '.join(cells))

I am not sure what is wrong here, as I am new to Python. I copied this code from the web and tried adapting it myself. Please help — thanks.

This is how my output file should look:

Year    Alaska      Pacific     Gulf        Total  
2009    1,823,426   22,306,167  570,309,328 594,438,921 
2010    1,337,999   21,707,342  566,628,383 589,673,724 
2011    1,057,866   19,820,270  481,697,096 502,575,232 
2012    627,108     17,678,493  464,786,485 483,092,086 
2013    669,148     18,565,833  459,046,740 478,281,721 
2014    625,303     18,506,540  510,467,459 529,599,302 
2015    609,912     11,451,040  553,007,049 565,068,001 
2016    546,340     6,142,614   585,353,426 592,042,380 
2017    517,002    5,713,059    613,314,985 619,545,046 
2018    498,216    4,873,812    642,155,689 647,527,717 
    
Year    Alaska       Pacific       Gulf          Total  
2009    29,609,015  41,282,897  2,451,076,806   2,521,968,718   
2010    29,659,633  41,251,142  2,250,426,803   2,321,337,578   
2011    37,801,877  36,591,564  1,826,593,930   1,900,987,371   
2012    21,960,989  27,263,741  1,535,897,665   1,585,122,395   
2013    29,293,586  27,505,401  1,328,279,728   1,385,078,715   
2014    31,264,462  28,313,384  1,276,676,600   1,336,254,446   
2015    32,249,585  14,808,085  1,307,390,047   1,354,447,717   
2016    31,705,685  4,501,303   1,220,581,120   1,256,788,108   
2017    29,056,185  3,949,957   1,078,657,857   1,111,663,999   
2018    3,211,259   3,427,708   993,571,711 1,000,210,678   

Total four tables are there in the single page I need all 4 tables

Upvotes: 0

Views: 146

Answers (1)

Jack Fleeting
Jack Fleeting

Reputation: 24940

The basic problem is that the data on this page is loaded dynamically using JavaScript, so requests can't handle it directly. Using the developer tab in your browser, you should be able to locate the request headers and payload and reproduce the request. It's long and convoluted, but it looks like this:

from bs4 import BeautifulSoup as bs
import requests

# NOTE(review): these cookies were copied from a live browser session via the
# developer tools. They are tied to one ASP.NET session and will expire, so
# they must be re-captured before re-running this script.
cookies = {
    'ASP.NET_SessionId': 'lswqqprulnnbiwaogfdr4gwm',
    'TS0150405d_28': '01637e37f25282f82ade210ba547d80a84eec306ae482b02970c0abf42121713d81f9542bda8cf1201086f223d14bba7cfcce93cf3',
    '__utma': '23215407.1077924861.1577531617.1577531617.1577531617.1',
    '__utmc': '23215407',
    '__utmz': '23215407.1577531617.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
    'TS0150405d': '01b61ca5dd600357b599595716d5f51ae05793bd76c76cfb349561b68c8af4b61a7457db3203499821f2378aa4d705f8dcdcf9549c19e698681a5ec36c03de3fdd99856da6',
    '__utmt': '1',
    '__utmb': '23215407.5.10.1577531617',
}

# Request headers mimicking the browser's original XHR call (same-origin
# form-encoded POST to the ASPX page).
headers = {
    'Connection': 'keep-alive',
    'Origin': 'https://www.data.bsee.gov',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': '*/*',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.data.bsee.gov/Production/OCSProduction/Default.aspx',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,hi;q=0.8,he;q=0.7',
}

# ASP.NET WebForms postback payload. __VIEWSTATE / __EVENTVALIDATION are
# server-generated tokens captured alongside the cookies above; like the
# cookies, they are session-specific and must be refreshed together.
data = {
  '__EVENTTARGET': '',
  '__EVENTARGUMENT': '',
  '__VIEWSTATE': '/wEPDwUJLTEzNjg1MDkxD2QWAmYPZBYCAgMPZBYMAg0PPCsABAEADxYCHgVWYWx1ZQUuT3V0ZXIgQ29udGluZW50YWwgU2hlbGYgT2lsIGFuZCBHYXMgUHJvZHVjdGlvbmRkAg8PPCsABAEADxYEHwAFLGRhdGEgbGFzdCB1cGRhdGVkOiAgMTItMDItMjAxOSAxMDo1NiBBTShDU1QpHgdWaXNpYmxlZ2RkAhEPFCsABA8WAh8BZ2RkZDwrAAUBABYCHghJbWFnZVVybAUfL2JzZWUvaW1hZ2VzL2ljb25zL3R1dG9yaWFsLnBuZ2QCEw88KwAEAQAPFgQfAAUEbm9uZR8BaGRkAhUPPCsABAEADxYCHwFoZGQCGw9kFgICAQ9kFgJmD2QWAgIBDzwrABMCDhQrAAJkZBIUKwACZGRkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYEBTZjdGwwMCRDb250ZW50UGxhY2VIb2xkZXIxJEFTUHhEb2N1bWVudFZpZXdlcjEkU3BsaXR0ZXIFQ2N0bDAwJENvbnRlbnRQbGFjZUhvbGRlcjEkQVNQeERvY3VtZW50Vmlld2VyMSRTcGxpdHRlciRUb29sYmFyJE1lbnUFXGN0bDAwJENvbnRlbnRQbGFjZUhvbGRlcjEkQVNQeERvY3VtZW50Vmlld2VyMSRTcGxpdHRlciRUb29sYmFyJE1lbnUkSVRDTlQ1JFRDJFBhZ2VOdW1iZXIkREREBV1jdGwwMCRDb250ZW50UGxhY2VIb2xkZXIxJEFTUHhEb2N1bWVudFZpZXdlcjEkU3BsaXR0ZXIkVG9vbGJhciRNZW51JElUQ05UMTEkVEMkU2F2ZUZvcm1hdCRERETbZJk56ZSqsovdX30xSaDVrByvbw0WKagiU8imDl5u7w==',
  '__VIEWSTATEGENERATOR': '92AE0737',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_CS': '[{"st":"px","s":30,"c":0,"spt":0,"spl":0},{"i":[{"s":100,"st":"%","c":0,"spt":0,"spl":0},{"st":"px","s":195,"c":1,"i":[{},{"c":1}]}],"s":234,"st":"px","c":0}]',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_Toolbar_Menu_ITCNT5_PageNumber_VI': '',
  'ctl00$ContentPlaceHolder1$ASPxDocumentViewer1$Splitter$Toolbar$Menu$ITCNT5$TC$PageNumber': '',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_Toolbar_Menu_ITCNT5_PageNumber_DDDWS': '0:0:-1:-10000:-10000:0:-10000:-10000:1:0:0:0',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_Toolbar_Menu_ITCNT5_PageNumber_DDD_LDeletedItems': '',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_Toolbar_Menu_ITCNT5_PageNumber_DDD_LInsertedItems': '',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_Toolbar_Menu_ITCNT5_PageNumber_DDD_LCustomCallback': '',
  'ctl00$ContentPlaceHolder1$ASPxDocumentViewer1$Splitter$Toolbar$Menu$ITCNT5$TC$PageNumber$DDD$L': '',
  'ctl00$ContentPlaceHolder1$ASPxDocumentViewer1$Splitter$Toolbar$Menu$ITCNT6$TC$PageCount': '1',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_Toolbar_Menu_ITCNT11_SaveFormat_VI': 'pdf',
  'ctl00$ContentPlaceHolder1$ASPxDocumentViewer1$Splitter$Toolbar$Menu$ITCNT11$TC$SaveFormat': 'Pdf',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_Toolbar_Menu_ITCNT11_SaveFormat_DDDWS': '0:0:-1:-10000:-10000:0:-10000:-10000:1:0:0:0',
  'ctl00$ContentPlaceHolder1$ASPxDocumentViewer1$Splitter$Toolbar$Menu$ITCNT11$TC$SaveFormat$DDD$L': 'pdf',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_ViewerDXCurrentPageIndex': '0',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_ViewerDXCacheKey': '',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_ViewerDXParameters': '',
  'ContentPlaceHolder1_ASPxDocumentViewer1_Splitter_ViewerDXRemote': '',
  'DXScript': '1_171,1_94,9_13,9_10,9_8,1_164,1_163,9_12,1_91,1_156,1_162,1_147,1_114,1_121,1_113,1_154,1_116,9_9',
  'DXCss': '1_4,1_12,1_5,1_3,9_1,9_0,1_10,1_1,9_15,/bsee/images/icons/favicon.ico,/bsee/css/bsee-omega.normalize.css,/bsee/css/bsee-data.styles.css,/bsee/css/shared2.css,/bsee/css/print-home.css',
  '__CALLBACKID': 'ctl00$ContentPlaceHolder1$ASPxDocumentViewer1',
  '__CALLBACKPARAM': 'c0:page=',
  '__EVENTVALIDATION': '/wEdAATKS2LB9FVlH2d4YygEMTsmyfTLKEoPzNfdNhCl7u7HwTUjb+lHE27hg3/NFaBFsDfbbKHXuEdfelZR8GZlxl53eRZ6HcPvKP/62aL4PyBUNwxqU80mnvmx3iUhwGUrLFM='
}

# Replay the browser's callback; the response body contains the rendered
# table markup that the plain GET never sees.
res = requests.post('https://www.data.bsee.gov/Production/OCSProduction/Default.aspx', headers=headers, cookies=cookies, data=data)

soup = bs(res.text,"lxml")
# First <table> in the callback response holds the production data.
table = soup.find('table')

From this point on, your code can take over:

# Collect the text of every <td> cell, grouped row by row.
all_rows = [
    [td.text for td in tr.findAll(["td"])]
    for tr in table.findAll('tr')
]

# Emit one line per row, cells separated by single spaces.
for cells in all_rows:
    print(' '.join(cells))

And the output is the four tables on the page.

Upvotes: 2

Related Questions