Chris
Chris

Reputation: 69

Python Beautiful Soup table scrape

I have been looking to scrape an HTML table from an oil-production SSRS feed online. I have managed to learn a bit of Beautiful Soup/Python to get to the point I am currently at, but I think I need a little assistance to just get it finished.

The aim is to scrape the table (which is all tagged) and output JSON data. I have JSON-formatted output for the 10 headers, but it repeats the same data-row cell value for every header. I think the issue is in how I iterate through the cells when assigning them to the headers. I'm sure it will make sense when run.

Any assistance would be greatly appreciated, trying to learn just what I have done wrong as this is pretty new to me.

Cheers

import json
from bs4 import BeautifulSoup
import urllib.request
import boto3       # kept from the original; not used in this script
import botocore    # kept from the original; not used in this script

# URL to scrape.  NOTE: the original split this literal across several
# physical lines without continuation, which is a SyntaxError in Python;
# adjacent string literals inside parentheses are concatenated instead.
URL = ('http://factpages.npd.no/ReportServer?/FactPages/TableView/'
       'field_production_monthly&rs:Command=Render&rc:Toolbar=false'
       '&rc:Parameters=f&Top100=True&IpAddress=108.171.128.174'
       '&CultureCode=en')

# Browser-like User-Agent to avoid naive scraping-bot detection.
USER_AGENT = ('Mozilla/5 (Macintosh; Intel Mac OS X 10_9_3) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/35.0.1916.47 Safari/537.36')

# CSS classes the SSRS report assigns to the data cells of the table.
CELL_CLASSES = ['a61cl', 'a65cr', 'a69cr', 'a73cr', 'a77cr', 'a81cr',
                'a85cr', 'a89cr', 'a93cr', 'a97cr']


def fetch_report(url=URL, timeout=20):
    """Request the report page and return the open HTTP response.

    The response object is file-like, so it can be fed straight to
    BeautifulSoup.
    """
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    return urllib.request.urlopen(req, timeout=timeout)


def parse_table(soup):
    """Return the table as a list of dicts, one per data row.

    The original bug: ``for row in rows[i]`` iterated a single cell, and
    ``item[headers[index]] = cells`` assigned the SAME string to every
    header, so each JSON object repeated one value.  Here every matching
    cell is collected in document order, chunked into rows of
    ``len(headers)``, and each chunk is zipped with the header names.
    """
    # Header labels live in the 4th <tr> of the report markup
    # (index 3 — confirm against the live page if the layout changes).
    headers = [td.get_text(strip=True)
               for td in soup.find_all('tr')[3].find_all('td')]
    cells = [td.get_text(strip=True)
             for td in soup.find_all('td', {'class': CELL_CLASSES})]

    n = len(headers)
    if n == 0:
        return []  # unexpected layout: no header row found
    # Drop any trailing partial row so headers and values stay aligned.
    full = len(cells) - len(cells) % n
    return [dict(zip(headers, cells[start:start + n]))
            for start in range(0, full, n)]


def main():
    """Fetch the report, parse the table, and print it as JSON."""
    soup = BeautifulSoup(fetch_report(), 'html.parser')
    data = parse_table(soup)
    print(json.dumps(data, indent=4))


if __name__ == '__main__':
    main()

Upvotes: 1

Views: 910

Answers (1)

Aaditya Ura
Aaditya Ura

Reputation: 12679

There were some errors in your code. I fixed those errors and modified your code a little:

Is this what you want :

import requests
from bs4 import BeautifulSoup
import json
from collections import defaultdict

# Webpage to scrape (SSRS monthly field-production report).
URL = ("http://factpages.npd.no/ReportServer?/FactPages/TableView/"
       "field_production_monthly&rs:Command=Render&rc:Toolbar=false"
       "&rc:Parameters=f&Top100=True&IpAddress=108.171.128.174"
       "&CultureCode=en")

# CSS classes of the data cells and of the column-header cells.
ROW_CLASSES = ['a61cl', 'a65cr', 'a69cr', 'a73cr', 'a77cr', 'a81cr',
               'a85cr', 'a89cr', 'a93cr', 'a97cr']
HEADER_CLASSES = ['a20c', 'a24c', 'a28c', 'a32c', 'a36c', 'a40c',
                  'a44c', 'a48c', 'a52c']


def scrape(url=URL):
    """Fetch the report and return {header: [column values, ...]}.

    Cells are collected in document order and chunked into rows of
    ``len(headers)`` — previously a hard-coded 9, which would silently
    mis-align every row if the report gained or lost a column.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    headers = [td.getText('div')
               for td in soup.findAll('td', {'class': HEADER_CLASSES})]
    cells = [td.getText('div')
             for td in soup.findAll('td', {'class': ROW_CLASSES})]

    n = len(headers)
    columns = defaultdict(list)  # replaces the manual if/else accumulation
    if n == 0:
        return columns  # unexpected layout: no headers matched
    for start in range(0, len(cells), n):
        for header, value in zip(headers, cells[start:start + n]):
            columns[header].append(value)
    return columns


if __name__ == "__main__":
    # defaultdict is a dict subclass, so json.dumps serializes it directly.
    print(json.dumps(scrape(), indent=4))

the sample of output:

{
    "Year": [
        "2009",
        "2009",
        "2009",
        "2009",
        "2009",
        "2009",
        "2010",
        "2010",
        "2010",
        "2010",
        "2010",

Upvotes: 1

Related Questions