Reputation: 69
I have been trying to scrape an HTML table from an online oil-production SSRS feed. I have learned enough Beautiful Soup/Python to get to where I am now, but I think I need a little assistance to get it finished.
The aim is to scrape the table, in which every cell is tagged, and output JSON data. I do get JSON-formatted output for the 10 headers, but it repeats the same data-row cell value for every header. I think the iteration through the cells to assign them to the headers is the issue. I'm sure it will make sense when the code is run.
Any assistance would be greatly appreciated; I am trying to learn just what I have done wrong, as this is pretty new to me.
Cheers
import json
from bs4 import BeautifulSoup
import urllib.request
import boto3
import botocore
# Script purpose: request the NPD report page, parse it with BeautifulSoup,
# read the column headers and the data cells, and print the result as JSON.
# NOTE(review): the original indentation was lost in this listing, so the
# extent of each loop body below is inferred, not shown.
# NOTE(review): the two quoted strings below (url, user_agent) are wrapped
# across several lines; a single-quoted Python string cannot span lines, so
# each must be joined onto one line before this will run.
#Url to scrape
url='http://factpages.npd.no/ReportServer?/FactPages/TableView/
field_production_monthly&rs:Command=Render&rc:Toolbar=
false&rc:Parameters=f&Top100=True&IpAddress=108.171.128.174&
CultureCode=en'
#Agent detail to prevent scraping bot detection
user_agent = 'Mozilla/5(Macintosh; Intel Mac OS X 10_9_3)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47
Safari/537.36'
header = {'User-Agent': user_agent}
#Request url from list above, assign headers from criteria above
req = urllib.request.Request(url, headers = header)
#Open url from the previous request and assign
npddata = urllib.request.urlopen(req, timeout = 20)
#Start soup on url request data
soup = BeautifulSoup(npddata, 'html.parser')
# Scrape the html table variable from selected website
# NOTE(review): `table` is assigned here but never used again; every later
# lookup goes through `soup` directly.
table = soup.find('table')
# Map column position -> stripped header text, taken from the <td> cells of
# the fourth <tr> found in the document.
headers = {}
col_headers = soup.findAll('tr')[3].findAll('td')
for i in range(len(col_headers)):
headers[i] = col_headers[i].text.strip()
# print(json.dumps(headers, indent = 4))
cells = {}
# Despite the name, `rows` is a flat list of every matching data <td> in the
# document (one entry per cell, not per table row).
rows = soup.findAll('td', {
'class': ['a61cl', 'a65cr', 'a69cr', 'a73cr', 'a77cr', 'a81cr', 'a85cr',
'a89cr', 'a93cr', 'a97cr']})
# NOTE(review): `i` still holds the last index from the header loop above, so
# `rows[i]` is one single <td>; looping over it presumably visits that one
# cell's children, not all cells.
for row in rows[i]: #remove index!(###ISSUE COULD BE HERE####)
# findall function was original try (replace getText with FindAll to try)
# NOTE(review): the first positional argument of getText is the separator
# string, not a tag name - confirm against the BeautifulSoup docs. `cells`
# is rebound to a plain string here, so only the last value survives.
cells = row.getText('div')
# Attempt to fix, can remove and go back to above
#for i in range(len(rows)): #cells[i] = rows[i].text.strip()
#print(cells)# print(json.dumps(cells, indent = 4))
#print(cells)# print(json.dumps(cells, indent = 4))
data = []
item = {}
# Every header key is given the same `cells` value (the index is commented
# out), which is why the output repeats one value for all headers.
for index in headers:
item[headers[index]] = cells#[index]
# if no getText on line 47 then.text() here### ISSUE COULD BE HERE####
# Only this one `item` is ever appended, so `data` holds a single record.
data.append(item)
#print(data)
print(json.dumps(data, indent = 4))
# print(item)#
print(json.dumps(item, indent = 4))
Upvotes: 1
Views: 910
Reputation: 12679
There were some errors in your code; I fixed them and modified your code a little.
Is this what you want?
import requests
from bs4 import BeautifulSoup
import json
# Scrape the NPD "field production monthly" report table and print it as
# JSON grouped by column: {header text: [value in row 1, value in row 2, ...]}.

# Report page to scrape.
REPORT_URL = "http://factpages.npd.no/ReportServer?/FactPages/TableView/field_production_monthly&rs:Command=Render&rc:Toolbar=false&rc:Parameters=f&Top100=True&IpAddress=108.171.128.174&CultureCode=en"

# CSS classes carried by the data cells of the report.
ROW_CELL_CLASSES = [
    'a61cl', 'a65cr', 'a69cr', 'a73cr', 'a77cr',
    'a81cr', 'a85cr', 'a89cr', 'a93cr', 'a97cr',
]

# CSS classes carried by the header cells of the report.
# NOTE(review): ten data-cell classes are listed above but only nine header
# classes here. If the table really has ten columns, the tenth header class
# must be added, otherwise every row after the first is shifted by one cell.
# Verify against the live page.
HEADER_CELL_CLASSES = [
    'a20c', 'a24c', 'a28c', 'a32c', 'a36c',
    'a40c', 'a44c', 'a48c', 'a52c',
]


def cell_texts(soup, css_classes):
    """Return the stripped text of every <td> carrying one of *css_classes*.

    ``get_text(strip=True)`` is used instead of ``getText('div')``: the first
    positional argument of ``get_text`` is the separator placed between text
    fragments, not a tag to select, so ``'div'`` could leak into the values.
    """
    return [
        cell.get_text(strip=True)
        for cell in soup.find_all('td', {'class': css_classes})
    ]


def group_by_header(headers_list, rows_list):
    """Group a flat, row-major list of cell values by column header.

    Args:
        headers_list: Column header texts, in column order.
        rows_list: Cell values laid out row after row, each row being
            ``len(headers_list)`` cells wide.

    Returns:
        A dict mapping each header to the list of its column's values, in
        row order. Headers that occur more than once share one list. An
        empty dict is returned when there are no headers or no cells.
    """
    width = len(headers_list)
    if width == 0:
        # A zero step would make range() raise; there is nothing to group.
        return {}

    row_header = {}
    for start in range(0, len(rows_list), width):
        row = rows_list[start:start + width]
        # zip() stops at the shorter input, so an incomplete trailing row
        # contributes the cells it has instead of raising IndexError.
        for column_name, value in zip(headers_list, row):
            row_header.setdefault(column_name, []).append(value)
    return row_header


def main():
    """Download the report, extract headers and cells, print them as JSON."""
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(REPORT_URL, timeout=20)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    headers_list = cell_texts(soup, HEADER_CELL_CLASSES)
    rows_list = cell_texts(soup, ROW_CELL_CLASSES)

    result = json.dumps(group_by_header(headers_list, rows_list), indent=4)
    print(result)


if __name__ == "__main__":
    main()
A sample of the output:
{
"Year": [
"2009",
"2009",
"2009",
"2009",
"2009",
"2009",
"2010",
"2010",
"2010",
"2010",
"2010",
Upvotes: 1