Reputation: 1869
I have a program that extracts certain variables from a group of 20 html files. Can someone give me advice on how to loop the program to read all the html files from a directory and print the information in individual json documents?
from bs4 import BeautifulSoup
# Extract the title, author, price, ISBN and shipping weight from one saved
# product page and print them.

# Read the whole page. The with-block closes the file handle as soon as the
# contents are in memory instead of leaving it open for the rest of the run.
with open("book1.html", 'r') as html_file:
    get_data = html_file.read()

# Parse the HTML.
soup = BeautifulSoup(get_data)

# The title is the span with id "btAsinTitle"; the author is taken from the
# first link (an <a> carrying an href) that follows it in the document.
title = soup.find("span", id="btAsinTitle")
author = title.find_next("a", href=True)

# Price: every pass overwrites `definition`, so after the loop it holds the
# rendered contents of the last span with class "bb_price".
# NOTE: if the page has no such span, `definition` is never assigned and the
# print further down raises NameError.
for definition in soup.findAll('span', {"class": 'bb_price'}):
    definition = definition.renderContents()

# ISBN and shipping weight: the value is the node right after the <b> label.
# Single-argument print(...) gives the same output under the Python 2 print
# statement and the Python 3 print function.
print(soup.find('b', text='ISBN-10:').next_sibling)
print(soup.find('b', text='Shipping Weight:').next_sibling)

# Print the remaining information.
print(definition)
print(title.get_text())
print(author.get_text())
Upvotes: 1
Views: 1349
Reputation: 44112
To process a set of files in a directory:
from glob import glob
# Gather every HTML file in the data directory, then convert them one by one.
fnames = glob("datadir/*.html")
for fname in fnames:
    html2json(fname)
Now we need the function html2json. It takes the name of an HTML file and writes a JSON string to a file with the same name as the HTML file, but with a .json extension appended.
import json
from bs4 import BeautifulSoup
def _text_after(label):
    """Return the text of the node that directly follows *label*.

    Returns None when *label* has no following sibling.
    """
    node = label.next_sibling
    # The node after a <b> label is usually a bare text node
    # (NavigableString). Older Beautiful Soup releases give that type no
    # get_text() method, and it is already a string, so it is used as is.
    # Tags do provide get_text(), so it is called when available.
    extract = getattr(node, "get_text", None)
    return extract() if callable(extract) else node


def html2json(fname):
    """Extract book details from the HTML file *fname* and save them as JSON.

    The title, author, ISBN-10 and shipping weight are collected into a dict
    and written to a file named like the input with ".json" appended
    (for example "book1.html" -> "book1.html.json").

    Raises:
        AttributeError: if the title span, the link following it, or one of
            the <b> labels is not found, because the lookup then yields None.
    """
    with open(fname) as f:
        soup = BeautifulSoup(f)

    title = soup.find("span", id="btAsinTitle")
    resdct = {
        "title": title.get_text(),
        "author": title.find_next("a", href=True).get_text(),
        "isbn": _text_after(soup.find('b', text='ISBN-10:')),
        "weight": _text_after(soup.find('b', text='Shipping Weight:')),
    }

    outfname = fname + ".json"
    with open(outfname, "w") as f:
        json.dump(resdct, f)
Upvotes: 2
Reputation: 473873
You can use glob.iglob to loop through all the HTML files in a directory. For every filename, open the file, pass the resulting file object to the BeautifulSoup constructor, get the elements you need and construct a dictionary:
import glob
from bs4 import BeautifulSoup
# Walk over every HTML file in the current directory and print its details.
for filename in glob.iglob('*.html'):
    with open(filename) as f:
        soup = BeautifulSoup(f)

    # Title span first; the author is the first link that follows it.
    title = soup.find("span", id="btAsinTitle")
    author = title.find_next("a", href=True)

    # Each value is the node right after its <b> label.
    isbn = soup.find('b', text='ISBN-10:').next_sibling
    weight = soup.find('b', text='Shipping Weight:').next_sibling

    # Assemble the record, then print it in one go.
    record = {
        'title': title.get_text(),
        'author': author.get_text(),
        'isbn': isbn,
        'weight': weight,
    }
    print(record)
Upvotes: 2