Reputation: 13
I want to parse the HTML code efficiently without an external library.
I have already tried a for loop that checks each character in turn:
# Split the markup into buckets of characters, starting a new bucket at
# every "<" and ">" so that tag names and text runs end up in separate lists.
tokens = []  # not named ``list``: that would shadow the builtin
html = """<html><p>Hello</p></html>"""
for a in html:
    if a == "<" or a == ">":
        # A delimiter closes the current bucket and opens a fresh one.
        tokens.append([])
    else:
        if not tokens:
            # Text that appears before the first "<" still needs a bucket.
            tokens.append([])
        # Accumulate the character; assigning to tokens[-1] would replace
        # the whole bucket with a single character.
        tokens[-1].append(a)
print(tokens)
But this code was very slow on 50 KB files.
Upvotes: 1
Views: 367
Reputation: 22041
May I recommend starting with a simple HTML parser like the one shown below? It uses only the standard library that ships with Python, so it has no external dependencies. It gives you a basic DOM API that should be a good starting point to work from. The code works for the simple case it is meant to tackle, but depending on your end goal you may need to alter and extend it with further functionality.
#! /usr/bin/env python3
import html.parser
import pprint
import xml.dom.minidom
def main():
    """Parse a sample page and print a short tour of the resulting DOM."""
    # noinspection PyPep8
    sample_markup = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
    # Build the tree; close() is called so the parser finishes its input.
    builder = DocumentParser()
    builder.feed(sample_markup)
    builder.close()

    root = builder.document.documentElement
    root.normalize()
    print(root.toprettyxml())

    # First <title>: serialized form, tag name, text, and parent tag.
    title = root.getElementsByTagName('title')[0]
    print(title.toxml())
    print(title.tagName)
    print(title.firstChild.data)
    print(title.parentNode.tagName)

    # First <p>: serialized form and its class attribute.
    paragraph = root.getElementsByTagName('p')[0]
    print(paragraph.toxml())
    print(paragraph.getAttribute('class'))

    # Every <a>: the first one, then all of them, then the one with id=link3.
    anchors = root.getElementsByTagName('a')
    print(anchors[0].toxml())
    serialized_anchors = [anchor.toxml() for anchor in anchors]
    pprint.pprint(serialized_anchors)
    serialized_matches = [hit.toxml() for hit in find(root, id='link3')]
    pprint.pprint(serialized_matches)
    for anchor in anchors:
        print(anchor.getAttribute('href'))

    # All text in the document, one text node per line.
    print('\n'.join(get_text(root)))
class DocumentParser(html.parser.HTMLParser):
    """Build an ``xml.dom.minidom`` document from the HTML fed to the parser.

    Attributes:
        document: the minidom document being populated.
        focus: the node that currently receives new children; it starts as
            the document itself and moves down and up as tags open and close.
    """

    # HTML void elements: they never have content or a closing tag, so the
    # focus must not move into them when their start tag is seen.
    VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    # noinspection SpellCheckingInspection
    def __init__(self, *, convert_charrefs=True):
        """Create the parser together with an empty document to fill."""
        super().__init__(convert_charrefs=convert_charrefs)
        self.document = self.focus = xml.dom.minidom.DOMImplementation() \
            .createDocument(None, None, None)

    @property
    def document_has_focus(self):
        """Return True when no element is open (focus is the document)."""
        return self.document is self.focus

    def handle_starttag(self, tag, attrs):
        """Append a new element under the focus and, unless void, enter it."""
        element = self.document.createElement(tag)
        for name, value in attrs:
            element.setAttribute(name, value)
        self.focus.appendChild(element)
        if tag not in self.VOID_ELEMENTS:
            # Void elements stay childless, so later siblings are not
            # swallowed by e.g. a bare <br> or <img>.
            self.focus = element

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``.

        Elements left open in between are closed implicitly. An end tag
        with no matching open element is ignored, since the document node
        has no ``tagName`` to compare against and nothing to close.
        """
        node = self.focus
        while node is not self.document and node.tagName != tag:
            node = node.parentNode
        if node is not self.document:
            self.focus = node.parentNode

    def handle_data(self, data):
        """Attach non-blank text, stripped of surrounding whitespace.

        Text seen while no element is open is dropped.
        """
        if not self.document_has_focus and not data.isspace():
            self.focus.appendChild(self.document.createTextNode(data.strip()))

    def error(self, message):
        """Report a parser error by raising RuntimeError."""
        raise RuntimeError(message)

    def close(self):
        """Finish parsing and implicitly close every still-open element."""
        super().close()
        while not self.document_has_focus:
            self.focus = self.focus.parentNode
def find(element, **kwargs):
    """Yield ``element`` and its descendants whose attributes match kwargs.

    The walk is depth-first in document order. A node is yielded only when
    it offers ``getAttribute`` and, for every keyword given, that call
    returns a value equal to the keyword's value. Nodes without
    ``getAttribute`` are skipped, but their children are still visited.
    """
    lookup = getattr(element, 'getAttribute', None)
    if lookup:
        for attribute, expected in kwargs.items():
            if lookup(attribute) != expected:
                break
        else:
            # No keyword disagreed (or none were given).
            yield element
    for child in element.childNodes:
        for match in find(child, **kwargs):
            yield match
def get_nodes_by_type(node, node_type):
    """Yield ``node`` and every descendant whose ``nodeType`` is ``node_type``.

    Nodes are produced depth-first, each parent before its children.
    """
    is_wanted = node.nodeType == node_type
    if is_wanted:
        yield node
    for child in node.childNodes:
        for hit in get_nodes_by_type(child, node_type):
            yield hit
def get_text(node):
    """Return a generator over the ``data`` of every text node under ``node``.

    Text nodes are collected with ``get_nodes_by_type`` using the
    ``TEXT_NODE`` constant read from ``node`` itself.
    """
    text_nodes = get_nodes_by_type(node, node.TEXT_NODE)
    return (text_node.data for text_node in text_nodes)
# Run the demonstration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Upvotes: 1