Clexx
Clexx

Reputation: 13

How can I efficiently parse HTML in Python?

I want to parse the HTML code efficiently without an external library.

I have already tried a for loop that checks each character in turn:

# Split the markup into tag names and text runs by scanning it one character
# at a time. "<" and ">" act as separators; everything between two separators
# is collected into one string token.
tokens = []  # finished tokens (named to avoid shadowing the builtin `list`)
html = """<html><p>Hello</p></html>"""
current = []  # characters of the token currently being collected
for a in html:
    if a in "<>":
        # A bracket ends the token in progress; nothing is emitted for the
        # empty gap between adjacent brackets such as "><".
        if current:
            tokens.append("".join(current))
            current = []
    else:
        # Accumulate the character instead of overwriting the last slot.
        current.append(a)
if current:
    # Flush text that follows the final bracket.
    tokens.append("".join(current))
print(tokens)

But the code was very slow on 50KB files.

Upvotes: 1

Views: 367

Answers (1)

Noctis Skytower
Noctis Skytower

Reputation: 22041

May I recommend starting with a simple HTML parser like the one shown below? It uses only the standard library that comes with Python (`html.parser` and `xml.dom.minidom`) and has no external dependencies. It gives you a basic DOM API that should be a good starting point to work from. The code works for the simple case it is meant to tackle, but you may need to alter and extend it with further functionality to accomplish whatever your end goal may be.

#! /usr/bin/env python3
import html.parser
import pprint
import xml.dom.minidom


def main():
    """Parse a sample HTML page and print several views of the resulting DOM."""
    # noinspection PyPep8
    sample = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
'''
    builder = DocumentParser()
    builder.feed(sample)
    builder.close()
    root = builder.document.documentElement
    # Join adjacent text nodes before anything is printed.
    root.normalize()
    print(root.toprettyxml())

    # First <title>: its markup, tag name, text and parent tag name.
    title = root.getElementsByTagName('title')[0]
    print(title.toxml())
    print(title.tagName)
    print(title.firstChild.data)
    print(title.parentNode.tagName)

    # First <p>: its markup and class attribute.
    paragraph = root.getElementsByTagName('p')[0]
    print(paragraph.toxml())
    print(paragraph.getAttribute('class'))

    # All <a> elements, an attribute search, and each link target.
    anchors = root.getElementsByTagName('a')
    print(anchors[0].toxml())
    pprint.pprint([anchor.toxml() for anchor in anchors])
    pprint.pprint([match.toxml() for match in find(root, id='link3')])
    for anchor in anchors:
        print(anchor.getAttribute('href'))

    # Every text node in document order, one per line.
    print(*get_text(root), sep='\n')


class DocumentParser(html.parser.HTMLParser):
    """Build an ``xml.dom.minidom`` document from HTML fed to the parser.

    ``document`` is the minidom Document being built.  ``focus`` is the node
    that receives the next child: the innermost open element, or the
    document itself when no element is open.

    Only start tags, end tags and text are handled.  Text is stripped of
    leading/trailing whitespace, and whitespace-only text or text outside
    any element is dropped.  A minidom Document accepts a single document
    element, so input with more than one top-level element is not supported.
    """

    # Elements that never have content or an end tag.  They must not take
    # focus, otherwise everything after e.g. <br> would be nested inside it.
    # HTMLParser reports tag names in lower case.  ``param`` is obsolete but
    # is included so that legacy markup is handled the same way.
    _VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    # noinspection SpellCheckingInspection
    def __init__(self, *, convert_charrefs=True):
        """Create the parser with an empty document that holds the focus."""
        super().__init__(convert_charrefs=convert_charrefs)
        self.document = self.focus = xml.dom.minidom.DOMImplementation() \
            .createDocument(None, None, None)

    @property
    def document_has_focus(self):
        """Return True when no element is open (the focus is the document)."""
        return self.document is self.focus

    def handle_starttag(self, tag, attrs):
        """Append a new element to the focus and, unless void, descend into it."""
        element = self.document.createElement(tag)
        for name, value in attrs:
            element.setAttribute(name, value)
        self.focus.appendChild(element)
        if tag not in self._VOID_ELEMENTS:
            self.focus = element

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``.

        Open elements nested inside it are closed implicitly.  An end tag
        that matches no open element is ignored instead of walking past the
        document node (which has no ``tagName``).
        """
        node = self.focus
        while node is not self.document and node.tagName != tag:
            node = node.parentNode
        if node is self.document:
            # Stray end tag: leave the focus where it is.
            return
        self.focus = node.parentNode

    def handle_data(self, data):
        """Append stripped text to the focused element; skip blank text."""
        if not self.document_has_focus and not data.isspace():
            self.focus.appendChild(self.document.createTextNode(data.strip()))

    def error(self, message):
        """Raise RuntimeError for a problem reported by the base parser.

        Kept for Python versions whose HTMLParser still calls ``error()``.
        """
        raise RuntimeError(message)

    def close(self):
        """Finish parsing and return the focus to the document.

        Elements still open at end of input are thereby closed implicitly.
        """
        super().close()
        while not self.document_has_focus:
            self.focus = self.focus.parentNode


def find(element, **kwargs):
    """Yield nodes at or below ``element`` whose attributes match ``kwargs``.

    A node matches when it has a ``getAttribute`` method and, for every
    keyword, ``getAttribute(key) == value``.  With no keywords every node
    that has ``getAttribute`` is yielded.  Nodes are produced in document
    order (parent before children, children left to right).

    The tree is walked with an explicit stack rather than recursion so that
    very deeply nested documents do not raise ``RecursionError``.
    """
    pending = [element]
    while pending:
        node = pending.pop()
        get_attribute = getattr(node, 'getAttribute', None)
        if get_attribute and \
                all(get_attribute(key) == value for key, value in kwargs.items()):
            yield node
        # Push children reversed so the leftmost child is popped first.
        pending.extend(reversed(tuple(node.childNodes)))


def get_nodes_by_type(node, node_type):
    """Yield ``node`` and its descendants whose ``nodeType`` equals ``node_type``.

    Nodes are produced in document order (parent before children, children
    left to right).  An explicit stack replaces recursion so that very
    deeply nested trees do not raise ``RecursionError``.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if current.nodeType == node_type:
            yield current
        # Push children reversed so the leftmost child is popped first.
        pending.extend(reversed(tuple(current.childNodes)))


def get_text(node):
    """Return a generator over the ``data`` of every text node under ``node``.

    Text nodes are those whose type equals ``node.TEXT_NODE``; they are
    collected with ``get_nodes_by_type`` and visited in the order it yields.
    """
    text_nodes = get_nodes_by_type(node, node.TEXT_NODE)
    return (text_node.data for text_node in text_nodes)


# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Upvotes: 1

Related Questions