Prog1020
Prog1020

Reputation: 4781

How to get position of opening/ending HTML tag in Python

How can I solve this in Python 3? Which library should I use, and is there any sample code?

I have an HTML file, and a position Line:Col that falls in the middle of an HTML tag such as <table ......>. How can I get the positions of the edges (the < and > brackets) of that <table> tag, and the positions of the edges of its matching </table> tag?

(Note: several table tags may be nested inside one another.)

Upvotes: 0

Views: 1351

Answers (2)

Austin Bravo
Austin Bravo

Reputation: 1

This gets you the coordinates of each tag using html.parser. I monkeypatch the goahead method with one simple modification: it calls the custom method get_endpos:

from html.parser import HTMLParser, starttagopen
from html import unescape

class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that records the position of every start/end tag.

    Each entry in ``start_tags`` / ``end_tags`` starts out as
    ``(tag, (line, column))``, where the position is what ``getpos()``
    reports while the tag handler runs.  Calling ``get_endpos()`` once the
    tag has been consumed extends every entry that is still waiting to
    ``(tag, (line, column), (end_line, end_column))``.
    """

    def __init__(self):
        super().__init__()

        self.start_tags = []
        self.end_tags = []

        # The list (start_tags or end_tags) that received the newest entry.
        self.last_append = []
        # (list, index) pairs for entries that have no end position yet.
        self._pending = []

    def _record(self, target, tag):
        """Append ``(tag, current position)`` to *target* and mark it pending."""
        line, column = self.getpos()
        target.append((tag, (line, column)))
        self._pending.append((target, len(target) - 1))
        self.last_append = target

    def handle_starttag(self, tag, attrs):
        """Record the position reported by ``getpos()`` for a start tag."""
        self._record(self.start_tags, tag)

    def handle_endtag(self, tag):
        """Record the position reported by ``getpos()`` for an end tag."""
        self._record(self.end_tags, tag)

    def get_endpos(self):
        """Attach the current position as end position to pending entries.

        Safe to call when nothing is pending (for example after a comment or
        a doctype was consumed): it then does nothing instead of raising
        IndexError or appending a second end position to an older entry.
        A self-closing tag triggers both handlers before this is called, so
        both of its entries receive the same end position.
        """
        line, column = self.getpos()
        for target, index in self._pending:
            target[index] = target[index] + ((line, column),)
        self._pending.clear()

    def get_tags(self):
        """Return the ``(start_tags, end_tags)`` pair of recorded entries."""
        return self.start_tags, self.end_tags

    def _reset(self):
        """Reset the underlying parser and drop everything recorded so far."""
        HTMLParser.reset(self)

        self.start_tags = []
        self.end_tags = []
        # Also forget bookkeeping that still refers to the discarded lists.
        self.last_append = []
        self._pending = []

# Parser instance used by the demo at the bottom; goahead is replaced on the
# class further down, which applies to this instance as well.
parser = MyHTMLParser()

# Internal -- handle data as far as reasonable.  May leave state
# and data to be processed by a subsequent call.  If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead(self, end):
    """Parse as much of ``self.rawdata`` as possible, recording tag ends.

    Walks the buffered text, passing plain text to ``handle_data`` and
    dispatching markup to the ``parse_*`` / ``handle_charref`` /
    ``handle_entityref`` methods.  After every construct that begins with
    ``<`` has been consumed, ``self.get_endpos()`` is called so the parser
    can note where that construct ended.  Whatever could not be processed
    yet is stored back into ``self.rawdata``.

    Args:
        end: When true, treat the buffer as complete and flush the
            remaining text as data instead of waiting for more input.
    """
    # These names are used below, but the top-of-file imports only provide
    # HTMLParser, starttagopen and unescape; without them any text with a
    # trailing '&' or any character/entity reference raised NameError.
    import re
    from html.parser import charref, entityref, incomplete

    # Loop-invariant: marks the end of a possibly truncated char reference.
    ref_terminator = re.compile(r'[\s;]')

    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        if self.convert_charrefs and not self.cdata_elem:
            j = rawdata.find('<', i)
            if j < 0:
                # if we can't find the next <, either we are at the end
                # or there's more text incoming.  If the latter is True,
                # we can't pass the text to handle_data in case we have
                # a charref cut in half at end.  Try to determine if
                # this is the case before proceeding by looking for an
                # & near the end and see if it's followed by a space or ;.
                amppos = rawdata.rfind('&', max(i, n-34))
                if (amppos >= 0 and
                    not ref_terminator.search(rawdata, amppos)):
                    break  # wait till we get all the text
                j = n
        else:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
        if i < j:
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:j]))
            else:
                self.handle_data(rawdata[i:j])
        i = self.updatepos(i, j)
        if i == n:
            break
        startswith = rawdata.startswith
        if startswith('<', i):
            if starttagopen.match(rawdata, i):  # < + letter
                k = self.parse_starttag(i)
            elif startswith("</", i):
                k = self.parse_endtag(i)
            elif startswith("<!--", i):
                k = self.parse_comment(i)
            elif startswith("<?", i):
                k = self.parse_pi(i)
            elif startswith("<!", i):
                k = self.parse_html_declaration(i)
            elif (i + 1) < n:
                self.handle_data("<")
                k = i + 1
            else:
                break
            if k < 0:
                # The construct is incomplete; only force it through as
                # plain data when no more input is coming.
                if not end:
                    break
                k = rawdata.find('>', i + 1)
                if k < 0:
                    k = rawdata.find('<', i + 1)
                    if k < 0:
                        k = i + 1
                else:
                    k += 1
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:k]))
                else:
                    self.handle_data(rawdata[i:k])
            i = self.updatepos(i, k)
            self.get_endpos()  # only modification: gets end position of tags
        elif startswith("&#", i):
            match = charref.match(rawdata, i)
            if match:
                name = match.group()[2:-1]
                self.handle_charref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            else:
                if ";" in rawdata[i:]:  # bail by consuming &#
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i+2)
                break
        elif startswith('&', i):
            match = entityref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_entityref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            match = incomplete.match(rawdata, i)
            if match:
                # match.group() will contain at least 2 chars
                if end and match.group() == rawdata[i:]:
                    k = match.end()
                    if k <= i:
                        k = n
                    i = self.updatepos(i, i + 1)
                # incomplete
                break
            elif (i + 1) < n:
                # not the end of the buffer, and can't be confused
                # with some other construct
                self.handle_data("&")
                i = self.updatepos(i, i + 1)
            else:
                break
        else:
            assert 0, "interesting.search() lied"
    # end while
    if end and i < n and not self.cdata_elem:
        if self.convert_charrefs and not self.cdata_elem:
            self.handle_data(unescape(rawdata[i:n]))
        else:
            self.handle_data(rawdata[i:n])
        i = self.updatepos(i, n)
    # Keep the unprocessed tail for the next call.
    self.rawdata = rawdata[i:]

# Replace the class's goahead with the patched version defined above.
MyHTMLParser.goahead = goahead

# `your_html_file_as_a_string` is a placeholder that is not defined in this
# snippet: supply the HTML text to analyse here.
parser.feed(your_html_file_as_a_string)
# Prints the (start_tags, end_tags) pair returned by get_tags().
print(parser.get_tags())

Upvotes: 0

Jérôme Radix
Jérôme Radix

Reputation: 10533

As explained in this SO answer, you should not use regex to parse an HTML file, because the standard is highly irregular. You should instead use an HTML parsing library such as html.parser. This library offers HTMLParser.getpos(), which returns the line number and offset of the tag.

Upvotes: 1

Related Questions