moorepants
moorepants

Reputation: 1869

How can I retain ASCII hex code points when writing an ElementTree in Python?

I've loaded an XML file (Rhythmbox's database file) into Python 3 via the ElementTree parser. After modifying the tree and writing it to disk (ElementTree.write()) using the ascii encoding, all of the character references that were written as hexadecimal code points are converted to decimal code points. For example, here is a diff containing the copyright symbol:

<     <copyright>&#xA9; WNYC</copyright>
---
>     <copyright>&#169; WNYC</copyright>

Is there any way to tell Python/ElementTree not to do this? I'd like all the character references to stay in hexadecimal form.

Upvotes: 1

Views: 583

Answers (1)

moorepants
moorepants

Reputation: 1869

I found a solution. First I created a new codec error handler, and then I monkey-patched ElementTree._get_writer() to use the new error handler. It looks like this:

from xml.etree import ElementTree
import io
import contextlib
import codecs


def lower_first(s):
    """Return *s* with only its first character lowercased.

    Any falsy argument (such as the empty string) yields ``''``.
    """
    if not s:
        return ''
    head, tail = s[:1], s[1:]
    return head.lower() + tail


def html_replace(exc):
    """Codec error handler that emits hexadecimal character references.

    Each character in the failing span ``exc.object[exc.start:exc.end]`` is
    replaced by ``&#xHH;``, where ``HH`` is the code point in uppercase
    hexadecimal (for example ``&#xA9;`` for the copyright sign).

    Args:
        exc: The ``UnicodeEncodeError`` or ``UnicodeTranslateError`` passed
            in by the codec machinery.

    Returns:
        A ``(replacement, resume_position)`` tuple: the replacement text and
        ``exc.end``, the index at which processing should continue.

    Raises:
        TypeError: If *exc* is any other kind of exception.
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        # Exception *instances* have no ``__name__`` attribute; the name
        # lives on the class, so look it up there.
        raise TypeError("can't handle %s" % type(exc).__name__)
    failing_span = exc.object[exc.start:exc.end]
    # '%X' produces uppercase hex digits while the literal 'x' marker stays
    # lowercase.
    replacement = ''.join('&#x%X;' % ord(c) for c in failing_span)
    return replacement, exc.end

# Register the handler under the name 'html_replace' so it can be selected
# by passing errors='html_replace' to open() or io.TextIOWrapper.
codecs.register_error('html_replace', html_replace)


# monkey patch this python function to prevent it from using xmlcharrefreplace
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for the target.

    *file_or_filename* is either an object with a ``write`` attribute or
    something accepted by ``open()``. *encoding* is an encoding name, or the
    special value ``"unicode"``, meaning no encoding step is added here.

    For every encoding other than ``"unicode"`` the text layer is created
    with ``errors='html_replace'``, so unencodable characters are passed to
    the error handler registered under that name.

    A file opened here is closed on exit. A caller-supplied file object is
    left open: the wrappers created around it are detached instead.
    """
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            # No explicit encoding or error handler: open() defaults apply.
            file = open(file_or_filename, "w")
        else:
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="html_replace")
        # Close the file we opened once the caller's with-block finishes.
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            # The ExitStack runs the registered detach callbacks in reverse
            # order on exit, unwinding the wrappers from the outside in.
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses this methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        # The object lacks seekable/tell; leave the
                        # BufferedIOBase defaults in place.
                        pass
                # errors='html_replace' is the one deliberate difference:
                # it selects the custom handler instead of xmlcharrefreplace.
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors='html_replace',
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write

# Rebind the module-level attribute so that lookups of
# ElementTree._get_writer resolve to the replacement defined above.
# NOTE(review): _get_writer is a private name in xml.etree.ElementTree;
# confirm it still exists with this signature on the Python version in use.
ElementTree._get_writer = _get_writer

Upvotes: 1

Related Questions