popeye
popeye

Reputation: 291

Sorting XML based on attributes retaining all children nodes for each parent node in Python

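I have an XML file that I want to sort by attribute value: the <image> elements should be ordered by their file attribute, with each element keeping all of its child nodes. The following is the XML file: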

<?xml-stylesheet type='text/xsl' href='image_metadata_stylesheet.xsl'?>
<dataset>
  <name>imglab dataset</name>
  <comment>Created by imglab tool.</comment>
  <images>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00003.jpg">
      <box top="175" left="59" width="73" height="29">
        <label>groundpainting_hotstar</label>
      </box>
      <box top="174" left="205" width="56" height="24">
        <label>groundpainting_yesbank</label>
      </box>
      <box top="170" left="141" width="44" height="32">
        <label>groundpainting_vodafone</label>
      </box>
    </image>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00001.jpg"/>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00002.jpg">
      <box top="198" left="17" width="32" height="10">
        <label>sightscreen_pepsi</label>
      </box>
    </image>
 </images>
</dataset>

The desired output is this:

<?xml-stylesheet type='text/xsl' href='image_metadata_stylesheet.xsl'?>
<dataset>
  <name>imglab dataset</name>
  <comment>Created by imglab tool.</comment>
  <images>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00001.jpg"/>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00002.jpg">
      <box top="198" left="17" width="32" height="10">
        <label>sightscreen_pepsi</label>
      </box>
    </image>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00003.jpg">
      <box top="175" left="59" width="73" height="29">
        <label>groundpainting_hotstar</label>
      </box>
      <box top="174" left="205" width="56" height="24">
        <label>groundpainting_yesbank</label>
      </box>
      <box top="170" left="141" width="44" height="32">
        <label>groundpainting_vodafone</label>
      </box>
    </image>
 </images>
</dataset>

I tried the following two options:

import xml.etree.ElementTree as ET
# Load the document and locate the <images> element under the root.
tree = ET.parse("finalxml.xml")
container = tree.find("images")
# Collect (sort key, element) pairs for every direct child of <images>.
data = []
for elem in container:
    # NOTE(review): findtext("image") looks for a child element named "image"
    # inside each child and returns its text; the "file" attribute is never
    # read here. In the sample file the <image> elements only contain <box>
    # children, so the key is presumably None for every entry - confirm.
    key = elem.findtext("image")
    data.append((key,elem))
# Sort the pairs, then replace the children of <images> with the elements in
# sorted order before writing the tree out.
data.sort()
container[:] = [item[-1] for item in data]
tree.write("new-data.xml")

This code just rearranges the box attributes and not the image file attribute, which is not what I want. The following is something I have taken from SO, but it doesn't do anything.

# =======================================================================
# Monkey patch ElementTree
import xml.etree.ElementTree as ET

def _serialize_xml(write, elem, encoding, qnames, namespaces):
    """Serialize ``elem`` and its subtree as XML through the ``write`` callable.

    This is a replacement for ElementTree's own ``_serialize_xml``; the only
    deliberate difference marked in the body is that attributes are written in
    the order returned by ``elem.items()`` instead of in sorted order.

    Parameters:
        write: callable that is passed each piece of output text.
        elem: the element to serialize; its children are handled recursively.
        encoding: passed through to the ``ET._encode`` / ``ET._escape_*``
            helpers and used to encode namespace prefixes.
        qnames: mapping from tags, attribute names and QName texts to the
            names to emit; a tag that maps to ``None`` is written without its
            own start/end tags.
        namespaces: mapping whose items are read as (uri, prefix) pairs and
            emitted as ``xmlns`` declarations on this element, or a false
            value for none. Recursive calls always pass ``None``.

    NOTE(review): ``ET._encode`` and ``k.encode(encoding)`` look like the
    Python 2 version of ElementTree - confirm they exist on the interpreter
    this is run with.
    """
    tag = elem.tag
    text = elem.text
    if tag is ET.Comment:
        write("<!--%s-->" % ET._encode(text, encoding))
    elif tag is ET.ProcessingInstruction:
        write("<?%s?>" % ET._encode(text, encoding))
    else:
        # Translate the element tag into the name that is actually written.
        tag = qnames[tag]
        if tag is None:
            # No tag to emit: write only the text and the serialized children.
            if text:
                write(ET._escape_cdata(text, encoding))
            for e in elem:
                _serialize_xml(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                if namespaces:
                    # Namespace declarations come first, ordered by prefix.
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        # A non-empty prefix is written as xmlns:prefix; an
                        # empty one as plain xmlns.
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            ET._escape_attrib(v, encoding)
                            ))
                # The line below is the sorted iteration that this patch
                # replaces; it is kept to show what was changed.
                #for k, v in sorted(items):  # lexical order
                for k, v in items: # Monkey patch
                    if isinstance(k, ET.QName):
                        k = k.text
                    # QName values are looked up in qnames; any other value
                    # is escaped as attribute text.
                    if isinstance(v, ET.QName):
                        v = qnames[v.text]
                    else:
                        v = ET._escape_attrib(v, encoding)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem):
                # Element has content: close the start tag, write text and
                # children, then the end tag.
                write(">")
                if text:
                    write(ET._escape_cdata(text, encoding))
                for e in elem:
                    _serialize_xml(write, e, encoding, qnames, None)
                write("</" + tag + ">")
            else:
                # No text and no children: write a self-closing tag.
                write(" />")
    # Tail text (the text following the element's end tag) is written for
    # every kind of node, including comments and processing instructions.
    if elem.tail:
        write(ET._escape_cdata(elem.tail, encoding))

# Install the patched serializer under the name ElementTree uses for its own.
ET._serialize_xml = _serialize_xml

from collections import OrderedDict

class OrderedXMLTreeBuilder(ET.XMLTreeBuilder):
    """Tree builder that passes element attributes to its target as an OrderedDict.

    NOTE(review): ``_start_list``, ``_fixname``, ``_fixtext`` and ``_target``
    are private members inherited from ``ET.XMLTreeBuilder``; presumably the
    parser calls ``_start_list`` for each start tag - confirm against the
    ElementTree version in use.
    """
    def _start_list(self, tag, attrib_in):
        """Forward a start-tag event with attributes kept in their given order.

        ``attrib_in`` is read as a flat sequence of alternating attribute
        names and values (name at index i, value at index i + 1). Returns
        whatever ``self._target.start`` returns.
        """
        fixname = self._fixname
        tag = fixname(tag)
        # An OrderedDict remembers insertion order, so attributes stay in the
        # order in which they appear in attrib_in.
        attrib = OrderedDict()
        if attrib_in:
            # Step through the flat list two entries at a time: name, value.
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)


# Parse with the order-preserving builder and write the tree straight back out.
# NOTE(review): nothing between the parse and the write reorders any elements,
# so this only round-trips the document.
tree = ET.parse("example1.xml", OrderedXMLTreeBuilder())
tree.write("new-data.xml")

How do I get the xml sorted?

Upvotes: 0

Views: 551

Answers (1)

Galen
Galen

Reputation: 1307

Use the key keyword argument of list.sort to sort the <image> elements by their file attribute:

key specifies a function of one argument that is used to extract a comparison key from each list element (for example, key=str.lower). The key corresponding to each item in the list is calculated once and then used for the entire sorting process. The default value of None means that list items are sorted directly without calculating a separate key value.

import xml.etree.ElementTree

xml_string = r'''<?xml-stylesheet type='text/xsl' href='image_metadata_stylesheet.xsl'?>
<dataset>
  <name>imglab dataset</name>
  <comment>Created by imglab tool.</comment>
  <images>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00003.jpg">
      <box top="175" left="59" width="73" height="29">
        <label>groundpainting_hotstar</label>
      </box>
      <box top="174" left="205" width="56" height="24">
        <label>groundpainting_yesbank</label>
      </box>
      <box top="170" left="141" width="44" height="32">
        <label>groundpainting_vodafone</label>
      </box>
    </image>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00001.jpg"/>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00002.jpg">
      <box top="198" left="17" width="32" height="10">
        <label>sightscreen_pepsi</label>
      </box>
    </image>
 </images>
</dataset>'''

# Parse the document and locate the element whose children are to be reordered.
root = xml.etree.ElementTree.fromstring(xml_string)
images_root = root.find('images')

# Order the <image> children by their "file" attribute, then make that ordered
# list the complete child list of <images>.
images = sorted(images_root.findall('image'),
                key=lambda image: image.attrib['file'])
images_root[:] = images

print(xml.etree.ElementTree.tostring(root))

Alternative solution using lxml, based on this answer, which points out that lxml serializes attributes in the order in which they were set (unlike the standard library's xml package):

import lxml.etree

xml_string = r'''<?xml-stylesheet type='text/xsl' href='image_metadata_stylesheet.xsl'?>
<dataset>
  <name>imglab dataset</name>
  <comment>Created by imglab tool.</comment>
  <images>
    <text>lol</text>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00003.jpg">
      <box top="175" left="59" width="73" height="29">
        <label>groundpainting_hotstar</label>
      </box>
      <box top="174" left="205" width="56" height="24">
        <label>groundpainting_yesbank</label>
      </box>
      <box top="170" left="141" width="44" height="32">
        <label>groundpainting_vodafone</label>
      </box>
    </image>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00001.jpg"/>
    <image file="/home/iris/Documents/SONY_MAX-20150408-200026-210358-00002.jpg">
      <box top="198" left="17" width="32" height="10">
        <label>sightscreen_pepsi</label>
      </box>
    </image>
 </images>
</dataset>'''

# Parse the document with lxml and locate the <images> element.
root = lxml.etree.fromstring(xml_string)
images_root = root.find('images')

# Order the <image> children by their "file" attribute, then make that ordered
# list the complete child list of <images>.
images = sorted(images_root.findall('image'),
                key=lambda image: image.attrib['file'])
images_root[:] = images

print(lxml.etree.tostring(root))

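Note: This will remove any child (immediate descendant) of <images> that is not an <image> element; for example, the <text>lol</text> element in the lxml example above is dropped from the output.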

Upvotes: 1

Related Questions