Sorting XML based on attributes retaining all children nodes for each parent node in Python

Question

I have an xml file which I want to sort based on attribute values. The following is the xml file:



  imglab dataset
  Created by imglab tool.
  
    
      
        groundpainting_hotstar
      
      
        groundpainting_yesbank
      
      
        groundpainting_vodafone
      
    
    
    
      
        sightscreen_pepsi

The desired output is this:



  imglab dataset
  Created by imglab tool.
  
    
    
      
        sightscreen_pepsi
      
    
    
      
        groundpainting_hotstar
      
      
        groundpainting_yesbank
      
      
        groundpainting_vodafone

I tried the following two options:

import xml.etree.ElementTree as ET
# Load the document and pick out the <images> container element.
tree = ET.parse("finalxml.xml")
container = tree.find("images")

# Pair every child of the container with the text of that child's own
# "image" sub-element, then order the pairs by plain tuple comparison.
# NOTE(review): if the children of <images> are themselves <image> elements
# identified by a "file" attribute (as the accepted answer assumes), this key
# never looks at that attribute -- confirm against the real file.
data = sorted((child.findtext("image"), child) for child in container)

# Slice assignment swaps the container's children for the re-ordered ones.
container[:] = [pair[-1] for pair in data]
tree.write("new-data.xml")

This code just realigns the box attributes and not the image file attribute, which is not what I want. The following is something I took from another SO answer, but it doesn't do anything.

# =======================================================================
# Monkey patch ElementTree
import xml.etree.ElementTree as ET

def _serialize_xml(write, elem, encoding, qnames, namespaces):
    """Write ``elem`` and its subtree as XML, keeping attributes in stored order.

    Installed below as ``ET._serialize_xml``. Attributes are emitted in the
    order ``elem.items()`` yields them rather than in sorted order.

    Args:
        write: Callable invoked once per output fragment.
        elem: Element to serialize; its children are serialized recursively.
        encoding: Encoding name forwarded to the ``ET`` escape helpers.
        qnames: Mapping from tag / attribute names to the qualified name to
            emit. A tag mapped to ``None`` is skipped: only its text and
            children are written.
        namespaces: Mapping of namespace URI to prefix to declare on this
            element, or a false value for none. Recursive calls always pass
            ``None``, so declarations appear only on the outermost element.

    NOTE(review): ``ET._encode`` and the two-argument forms of
    ``ET._escape_cdata`` / ``ET._escape_attrib`` look like Python 2.7
    ElementTree internals -- confirm they exist in the interpreter in use.
    """
    tag = elem.tag
    text = elem.text
    if tag is ET.Comment:
        # The template needs a %s slot; a bare "" raises TypeError on format.
        write("<!--%s-->" % ET._encode(text, encoding))
    elif tag is ET.ProcessingInstruction:
        write("<?%s?>" % ET._encode(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # No qualified name for this tag: emit only its content.
            if text:
                write(ET._escape_cdata(text, encoding))
            for e in elem:
                _serialize_xml(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            ET._escape_attrib(v, encoding)
                            ))
                # Monkey patch: iterate in stored order instead of
                # sorted(items), which would emit attributes lexically.
                for k, v in items:
                    if isinstance(k, ET.QName):
                        k = k.text
                    if isinstance(v, ET.QName):
                        v = qnames[v.text]
                    else:
                        v = ET._escape_attrib(v, encoding)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem):
                write(">")
                if text:
                    write(ET._escape_cdata(text, encoding))
                for e in elem:
                    _serialize_xml(write, e, encoding, qnames, None)
                # Close the element; without this the output is not well-formed.
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(ET._escape_cdata(elem.tail, encoding))

# Replace the module-level serializer with the order-preserving version.
ET._serialize_xml = _serialize_xml

from collections import OrderedDict

class OrderedXMLTreeBuilder(ET.XMLTreeBuilder):
    """Tree builder that collects each element's attributes in an OrderedDict.

    NOTE(review): depends on ``ET.XMLTreeBuilder`` and on the private
    ``_fixname``, ``_fixtext`` and ``_target`` members, and overrides
    ``_start_list`` -- presumably hooks of the Python 2.7 ElementTree parser;
    confirm they exist in the interpreter in use.
    """
    def _start_list(self, tag, attrib_in):
        """Forward a start-tag event to the target with ordered attributes.

        ``attrib_in`` is read as a flat sequence alternating name, value
        (indices ``i`` and ``i + 1``); a false value means no attributes.
        Returns whatever ``self._target.start`` returns.
        """
        fixname = self._fixname
        tag = fixname(tag)
        # OrderedDict keeps the attributes in the order they are inserted here.
        attrib = OrderedDict()
        if attrib_in:
            # Step through the flat sequence two entries at a time.
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)


# Parse with the order-preserving builder and write the tree straight back
# out. Nothing between the parse and the write reorders any elements.
tree = ET.parse("example1.xml", OrderedXMLTreeBuilder())
tree.write("new-data.xml")

How do I get the xml sorted?

Galen · Accepted Answer

Use the key named argument of list.sort, with the file attribute of each image element as the sort key:

key specifies a function of one argument that is used to extract a comparison key from each list element (for example, key=str.lower). The key corresponding to each item in the list is calculated once and then used for the entire sorting process. The default value of None means that list items are sorted directly without calculating a separate key value.

import xml.etree.ElementTree

xml_string = r'''

  imglab dataset
  Created by imglab tool.
  
    
      
        groundpainting_hotstar
      
      
        groundpainting_yesbank
      
      
        groundpainting_vodafone
      
    
    
    
      
        sightscreen_pepsi
      
    
 
'''

# NOTE(review): xml_string as shown contains no XML tags (the markup appears
# to have been lost); restore the original document before running this.
root = xml.etree.ElementTree.fromstring(xml_string)
images_root = root.find('images')

# Order the <image> children by their 'file' attribute. sorted() accepts the
# same key= argument as list.sort().
images = sorted(
    images_root.findall('image'),
    key=lambda image: image.attrib['file'],
)

# Slice assignment makes the sorted list the container's entire child list.
images_root[:] = images

print(xml.etree.ElementTree.tostring(root))

Alternative solution using lxml, based on another answer which points out that lxml serializes attributes in the order they are set (unlike xml.etree.ElementTree):

import lxml.etree

xml_string = r'''

  imglab dataset
  Created by imglab tool.
  
    lol
    
      
        groundpainting_hotstar
      
      
        groundpainting_yesbank
      
      
        groundpainting_vodafone
      
    
    
    
      
        sightscreen_pepsi
      
    
 
'''

# NOTE(review): xml_string as shown contains no XML tags (the markup appears
# to have been lost); restore the original document before running this.
root = lxml.etree.fromstring(xml_string)
images_root = root.find('images')

# Order the <image> children by their 'file' attribute. sorted() accepts the
# same key= argument as list.sort().
images = sorted(
    images_root.findall('image'),
    key=lambda image: image.attrib['file'],
)

# Slice assignment makes the sorted list the container's entire child list.
images_root[:] = images

print(lxml.etree.tostring(root))

Note: This will remove any child (immediate descendant) of <images> that is not an <image>.

Sorting XML based on attributes retaining all children nodes for each parent node in Python

Answers (1)

Related Questions