Mihail-Cosmin Munteanu
Mihail-Cosmin Munteanu

Reputation: 522

How to sort XML by attribute (strings before numbers) in Python

I would like to sort the XML below by the "value" attribute of the "entry" tags, with the strings (letters) sorted before the numbers.

<test>
    <entry value="-12" />
    <entry value="0" />
    <entry value="043" />
    <entry value="14" />
    <entry value="6" />
    <entry value="_null" />
    <entry value="abc" />
    <entry value="abcd" />
    <entry value="empty" />
    <entry value="false" />
    <entry value="test1" />
    <entry value="test2" />
    <entry value="true" />
</test>

I have written some python that sorts this xml, but it sorts first the numbers and then the strings. I have checked this thread, but could not implement any of the solutions to sorting XML.

import xml.etree.ElementTree as ElT
import os
from os.path import sep

def sort_xml(directory, xml_file, level1_tag, attribute, mode=0):
    """Sort elements of an XML file by an attribute and rewrite the file in place.

    The file at ``directory + sep + xml_file`` is parsed, the root's elements
    found by ``level1_tag`` are sorted, each of those elements' own children
    are sorted with the same key, and the result is written back to the same
    path without its first line (the XML declaration).

    Args:
        directory: Directory containing the file.
        xml_file: File name inside ``directory``.
        level1_tag: Path/tag passed to ``findall`` on the root element.
        attribute: Attribute name used as the sort key; it is read with
            ``e.attrib[attribute]``, so a sorted element lacking it raises
            ``KeyError``.
        mode: 0 or 1 (see the comments below). For any other value
            ``new_els`` is never assigned and the loop over it fails.
    """
    # mode 0 - intended: numbers before letters
    # mode 1 - intended: letters before numbers

    file = directory + sep + xml_file

    tree = ElT.parse(file)
    data = tree.getroot()
    els = data.findall(level1_tag)
    
    if mode == 0:
        new_els = sorted(els, key=lambda e: (e.tag, e.attrib[attribute]))
    if mode == 1:
        # NOTE(review): the isinstance check inspects e.tag (the element's tag
        # name), not the attribute value. For elements parsed from a file the
        # tag is a string, so the first key component is False for every
        # element and the order is decided by plain string comparison of the
        # attribute, which is why letters do not come first.
        new_els = sorted(els, key=lambda e: (isinstance(e.tag, (float, int)), e.attrib[attribute]))

    # Apply the same ordering to the children of every matched element.
    for el in new_els:
        if mode == 0:
            el[:] = sorted(el, key=lambda e: (e.tag, e.attrib[attribute]))
        if mode == 1:
            el[:] = sorted(el, key=lambda e: (isinstance(e.tag, (float, int)), e.attrib[attribute]))
    
    # Replace the root's children with the sorted matches only.
    data[:] = new_els

    tree.write(file, xml_declaration=True, encoding='utf-8')

    # Re-read the file just written and write it back without its first line,
    # i.e. without the XML declaration requested above. Note that `data` is
    # rebound here from the root element to a list of lines.
    with open(file, 'r') as fin:
        data = fin.read().splitlines(True)
    with open(file, 'w') as fout:
        fout.writelines(data[1:])
        
        
# Sort "test.xml" in the current working directory by the "value" attribute
# of its "entry" elements, requesting mode 1.
sort_xml(os.getcwd(), "test.xml", "entry", "value", 1)

Any ideas how this could be done?

Edit1: Desired output

<test>
    <entry value="_null" />
    <entry value="abc" />
    <entry value="abcd" />
    <entry value="empty" />
    <entry value="false" />
    <entry value="test1" />
    <entry value="test2" />
    <entry value="true" />
    <entry value="-12" />
    <entry value="0" />
    <entry value="043" />
    <entry value="14" />
    <entry value="6" />
</test>

Upvotes: 0

Views: 545

Answers (2)

balderman
balderman

Reputation: 23825

I took the part where the letters start and put it at the top. This is the actual requirement: to have the letters at the top. I don't care about the rest.

below

 import xml.etree.ElementTree as ET

xml = '''<test>
    <entry value="-12" />
    <entry value="/this" />
    <entry value="0" />
    <entry value="043" />
    <entry value="14" />
    <entry value="6" />
    <entry value="_null" />
    <entry value="abc" />
    <entry value="abcd" />
    <entry value="empty" />
    <entry value="false" />
    <entry value="test1" />
    <entry value="test2" />
    <entry value="true" />
</test>'''

root = ET.fromstring(xml)
numeric = []
non_numeric = []
for entry in root.findall('.//entry'):
    try:
        x = int(entry.attrib['value'])
        numeric.append((x, entry.attrib['value']))
    except ValueError as e:
        non_numeric.append(entry.attrib['value'])

sorted(numeric, key=lambda x: x[0])
sorted(non_numeric)

root = ET.Element('test')
for value in non_numeric:
    entry = ET.SubElement(root, 'entry')
    entry.attrib['value'] = value
for value in numeric:
    entry = ET.SubElement(root, 'entry')
    entry.attrib['value'] = str(value[1])
ET.dump(root)

output

 <?xml version="1.0" encoding="UTF-8"?>
<test>
   <entry value="/this" />
   <entry value="_null" />
   <entry value="abc" />
   <entry value="abcd" />
   <entry value="empty" />
   <entry value="false" />
   <entry value="test1" />
   <entry value="test2" />
   <entry value="true" />
   <entry value="-12" />
   <entry value="0" />
   <entry value="043" />
   <entry value="14" />
   <entry value="6" />
</test>

Upvotes: 1

tomjn
tomjn

Reputation: 5389

I think your problem is that when you are sorting you are checking whether the value is an int or a float. In fact, all the values are strings, so e.g. isinstance(e.tag, (float, int)) will always be False.

A sorter function like this does what you want

def sorter(x):
    """Return a sort key that puts non-integer "value" entries first.

    The key is a ``(parses_as_int, text)`` pair: ``False`` sorts before
    ``True``, so values that ``int()`` rejects come ahead of those it accepts,
    and ties are broken by comparing the raw text.
    """
    text = x.get("value")
    try:
        int(text)
    except ValueError:
        parses_as_int = False
    else:
        parses_as_int = True
    return parses_as_int, text

which can be used like so (using StringIO as a substitute for the file)

from xml.etree import ElementTree
from io import StringIO

xml = """<test>
    <entry value="-12" />
    <entry value="0" />
    <entry value="043" />
    <entry value="14" />
    <entry value="6" />
    <entry value="_null" />
    <entry value="abc" />
    <entry value="abcd" />
    <entry value="empty" />
    <entry value="false" />
    <entry value="test1" />
    <entry value="test2" />
    <entry value="true" />
</test>"""

tree = ElementTree.parse(StringIO(xml))
root = tree.getroot()
root[:] = sorted(root, key=sorter)
tree.write("output.xml")

The contents of output.xml are

<test>
    <entry value="_null" />
    <entry value="abc" />
    <entry value="abcd" />
    <entry value="empty" />
    <entry value="false" />
    <entry value="test1" />
    <entry value="test2" />
    <entry value="true" />
    <entry value="-12" />
    <entry value="0" />
    <entry value="043" />
    <entry value="14" />
    <entry value="6" />
</test>

Upvotes: 1

Related Questions