Sri
Sri

Reputation: 85

Generating dynamic XML from Python dataframe

Data:
name    phone           email       address   address1   address2 dateofbirth
John    111111111   [email protected]  Chicago    Illinois  Phoenix    
Mark    222222222   [email protected]  London     

My code:
def convert_row(row):
return """<Document>
<Name>%s</Name>
<Phone>%s</Phone>
<Email>%s</Email>
<Address>%s</Address>
<Address1>%s</Address1>
<Address2>%s</Address2>
<DateofBirth>%s</DateofBirth>
</Document>""" % (row.name, row.phone, row.email, row.address, 
row.address1, 
row.address2,row.dob)
with open('out.xml', 'w') as f:
f.write('\n'.join(df.apply(convert_row, axis=1)))

Current output:
<Customer>
<Name>John</Name>
<Phone>111111111</Phone>
<Email>[email protected]</Email>
<Address>Chicago</Address>
<Address1>Illinois</Address1>
<Address2>Phoenix</Address2>
<DateofBirth></DateofBirth>
</Customer>
<Customer>
<Name>Mark</Name>
<Phone>222222222</Phone>
<Email>[email protected]</Email>
<Address>London</Address>
<Address1></Address1>
<Address2></Address2>
<DateofBirth></DateofBirth>
</Customer>

Could anyone please help me how to remove only Address1 and Address2 tags when the value is null (e.g. customer Mark). However empty dateofbirth tag must be retained though it's null. Any help much appreciated!

Upvotes: 1

Views: 59

Answers (2)

Hermann12
Hermann12

Reputation: 3581

You can remove the empty tags after creation:

import pandas as pd
import xml.etree.ElementTree as ET
  
columns = ['Name','Phone','Email','Address','Address1','Address2','DateofBirth']
row = [['John','111111111','[email protected]','Chicago','Illinois','Phoenix',''],['Mark', '222222222', '[email protected]', 'London','','','']]
df = pd.DataFrame(row, columns=columns)
print(df.to_string(index=False))

xml = df.to_xml(root_name='Document', row_name='Customer', index=False, encoding='utf-8', xml_declaration=True, pretty_print=True)
tree = ET.fromstring(xml)

def rem (rem_list, root):
    """ Remove listed empty tags from root"""
    parent_map = {(c, p) for p in root.iter( ) for c in p}
    for (c, p) in parent_map:
        if c.tag in rem_list:
            p.remove(c)

sel = ['Address1', 'Address2']
for customer in tree.findall('.//Customer'):
    rem_list = []
    for empty_tag in customer.iter():
        if empty_tag.text == None and empty_tag.tag in sel:
            rem_list.append(empty_tag.tag)

    # Call remove function
    rem(rem_list, customer)
    rem_list = []

ET.dump(tree)

tree1 = ET.ElementTree(tree)
ET.indent(tree1, space= '  ')
tree1.write('customer.xml', encoding="utf-8", xml_declaration=True)

Output:

Name     Phone          Email Address Address1 Address2 Dateofbirth
John 111111111 [email protected] Chicago Illinois  Phoenix            
Mark 222222222 [email protected]  London                              
<Document>
  <Customer>
    <Name>John</Name>
    <Phone>111111111</Phone>
    <Email>[email protected]</Email>
    <Address>Chicago</Address>
    <Address1>Illinois</Address1>
    <Address2>Phoenix</Address2>
    <DateofBirth />
  </Customer>
  <Customer>
    <Name>Mark</Name>
    <Phone>222222222</Phone>
    <Email>[email protected]</Email>
    <Address>London</Address>
    <DateofBirth />
  </Customer>
</Document>

Upvotes: 0

Andrej Kesely
Andrej Kesely

Reputation: 195613

Try:

def convert(row):
    row = row[row.notna()]

    out = ["\t<Customer>"]
    for c in row.index:
        out.append(f"\t\t<{c.capitalize()}>{row[c]}</{c.capitalize()}>")

    return "\n".join(out + ["\t</Customer>"])


out = "<Document>\n" + "\n".join(df.apply(convert, axis=1)) + "\n</Document>"
print(out)

Prints:

<Document>
        <Customer>
                <Name>John</Name>
                <Phone>111111111</Phone>
                <Email>[email protected]</Email>
                <Address>Chicago</Address>
                <Address1>Illinois</Address1>
                <Address2>Phoenix</Address2>
        </Customer>
        <Customer>
                <Name>Mark</Name>
                <Phone>222222222</Phone>
                <Email>[email protected]</Email>
                <Address>London</Address>
        </Customer>
</Document>

Initial dataframe:

   name      phone           email  address  address1 address2  dateofbirth
0  John  111111111  [email protected]  Chicago  Illinois  Phoenix          NaN
1  Mark  222222222  [email protected]   London       NaN      NaN          NaN

Upvotes: 1

Related Questions