Reputation: 47
I want convert a csv file to xml file with python. I want to group the same id's in the csv file together and convert the csv in to convert xml( see desired ouput ). Its a bit complex than it looks with indentation, looping and grouping the csv to xml. All help is appreciated.
My CSV file:
id,x1,y1,z1,x2,y2,z2,c1,R
a1,1.3,2.1,3.6,4.5,5.1,6.8,B,7.3
b2,1.1,2.1,3.1,4.1,5.1,6.1,G,7.1
c1,2.1,3.1,4.1,5.1,2.1,7.1,G,8.1
a1,2.2,3.2,4.2,5.2,6.2,7.2,S,8.2
b2,4.1,5.1,2.1,7.1,8.1,9.1,S,2.5
b2,3.6,4.5,5.1,6.3,7.4,8.2,G,3.1
c2,6.1,7.1,8.1,9.1,2.1,11.1,S,3.2
c1,1.5,1.5,1.5,1.5,1.5,1.5,A,1.5
my code:
import itertools
import csv
import os
csvFile = r'C:\Users\Desktop\test XML\csvfile.csv'
xmlFile = r'C:\Users\Desktop\test XML\myData.xml'
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0" encoding="UTF-8"?>' + "\n" +'<Roughness-Profiles xmlns="http://WKI/Roughness-Profiles/1">' + "\n" )
xmlData.write(' '+'<Roughness-Profile>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(len(tags)):
tags[i] = tags[i].replace(' ', '_')
else:
xmlData.write(' '+'<surfaces>' +"\n"+' '+'<surface>' + "\n")
for i in range (len(tags)):
xmlData.write(' ' +'<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
xmlData.write(' '+'</surface>' + "\n" + ' '+'</surfaces>' + "\n" + ' '+'</Roughness-Profile>' + "\n")
rowNum +=1
xmlData.write('</Roughness-Profiles>' + "\n")
xmlData.close()
my xml output:
<?xml version="1.0" encoding="UTF-8"?>
<Roughness-Profiles xmlns="http://WKI/Roughness-Profiles/1">
<Roughness-Profile>
<surfaces>
<surface>
<id>a1</id>
<x1>1.3</x1>
<y1>2.1</y1>
<z1>3.6</z1>
<x2>4.5</x2>
<y2>5.1</y2>
<z2>6.8</z2>
<c1>B</c1>
<R>7.3</R>
</surface>
</surfaces>
</Roughness-Profile>
<surfaces>
<surface>
<id>b2</id>
<x1>1.1</x1>
<y1>2.1</y1>
<z1>3.1</z1>
<x2>4.1</x2>
<y2>5.1</y2>
<z2>6.1</z2>
<c1>G</c1>
<R>7.1</R>
</surface>
</surfaces>
</Roughness-Profile>
<surfaces>
<surface>
<id>c1</id>
<x1>2.1</x1>
<y1>3.1</y1>
<z1>4.1</z1>
<x2>5.1</x2>
<y2>2.1</y2>
<z2>7.1</z2>
<c1>G</c1>
<R>8.1</R>
</surface>
</surfaces>
</Roughness-Profile>
<surfaces>
<surface>
<id>a1</id>
<x1>2.2</x1>
<y1>3.2</y1>
<z1>4.2</z1>
<x2>5.2</x2>
<y2>6.2</y2>
<z2>7.2</z2>
<c1>S</c1>
<R>8.2</R>
</surface>
</surfaces>
</Roughness-Profile>
<surfaces>
<surface>
<id>b2</id>
<x1>4.1</x1>
<y1>5.1</y1>
<z1>2.1</z1>
<x2>7.1</x2>
<y2>8.1</y2>
<z2>9.1</z2>
<c1>S</c1>
<R>2.5</R>
</surface>
</surfaces>
</Roughness-Profile>
<surfaces>
<surface>
<id>b2</id>
<x1>3.6</x1>
<y1>4.5</y1>
<z1>5.1</z1>
<x2>6.3</x2>
<y2>7.4</y2>
<z2>8.2</z2>
<c1>G</c1>
<R>3.1</R>
</surface>
</surfaces>
</Roughness-Profile>
<surfaces>
<surface>
<id>c2</id>
<x1>6.1</x1>
<y1>7.1</y1>
<z1>8.1</z1>
<x2>9.1</x2>
<y2>2.1</y2>
<z2>11.1</z2>
<c1>S</c1>
<R>3.2</R>
</surface>
</surfaces>
</Roughness-Profile>
<surfaces>
<surface>
<id>c1</id>
<x1>1.5</x1>
<y1>1.5</y1>
<z1>1.5</z1>
<x2>1.5</x2>
<y2>1.5</y2>
<z2>1.5</z2>
<c1>A</c1>
<R>1.5</R>
</surface>
</surfaces>
</Roughness-Profile>
</Roughness-Profiles>
Desired output should be:
<?xml version="1.0" encoding="UTF-8"?>
<R-Profiles xmlns="http://WKI/R-Profiles/1">
<R-Profile>
<id>a1</id>
<surfaces>
<surface>
<x1>1.3</x1>
<y1>2.1</y1>
<z1>3.6</z1>
<x2>4.5</x2>
<y2>5.1</y2>
<z2>6.8</z2>
<c1>B</c1>
<R>7.3</R>
</surface>
<surface>
<x1>2.2</x1>
<y1>3.2</y1>
<z1>4.2</z1>
<x2>5.2</x2>
<y2>6.2</y2>
<z2>7.2</z2>
<c1>S</c1>
<R>8.2</R>
</surface>
</surfaces>
</R-Profile>
<R-Profile>
<id>b2</id>
<surfaces>
<surface>
<x1>1.1</x1>
<y1>2.1</y1>
<z1>3.1</z1>
<x2>4.1</x2>
<y2>5.1</y2>
<z2>6.1</z2>
<c1>G</c1>
<R>7.1</R>
</surface>
<surface>
<x1>4.1</x1>
<y1>5.1</y1>
<z1>2.1</z1>
<x2>7.1</x2>
<y2>8.1</y2>
<z2>9.1</z2>
<c1>S</c1>
<R>2.5</R>
</surface>
<surface>
<x1>3.6</x1>
<y1>4.5</y1>
<z1>5.1</z1>
<x2>6.3</x2>
<y2>7.4</y2>
<z2>8.2</z2>
<c1>G</c1>
<R>3.1</R>
</surface>
</surfaces>
</R-Profile>
<R-Profile>
<id>c1</id>
<surfaces>
<surface>
<x1>2.1</x1>
<y1>3.1</y1>
<z1>4.1</z1>
<x2>5.1</x2>
<y2>2.1</y2>
<z2>7.1</z2>
<c1>G</c1>
<R>8.1</R>
</surface>
<surface>
<x1>1.5</x1>
<y1>1.5</y1>
<z1>1.5</z1>
<x2>1.5</x2>
<y2>1.5</y2>
<z2>1.5</z2>
<c1>A</c1>
<R>1.5</R>
</surface>
</surfaces>
</R-Profile>
<R-Profile>
<id>c2</id>
<surfaces>
<surface>
<x1>6.1</x1>
<y1>7.1</y1>
<z1>8.1</z1>
<x2>9.1</x2>
<y2>2.1</y2>
<z2>11.1</z2>
<c1>S</c1>
<R>3.2</R>
</surface>
</surfaces>
</R-Profile>
</R-Profiles>
Upvotes: 1
Views: 8477
Reputation: 107767
Because XML files are not text files but special text-based documents of structured data adhering to W3C specifications, avoiding building the document by string concatenation.
Instead use appropriate DOM libraries available in virtually all modern programming languages including Python with its built-in xml.etree
or more robust, third-party module, lxml
. In fact, because your desired output involves grouping nodes by id, consider running XSLT, the special-purpose language designed to transform XML files. The module, lxml
can run XSLT 1.0 scripts.
Below uses the DictReader
of built-in csv
module to build a nested id dictionary (all columns grouped under id keys). Then, XML is built by iterating through content of this dictionary to write data to element nodes.
import csv
from collections import OrderedDict
import lxml.etree as ET
# BUILD NESTED ID DICTIONARY FROM CSV
with open("Input.csv") as f:
reader = csv.DictReader(f)
id_dct = OrderedDict({})
for dct in reader:
if dct["id"] not in id_dct.keys():
id_dct[dct["id"]] = [OrderedDict({k:v for k,v in dct.items() if k!= "id"})]
else:
id_dct[dct["id"]].append(OrderedDict({k:v for k,v in dct.items() if k!= "id"}))
# INITIALIZING XML FILE WITH ROOT AND NAMESPACE
root = ET.Element('R-Profiles', nsmap={None: "http://WKI/Roughness-Profiles/1"})
# WRITING TO XML NODES
for k,v in id_dct.items():
rpNode = ET.SubElement(root, "R-Profile")
ET.SubElement(rpNode, "id").text = str(k)
surfacesNode = ET.SubElement(rpNode, "surfaces")
for dct in v:
surfaceNode = ET.SubElement(surfacesNode, "surface")
for k,v in dct.items():
ET.SubElement(surfaceNode, k).text = str(v)
# OUTPUT XML CONTENT TO FILE
tree_out = ET.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8")
with open('Output.xml','wb') as f:
f.write(tree_out)
Input.csv
id,x1,y1,z1,x2,y2,z2,c1,R
a1,1.3,2.1,3.6,4.5,5.1,6.8,B,7.3
b2,1.1,2.1,3.1,4.1,5.1,6.1,G,7.1
c1,2.1,3.1,4.1,5.1,2.1,7.1,G,8.1
a1,2.2,3.2,4.2,5.2,6.2,7.2,S,8.2
b2,4.1,5.1,2.1,7.1,8.1,9.1,S,2.5
b2,3.6,4.5,5.1,6.3,7.4,8.2,G,3.1
c2,6.1,7.1,8.1,9.1,2.1,11.1,S,3.2
c1,1.5,1.5,1.5,1.5,1.5,1.5,A,1.5
Output.xml
<?xml version='1.0' encoding='UTF-8'?>
<R-Profiles xmlns="http://WKI/Roughness-Profiles/1">
<R-Profile>
<id>a1</id>
<surfaces>
<surface>
<x1>1.3</x1>
<y1>2.1</y1>
<z1>3.6</z1>
<x2>4.5</x2>
<y2>5.1</y2>
<z2>6.8</z2>
<c1>B</c1>
<R>7.3</R>
</surface>
<surface>
<x1>2.2</x1>
<y1>3.2</y1>
<z1>4.2</z1>
<x2>5.2</x2>
<y2>6.2</y2>
<z2>7.2</z2>
<c1>S</c1>
<R>8.2</R>
</surface>
</surfaces>
</R-Profile>
<R-Profile>
<id>b2</id>
<surfaces>
<surface>
<x1>1.1</x1>
<y1>2.1</y1>
<z1>3.1</z1>
<x2>4.1</x2>
<y2>5.1</y2>
<z2>6.1</z2>
<c1>G</c1>
<R>7.1</R>
</surface>
<surface>
<x1>4.1</x1>
<y1>5.1</y1>
<z1>2.1</z1>
<x2>7.1</x2>
<y2>8.1</y2>
<z2>9.1</z2>
<c1>S</c1>
<R>2.5</R>
</surface>
<surface>
<x1>3.6</x1>
<y1>4.5</y1>
<z1>5.1</z1>
<x2>6.3</x2>
<y2>7.4</y2>
<z2>8.2</z2>
<c1>G</c1>
<R>3.1</R>
</surface>
</surfaces>
</R-Profile>
<R-Profile>
<id>c1</id>
<surfaces>
<surface>
<x1>2.1</x1>
<y1>3.1</y1>
<z1>4.1</z1>
<x2>5.1</x2>
<y2>2.1</y2>
<z2>7.1</z2>
<c1>G</c1>
<R>8.1</R>
</surface>
<surface>
<x1>1.5</x1>
<y1>1.5</y1>
<z1>1.5</z1>
<x2>1.5</x2>
<y2>1.5</y2>
<z2>1.5</z2>
<c1>A</c1>
<R>1.5</R>
</surface>
</surfaces>
</R-Profile>
<R-Profile>
<id>c2</id>
<surfaces>
<surface>
<x1>6.1</x1>
<y1>7.1</y1>
<z1>8.1</z1>
<x2>9.1</x2>
<y2>2.1</y2>
<z2>11.1</z2>
<c1>S</c1>
<R>3.2</R>
</surface>
</surfaces>
</R-Profile>
</R-Profiles>
Upvotes: 0
Reputation: 52888
I would do something very similar to what @Parfait suggested; use csv.DictReader and lxml to create the XML.
However, something is missing from that answer; the surface
elements aren't grouped by id
.
If I need to group XML during a transformation, the first thing I think of is XSLT.
Once you get the hang of it, grouping is easy with XSLT; especially 2.0 or greater. Unfortunately lxml only supports XSLT 1.0. In 1.0 you need to use Muenchian Grouping.
Here's a full example of creating an intermediate XML and transforming it with XSLT.
CSV Input (test.csv)
id,x1,y1,z1,x2,y2,z2,c1,R
a1,1.3,2.1,3.6,4.5,5.1,6.8,B,7.3
b2,1.1,2.1,3.1,4.1,5.1,6.1,G,7.1
c1,2.1,3.1,4.1,5.1,2.1,7.1,G,8.1
a1,2.2,3.2,4.2,5.2,6.2,7.2,S,8.2
b2,4.1,5.1,2.1,7.1,8.1,9.1,S,2.5
b2,3.6,4.5,5.1,6.3,7.4,8.2,G,3.1
c2,6.1,7.1,8.1,9.1,2.1,11.1,S,3.2
c1,1.5,1.5,1.5,1.5,1.5,1.5,A,1.5
XSLT 1.0 (test.xsl)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:rp="http://WKI/Roughness-Profiles/1">
<xsl:output indent="yes"/>
<xsl:strip-space elements="*"/>
<xsl:key name="surface" match="rp:surface" use="rp:id"/>
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="/*">
<xsl:copy>
<xsl:apply-templates select="@*"/>
<xsl:for-each select="rp:surface[count(.|key('surface',rp:id)[1])=1]">
<xsl:element name="Roughness-Profile" namespace="http://WKI/Roughness-Profiles/1">
<xsl:copy-of select="rp:id"/>
<xsl:element name="surfaces" namespace="http://WKI/Roughness-Profiles/1">
<xsl:apply-templates select="key('surface',rp:id)"/>
</xsl:element>
</xsl:element>
</xsl:for-each>
</xsl:copy>
</xsl:template>
<xsl:template match="rp:id"/>
</xsl:stylesheet>
Python
import csv
import lxml.etree as etree
# INITIALIZING XML FILE WITH ROOT IN PROPER NAMESPACE
nsmap = {None: "http://WKI/Roughness-Profiles/1"}
root = etree.Element('Roughness-Profiles', nsmap=nsmap)
# READING CSV FILE
with open("test.csv") as f:
reader = csv.DictReader(f)
# WRITE INITIAL XML NODES
for row in reader:
surface_elem = etree.SubElement(root, "surface", nsmap=nsmap)
for elem_name, elem_value in row.items():
etree.SubElement(surface_elem, elem_name.strip(), nsmap=nsmap).text = str(elem_value)
# PARSE XSLT AND CREATE TRANSFORMER
xslt_root = etree.parse("test.xsl")
transform = etree.XSLT(xslt_root)
# TRANSFORM
# (Note the weird use of tostring/fromstring. This was used so
# namespaces in the XSLT would work the way they're supposed to.)
final_xml = transform(etree.fromstring(etree.tostring(root)))
# WRITE OUTPUT TO FILE
final_xml.write_output("test.xml")
XML Output (test.xml)
<?xml version="1.0"?>
<Roughness-Profiles xmlns="http://WKI/Roughness-Profiles/1">
<Roughness-Profile>
<id>a1</id>
<surfaces>
<surface>
<x1>1.3</x1>
<y1>2.1</y1>
<z1>3.6</z1>
<x2>4.5</x2>
<y2>5.1</y2>
<z2>6.8</z2>
<c1>B</c1>
<R>7.3</R>
</surface>
<surface>
<x1>2.2</x1>
<y1>3.2</y1>
<z1>4.2</z1>
<x2>5.2</x2>
<y2>6.2</y2>
<z2>7.2</z2>
<c1>S</c1>
<R>8.2</R>
</surface>
</surfaces>
</Roughness-Profile>
<Roughness-Profile>
<id>b2</id>
<surfaces>
<surface>
<x1>1.1</x1>
<y1>2.1</y1>
<z1>3.1</z1>
<x2>4.1</x2>
<y2>5.1</y2>
<z2>6.1</z2>
<c1>G</c1>
<R>7.1</R>
</surface>
<surface>
<x1>4.1</x1>
<y1>5.1</y1>
<z1>2.1</z1>
<x2>7.1</x2>
<y2>8.1</y2>
<z2>9.1</z2>
<c1>S</c1>
<R>2.5</R>
</surface>
<surface>
<x1>3.6</x1>
<y1>4.5</y1>
<z1>5.1</z1>
<x2>6.3</x2>
<y2>7.4</y2>
<z2>8.2</z2>
<c1>G</c1>
<R>3.1</R>
</surface>
</surfaces>
</Roughness-Profile>
<Roughness-Profile>
<id>c1</id>
<surfaces>
<surface>
<x1>2.1</x1>
<y1>3.1</y1>
<z1>4.1</z1>
<x2>5.1</x2>
<y2>2.1</y2>
<z2>7.1</z2>
<c1>G</c1>
<R>8.1</R>
</surface>
<surface>
<x1>1.5</x1>
<y1>1.5</y1>
<z1>1.5</z1>
<x2>1.5</x2>
<y2>1.5</y2>
<z2>1.5</z2>
<c1>A</c1>
<R>1.5</R>
</surface>
</surfaces>
</Roughness-Profile>
<Roughness-Profile>
<id>c2</id>
<surfaces>
<surface>
<x1>6.1</x1>
<y1>7.1</y1>
<z1>8.1</z1>
<x2>9.1</x2>
<y2>2.1</y2>
<z2>11.1</z2>
<c1>S</c1>
<R>3.2</R>
</surface>
</surfaces>
</Roughness-Profile>
</Roughness-Profiles>
Upvotes: 1
Reputation: 143187
First read all rows from CSV and sort them.
Later you can use variable previous_id
to open and close Roughness-Profile
/surfaces
only when id
in new row is different then in previous one.
I used StringIO
to simulate csv file and sys.stdout
to simulate xml file - so everybody can copy code and run it to see how it works
text ='''id,x1,y1,z1,x2,y2,z2,c1,R
a1,1.3,2.1,3.6,4.5,5.1,6.8,B,7.3
b2,1.1,2.1,3.1,4.1,5.1,6.1,G,7.1
c1,2.1,3.1,4.1,5.1,2.1,7.1,G,8.1
a1,2.2,3.2,4.2,5.2,6.2,7.2,S,8.2
b2,4.1,5.1,2.1,7.1,8.1,9.1,S,2.5
b2,3.6,4.5,5.1,6.3,7.4,8.2,G,3.1
c2,6.1,7.1,8.1,9.1,2.1,11.1,S,3.2
c1,1.5,1.5,1.5,1.5,1.5,1.5,A,1.5'''
from io import StringIO
import csv
import sys
#csvFile = r'C:\Users\Desktop\test XML\csvfile.csv'
#xmlFile = r'C:\Users\Desktop\test XML\myData.xml'
#csvData = csv.reader(open(csvFile))
#xmlData = open(xmlFile, 'w')
csvData = csv.reader(StringIO(text))
xmlData = sys.stdout
# read all data to sort them
csvData = list(csvData)
tags = [item.replace(' ', '_') for item in csvData[0]] # headers
csvData = sorted(csvData[1:]) # sort data without headers
xmlData.write('<?xml version="1.0" encoding="UTF-8"?>\n<Roughness-Profiles xmlns="http://WKI/Roughness-Profiles/1">\n')
previous_id = None
for row in csvData:
row_id = row[0]
if row_id != previous_id:
# close previous group - but only if it is not first group
if previous_id is not None:
xmlData.write('</surfaces>\n</Roughness-Profile>\n')
# open new group
xmlData.write('<Roughness-Profile>\n<id>{}</id>\n<surfaces>\n'.format(row_id))
# remeber new group's id
previous_id = row_id
# surface
xmlData.write('<surface>\n')
for value, tag in zip(row[1:], tags[1:]):
xmlData.write('<{}>{}</{}>\n'.format(tag, value, tag))
xmlData.write('</surface>\n')
# close last group
xmlData.write('</surfaces>\n</Roughness-Profile>\n')
xmlData.write('</Roughness-Profiles>\n')
#xmlData.close()
Version without StringIO
and sys.stdout
import csv
csvFile = r'C:\Users\Desktop\test XML\csvfile.csv'
xmlFile = r'C:\Users\Desktop\test XML\myData.xml'
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
# read all data to sort them
csvData = list(csvData)
tags = [item.replace(' ', '_') for item in csvData[0]] # headers
csvData = sorted(csvData[1:]) # sort data without headers
xmlData.write('<?xml version="1.0" encoding="UTF-8"?>\n<Roughness-Profiles xmlns="http://WKI/Roughness-Profiles/1">\n')
previous_id = None
for row in csvData:
row_id = row[0]
if row_id != previous_id:
# close previous group - but only if it is not first group
if previous_id is not None:
xmlData.write('</surfaces>\n</Roughness-Profile>\n')
# open new group
xmlData.write('<Roughness-Profile>\n<id>{}</id>\n<surfaces>\n'.format(row_id))
# remeber new group's id
previous_id = row_id
# surface
xmlData.write('<surface>\n')
for value, tag in zip(row[1:], tags[1:]):
xmlData.write('<{}>{}</{}>\n'.format(tag, value, tag))
xmlData.write('</surface>\n')
# close last group
xmlData.write('</surfaces>\n</Roughness-Profile>\n')
xmlData.write('</Roughness-Profiles>\n')
xmlData.close()
Upvotes: 0