Reputation: 25
I want to rename every `<p>` tag to `<PARAGRAPH>` using lxml in Python. The problem is that I'm able to change the `<p>` tag of the outer concept and the first `<p>` tag of the inner concept, but I don't know how to change all the remaining `<p>` tags.
Here's an example of what the xml file looks like.
<dita>
<topic id="id15CDB0PL09E">
<title id="id15CDB0R0VYB"><?FM MARKER [Header/Footer $1] All?>Control
</title>
<shortdesc>CONTROL</shortdesc>
<concept id="id15CDB0Q0Q4G">
<title id="id15CDB0R0VHA">General
</title>
<conbody>
<p>This section
</p>
</conbody>
<concept id="id156F7H00GIE">
<title id="id15CDB0R0V1W">System
</title>
<conbody>
<p>Engine
</p>
<p>The ECU
</p>
<p>The aircraft
</p>
<p>The system
</p>
</conbody>
</concept>
</concept>
</topic>
</dita>
Here's my python code using lxml.
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
from lxml import etree
# Script purpose: parse "73-20.xml", collect the <conbody> elements of the
# outer and nested <concept> elements, rename their <p> children to
# "PARAGRAPH", and write the result to "FerNewtags.xml".
doc = etree.parse("73-20.xml")
# Lists that collect the nested <concept> elements and the <conbody> elements
# found at the outer and inner levels.
conceptDentro=[]
conbodysFuera=[]
conbodysDentro=[]
# Get the root element (in the sample file it is <dita>, whose only child is <topic>).
raiz = doc.getroot()
print(raiz.tag)
print("\n")
# Find every <concept> that is a direct child of the root's first child (<topic>).
# findall("concept") with a bare tag name only matches direct children.
conceptAfuera = raiz[0].findall("concept")
print(conceptAfuera)
print("\n")
# This loop walks the outer concepts and gathers their direct children.
for i in range(len(conceptAfuera)):
# conbodysFuera: <conbody> children of the outer concepts (only one in the sample file).
conbodysFuera.extend(conceptAfuera[i].findall("conbody"))
# conceptDentro: <concept> elements nested directly inside the outer concepts.
conceptDentro.extend(conceptAfuera[i].findall("concept"))
# conceptDentro now holds every <concept> nested one level inside the outer concepts.
print("Etiquetas de conceptDentro: ",conceptDentro)
print("\n")
print("Etiquetas de conbodysFuera: ",conbodysFuera)
print("\n")
# Loop that collects every <conbody> inside the nested concepts.
for i in range(len(conceptDentro)):
conbodysDentro.extend(conceptDentro[i].findall("conbody"))
print("Etiquetas de conbodysDentro: ", conbodysDentro)
print("\n")
# Loop that renames the <p> children of the outer conbody elements.
# NOTE(review): findall("p") is called again on every pass, and elements that
# were already renamed no longer match, so the list shrinks while j grows.
# This only works here because the outer conbody in the sample has a single <p>.
for i in range(len(conbodysFuera)):
for j in range(len(conbodysFuera[i].findall("p"))):
conbodysFuera[i].findall("p")[j].tag="PARAGRAPH"
# Loop that renames the <p> children of the nested conbody elements.
# NOTE(review): with the inner loop commented out, j keeps the last value left
# by the previous loop (0 for the sample file), so only one <p> per nested
# conbody is renamed. With the inner loop enabled, the shrinking findall("p")
# list described above is what raises IndexError once a conbody has several <p>.
for i in range(len(conbodysDentro)):
#for j in range(len(conbodysDentro[i].findall("p"))):
conbodysDentro[i].findall("p")[j].tag="PARAGRAPH"
#print(etree.tostring(doc, pretty_print=True, xml_declaration=True, encoding="utf-8"))
# Serialize the modified tree to a new file.
doc.write("FerNewtags.xml")
As you can see, the inner `for` loop of the last cycle is commented out because it causes an `IndexError: list index out of range`.
Any thoughts?
Upvotes: 0
Views: 155
Reputation: 52858
I think you're overcomplicating it. Just find all of the `p` elements (with `.xpath()` or `.findall()`) and change the value of each element's `.tag` property:
from lxml import etree
# Load the whole XML document from disk.
document = etree.parse("73-20.xml")

# ".//p" matches every <p> descendant at any depth, so the nesting of
# <concept>/<conbody> elements does not have to be walked by hand.
# The matches are collected once, before any element is renamed.
paragraphs = document.findall(".//p")

for element in paragraphs:
    # Assigning to .tag renames the element in place; text and children are kept.
    # NOTE: the question asks for "PARAGRAPH"; this snippet (and the output
    # shown with it) uses lower-case "paragraph". Adjust the string as needed.
    element.tag = "paragraph"

# Write the modified tree to a new file, leaving the input untouched.
document.write("FerNewtags.xml")
Output (FerNewtags.xml)
<dita>
<topic id="id15CDB0PL09E">
<title id="id15CDB0R0VYB"><?FM MARKER [Header/Footer $1] All?>Control
</title>
<shortdesc>CONTROL</shortdesc>
<concept id="id15CDB0Q0Q4G">
<title id="id15CDB0R0VHA">General
</title>
<conbody>
<paragraph>This section
</paragraph>
</conbody>
<concept id="id156F7H00GIE">
<title id="id15CDB0R0V1W">System
</title>
<conbody>
<paragraph>Engine
</paragraph>
<paragraph>The ECU
</paragraph>
<paragraph>The aircraft
</paragraph>
<paragraph>The system
</paragraph>
</conbody>
</concept>
</concept>
</topic>
</dita>
Upvotes: 1