Reputation: 65
I am in the process of stripping sensitive data from a couple million XML documents. How can I add a try/except to get around this error, which seems to occur because a couple of the XMLs in the bunch are malformed?
xml.parsers.expat.ExpatError: mismatched tag: line 1, column 28691
#!/usr/bin/python
import sys
from xml.dom import minidom
def getCleanString(word):
    """Return ``word`` with every character that cannot be UTF-8 encoded removed.

    Each character is test-encoded individually; characters whose encoding
    raises a ``UnicodeError`` are dropped and all others are kept in order.

    NOTE(review): on Python 2 with a byte-string input, ``.encode`` first
    decodes implicitly as ASCII, so non-ASCII bytes are the ones dropped.
    On Python 3 the characters dropped are those that UTF-8 cannot
    represent, such as lone surrogates. Confirm which interpreter this
    runs on.

    :param word: the text to filter; an empty string yields an empty string.
    :returns: the filtered text, of the same string type as the characters
        of ``word``.
    """
    kept = []
    for character in word:
        try:
            character.encode('utf-8')
        except UnicodeError:
            # Only encoding failures mean "unclean"; any other exception is
            # a real bug and must not be silently swallowed.
            continue
        kept.append(character)
    # Join once instead of growing a string with repeated concatenation.
    return "".join(kept)
def parsedelete(content, tag_name='RI_RI51_ChPtIncAcctNumber'):
    """Parse ``content`` as XML, remove every ``tag_name`` element, and
    return the serialized document.

    :param content: the XML document as a string.
    :param tag_name: name of the elements to strip; defaults to the
        account-number tag this script was written for.
    :returns: the result of ``toxml()`` on the modified document.
    :raises xml.parsers.expat.ExpatError: if ``content`` is not well-formed
        XML (propagated unchanged from ``minidom.parseString``).
    """
    dom = minidom.parseString(content)
    try:
        for element in dom.getElementsByTagName(tag_name):
            element.parentNode.removeChild(element)
        return dom.toxml()
    finally:
        # Break the DOM's internal reference cycles right away so that
        # processing a very large number of documents does not rely on the
        # garbage collector to reclaim each tree.
        dom.unlink()
# Read "partition,id,xml" records from stdin, strip the sensitive element
# from the XML payload, and emit the record tab-separated on stdout.
for line in sys.stdin:
    # The former `line > 1` guard compared a string with an int: always true
    # on Python 2 and a TypeError on Python 3. It is dropped; the field-count
    # check below already skips blank and short lines.
    fields = line.strip().split(',', 2)
    if len(fields) <= 2:
        continue
    # Distinct names avoid rebinding the builtin `id` and the module-level
    # name `xml`, which would shadow the `xml` package if it is imported.
    partition, record_id, raw_xml = fields
    cleaned_xml = parsedelete(getCleanString(raw_xml))
    sys.stdout.write('%s\t%s\t%s\n' % (partition, record_id, cleaned_xml))
Upvotes: 1
Views: 3084
Reputation: 77367
Catching exceptions is straightforward. Add import xml to your import statements and wrap the problem code in a try/except handler.
def parsedelete(content, tag_name='RI_RI51_ChPtIncAcctNumber'):
    """Parse ``content`` as XML, remove every ``tag_name`` element, and
    return the serialized document.

    Malformed XML does not raise: the parser's error message is returned
    as a string in place of the document.

    :param content: the XML document as a string.
    :param tag_name: name of the elements to strip; defaults to the
        account-number tag this script was written for.
    :returns: the result of ``toxml()`` on the modified document, or
        ``str(error)`` when ``content`` is not well-formed.
    """
    # Imported here under its own name so the handler does not depend on a
    # global called `xml`, which a calling script may rebind to a string.
    from xml.parsers.expat import ExpatError

    try:
        dom = minidom.parseString(content)
    except ExpatError as e:
        # not sure how you want to handle the error... so just passing back as string
        return str(e)
    try:
        for element in dom.getElementsByTagName(tag_name):
            element.parentNode.removeChild(element)
        return dom.toxml()
    finally:
        # Release the DOM's reference cycles promptly; this matters when
        # the function is called for a very large number of documents.
        dom.unlink()
Upvotes: 2