Reputation: 529
I have a xml file like this which is a series in the following DF
userid | fid | response
-----------------------
1125 | 58940 | xml1
3344 | 47839 | xml2
3455 | 12335 | xml3
The response column contains xml files like this
HTTP/1.1 100 Continue
HTTP/1.1 200 OK
Expires: 0
Buffer: false
Pragma: No-cache
Cache-Control: no-cache
Server: IBM_CICS_Transaction_Server/4.1.0(zOS)
Connection: close
Content-Type: text/html
Content-Length: 33842
Date: Sat, 02 Aug 2014 09:27:02 GMT
<?xml version="1.0" encoding="UTF-8"?><creditBureau xmlns="http://www.transunion.com/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber>
This is just a part of the entire document. I have to parse this big xml and convert it into json. The first problem I am having is to parse this file. My current code looks like this:
import pandas as pd
import re
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
file = f.read()
file_list = file.split('\n')
file_list[12] = '<root>'
file_list.append('</root>')
start = file_list.index('<root>')
new_list = file_list[start:]
#Converting to String
str1 = ''.join(new_list)
f = open("tocng.xml","w")
f.write(str1)
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
### Gives an error:XML or text declaration not at start of entity: line 1, column 6
I don't understand what is the problem here.
Upvotes: 1
Views: 1178
Reputation: 5372
import pandas as pd
import re
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
file = f.read()
file_list = file.split('\n')
file_list[13] = '<root>' ### 12 to 13 to overwrite deformed <creditBureau ...>
file_list.append('</root>')
start = file_list.index('<root>')
new_list = file_list[start:]
#Converting to String
str1 = ''.join(new_list)
f = open("tocng.xml","w")
f.write(str1)
f.close() ### close file handle so ET can read it
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
Couple of issues:
<root>
tag. Removed declarations.""
which deforms them. Works without the declarations.If you want to use the regex I offered in a previous post then try this as it will remove the header without any line count needed. <root>
then stats from index 1.
import pandas as pd
import re
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
file = f.read()
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
file_list = file.split('\n')
file_list[1] = '<root>'
file_list.append('</root>')
start = file_list.index('<root>')
new_list = file_list[start:]
#Converting to String
str1 = ''.join(new_list)
f = open("tocng.xml","w")
f.write(str1)
f.close() ### close file handle so ET can read it
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
More variations in the xml files may need adjustment to the code.
import pandas as pd
import re
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
file = f.read()
# Replace up to <?xml tag.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
# Strip file and add \n at each instance of >.
file = file.strip()
file = file.replace('>', '>\n')
# Split file and make a list with no empty items.
file_list = file.split('\n')
file_list = [item for item in file_list if item != '']
# Remove known xml declarations.
if file_list[0][:5] == '<?xml':
del file_list[0]
if file_list[0][:13] == '<creditBureau':
del file_list[0]
# Add root tags.
file_list.insert(0, '<root>')
file_list.append('</root>')
#Converting to String
str1 = ''.join(file_list)
print(str1) ## See output in my answer
with open("tocng.xml","w") as f:
f.write(str1)
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
Output:
<root><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber></transactionControl></root>
Edit: If <creditBureau...>
is needed, then remove or comment these lines:
if file_list[0][:13] == '<creditBureau':
del file_list[0]
The last xml posted does not look malformed with the initial tags so no changes done to handle it.
Using regex with no use of lists to handle the xml string as the xml could be multiline or single line.
In the customize_xml function, regex comments show group numbers and the
pattern modes which you can pass as arguments to the customize_xml function.
Valid mode arguments are one of None, -1, 0, 1, 2, 3, 4
.
import pandas as pd
import re
def customize_xml(content, mode=0):
'''Customizes xml tags in the content and may insert a <root> tag.'''
# No modification.
if mode in (-1, None):
return content
# Select a pattern (mode) that modifies the whole xml.
pattern = (r'\2\3\4', # 0. <cB>...</cB>
r'<root>\2\3\4</root>', # 1. <root><cB>...</cB><root>
r'\1<root>\2\3\4</root>', # 2. <?xml?><root><cB>...</cB><root>
r'<root>\3</root>', # 3. <root>...<root>
r'\1<root>\3</root>', # 4. <?xml?><root>...<root>
)
# Groups are marked as \1 \2 ... to use for pattern above.
content = re.sub(r'(<\?xml.+?\?>)' # \1
'(<creditBureau.*?>)' # \2
'(.+?)' # \3
'(</creditBureau>)' # \4
, pattern[mode], content, flags=re.S)
return content
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
with open("testfile.txt", "w") as f:
f.write(df.loc[0][2])
with open("testfile.txt") as f:
file = f.read()
# Remove characters up to <?xml tag.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
# Make xml one single line if not already.
file = file.replace('\n', '')
file = customize_xml(file, 3)
# Write customized xml.
with open("tocng.xml", "w") as f:
f.write(file)
# Parsing xml.
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
# Print pretty xml from xml string.
from xml.dom import minidom
pretty_xml = minidom.parseString(file).toprettyxml(indent=" ")
print(pretty_xml)
Added a pretty print at end. Optional, just for reviewing the result.
Upvotes: 1