Reputation: 31
I have the following XML file, which I want to convert to CSV using Python.
<?xml version="1.0" encoding="UTF-8"?><households xmlns:s="http://www.mediametrie.fr/nge/ " xmlns:xalan="http://xml.apache.org/xalan" date="2015-04-06" creation_date="2015-04-08T03:48:34">
<household id="10003456">
<destinations/>
<members>
<member id="1">
<member_process result="KO" vacation="undefined">
<individual_audience>
<individual_audience_tvset id="1">
<channel session="5647128" begin="56435" end="76896"/>
</individual_audience_tvset>
</individual_audience>
<alarms>
<alarm id="Alarm_id_1" rule_id="Rule_id_1">
<parameters>
<parameter name="tvset_id" value="1"/>
<parameter name="length" value="46384"/>
<parameter name="end" value="2017-04-06T20:30:00"/>
<parameter name="channel" value="1010128"/>
</parameters>
</alarm>
</alarms>
</member_process>
</member>
<member id="2">
<member_process result="KO" vacation="undefined">
<individual_audience>
<individual_audience_tvset id="1">
<channel session="5674897" begin="98765" end="76543"/>
</individual_audience_tvset>
</individual_audience>
<alarms>
<alarm id="Alarm_id_2" rule_id="Rule_id_2">
<parameters>
<parameter name="tvset_id" value="1"/>
<parameter name="length" value="56745"/>
<parameter name="end" value="2017-04-06T20:30:00"/>
<parameter name="channel" value="4563256"/>
</parameters>
</alarm>
</alarms>
</member_process>
</member>
<member id="3">
<member_process result="KO" vacation="undefined">
<individual_audience>
<individual_audience_tvset id="1">
<channel session="1010128" begin="47218" end="93600"/>
</individual_audience_tvset>
</individual_audience>
<alarms>
<alarm id="AL_R_INDP_AUDIENCE_TOO_HIGH_LIMIT" rule_id="R_INDP_AUDIENCE_TOO_HIGH_LIMIT">
<parameters>
<parameter name="tvset_id" value="1"/>
<parameter name="length" value="46382"/>
<parameter name="end" value="2015-04-06T20:30:00"/>
<parameter name="channel" value="1010128"/>
</parameters>
</alarm>
</alarms>
</member_process>
</member>
<member id="4">
<member_process result="KO" vacation="undefined">
<individual_audience>
<individual_audience_tvset id="1">
<channel session="1010128" begin="47219" end="93600"/>
</individual_audience_tvset>
</individual_audience>
<alarms>
<alarm id="AL_R_INDP_AUDIENCE_TOO_HIGH_LIMIT" rule_id="R_INDP_AUDIENCE_TOO_HIGH_LIMIT">
<parameters>
<parameter name="tvset_id" value="1"/>
<parameter name="length" value="46381"/>
<parameter name="end" value="2015-04-06T20:30:00"/>
<parameter name="channel" value="1010128"/>
</parameters>
</alarm>
</alarms>
</member_process>
</member>
<member id="5">
<member_process result="KO" vacation="undefined">
<individual_audience>
<individual_audience_tvset id="1">
<channel session="1010128" begin="47220" end="93600"/>
</individual_audience_tvset>
</individual_audience>
<alarms>
<alarm id="AL_R_INDP_AUDIENCE_TOO_HIGH_LIMIT" rule_id="R_INDP_AUDIENCE_TOO_HIGH_LIMIT">
<parameters>
<parameter name="tvset_id" value="1"/>
<parameter name="length" value="46380"/>
<parameter name="end" value="2015-04-06T20:30:00"/>
<parameter name="channel" value="1010128"/>
</parameters>
</alarm>
</alarms>
</member_process>
</member>
<member id="6">
<member_process result="KO" vacation="undefined">
<individual_audience>
<individual_audience_tvset id="1">
<channel session="1010128" begin="47221" end="93600"/>
</individual_audience_tvset>
</individual_audience>
<alarms>
<alarm id="AL_R_INDP_AUDIENCE_TOO_HIGH_LIMIT" rule_id="R_INDP_AUDIENCE_TOO_HIGH_LIMIT">
<parameters>
<parameter name="tvset_id" value="1"/>
<parameter name="length" value="46379"/>
<parameter name="end" value="2015-04-06T20:30:00"/>
<parameter name="channel" value="1010128"/>
</parameters>
</alarm>
</alarms>
</member_process>
</member>
</members>
<regular_guests/>
<occasional_guests/>
<tvsets>
<tvset id="1">
<tvset_process result="OK">
<tvset_audience>
<channel session="47" begin="46304" end="46384"/>
<channel session="1010483" begin="46384" end="46419"/>
<channel session="235" begin="46419" end="46424"/>
<channel session="1010128" begin="46424" end="93600"/>
</tvset_audience>
<alarms>
<alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING">
<parameters>
<parameter name="unique_id" value="4547"/>
<parameter name="reboot_date" value="2015-04-06T07:17:44"/>
<parameter name="length" value="1.6221180555555557"/>
</parameters>
</alarm>
<alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING">
<parameters>
<parameter name="unique_id" value="4566"/>
<parameter name="reboot_date" value="2015-04-07T13:17:54"/>
<parameter name="length" value="1.2313657407407406"/>
</parameters>
</alarm>
<alarm id="AL_T_P_TECH_ID_RESOL_FALSE_POSITIVE" rule_id="T_P_TECH_ID_RESOL">
<parameters>
<parameter name="channel_id" value="194"/>
<parameter name="unique_id" value="4549"/>
</parameters>
</alarm>
</alarms>
</tvset_process>
</tvset>
</tvsets>
<household_process result="KO" vacation="no">
<alarms>
<alarm id="AL_T_FP_AUDIENCE_WITHOUT_PRESENCE" rule_id="T_FP_AUDIENCE_WITHOUT_PRESENCE">
<parameters>
<parameter name="tvset_id" value="1"/>
<parameter name="length" value="80"/>
<parameter name="start" value="2015-04-06T07:21:44"/>
</parameters>
</alarm>
<alarm id="AL_T_FP_AUDIENCE_WITHOUT_PRESENCE" rule_id="T_FP_AUDIENCE_WITHOUT_PRESENCE">
<parameters>
<parameter name="tvset_id" value="1"/>
<parameter name="length" value="792"/>
<parameter name="start" value="2015-04-06T07:23:44"/>
</parameters>
</alarm>
<alarm id="AL_R_FP_AUDIENCE_TOO_HIGH_LIMIT" rule_id="R_FP_AUDIENCE_TOO_HIGH_LIMIT">
<parameters>
<parameter name="tvset_id" value="1"/>
<parameter name="length" value="47176"/>
<parameter name="end" value="2015-04-06T20:30:00"/>
<parameter name="channel" value="1010128"/>
</parameters>
</alarm>
<alarm id="AL_R_FP_AT_LEAST_ONE_MEMBER_OK" rule_id="R_FP_AT_LEAST_ONE_MEMBER_OK">
<parameters/>
</alarm>
</alarms>
</household_process>
</household>
</households>
The output should look something like this:
household id, destinations, member id, result, vacation, individual_audience_tvset id, session, begin, end, alarm id, rule_id, name, value
10003456, None, 1, KO, undefined, 1, 5647128, 56435, 76896, Alarm_id_1, Rule_id_1, tvset_id, 1
10003456, None, 1, KO, undefined, 1, 5647128, 56435, 76896, Alarm_id_1, Rule_id_1, length, 46384
10003456, None, 1, KO, undefined, 1, 5647128, 56435, 76896, Alarm_id_1, Rule_id_1, end, 2017-04-06T20:30:00
10003456, None, 1, KO, undefined, 1, 5647128, 56435, 76896, Alarm_id_1, Rule_id_1, channel, 1010128
Similarly for member id = 2, with the same household id.
Any help is highly appreciated. Thank you in advance!
Upvotes: 0
Views: 2352
Reputation: 46759
This assumes you have your XML in a file called input.xml. BeautifulSoup can be used to help with parsing the XML read from the file. You then just need to create a table containing all the information that you wish to extract:
from bs4 import BeautifulSoup
import csv
# Convert the <member> entries of input.xml into rows of output.csv.
#
# One CSV row is written per <member>. For each tag listed in
# ``member_fields`` only the FIRST matching descendant of the member is
# used (``find`` returns the first match), so a member with several
# <parameter> elements still yields a single row.

# Columns filled from the <household> and <member> elements themselves.
# "destinations" is never populated below, so it is written as an empty
# column (DictWriter fills missing keys with an empty string).
fields = [
    "household id",
    "destinations",
    "member id",
]

# Columns read from descendants of each <member>, as
# [CSV column name, XML tag to look up, attribute to read from that tag].
member_fields = [
    ["result", "member_process", "result"],
    ["vacation", "member_process", "vacation"],
    ["individual_audience_tvset id", "individual_audience_tvset", "id"],
    ["session", "channel", "session"],
    ["begin", "channel", "begin"],
    ["end", "channel", "end"],
    ["alarm id", "alarm", "id"],
    ["rule_id", "alarm", "rule_id"],
    ["name", "parameter", "name"],
    ["value", "parameter", "value"],
]

fieldnames = fields + [field for field, _, _ in member_fields]

# The sample document declares encoding="UTF-8", so read it as UTF-8 rather
# than relying on the platform default. newline='' is what the csv module
# expects for files it writes to.
with open('input.xml', encoding='utf-8') as f_input, \
        open('output.csv', 'w', newline='', encoding='utf-8') as f_output:
    csv_output = csv.DictWriter(f_output, fieldnames=fieldnames)
    csv_output.writeheader()

    xml = f_input.read()
    soup = BeautifulSoup(xml, "xml")

    # Walk every <household> so that each member is paired with the id of
    # its own household, not with the id of the first household in the file.
    for household in soup.find_all('household'):
        household_id = household['id']

        for member in household.find_all('member'):
            row = {'household id': household_id, 'member id': member['id']}

            for field, tag, attribute in member_fields:
                element = member.find(tag)
                # A member may lack some of the elements (e.g. no <alarm>);
                # leave the column empty instead of failing on None.
                if element is not None:
                    row[field] = element.get(attribute, '')

            csv_output.writerow(row)
This would create output.csv, with one row per member, containing:
household id,destinations,member id,result,vacation,individual_audience_tvset id,session,begin,end,alarm id,rule_id,name,value
10003456,,1,KO,undefined,1,5647128,56435,76896,Alarm_id_1,Rule_id_1,tvset_id,1
10003456,,2,KO,undefined,1,5674897,98765,76543,Alarm_id_2,Rule_id_2,tvset_id,1
10003456,,3,KO,undefined,1,1010128,47218,93600,AL_R_INDP_AUDIENCE_TOO_HIGH_LIMIT,R_INDP_AUDIENCE_TOO_HIGH_LIMIT,tvset_id,1
10003456,,4,KO,undefined,1,1010128,47219,93600,AL_R_INDP_AUDIENCE_TOO_HIGH_LIMIT,R_INDP_AUDIENCE_TOO_HIGH_LIMIT,tvset_id,1
10003456,,5,KO,undefined,1,1010128,47220,93600,AL_R_INDP_AUDIENCE_TOO_HIGH_LIMIT,R_INDP_AUDIENCE_TOO_HIGH_LIMIT,tvset_id,1
10003456,,6,KO,undefined,1,1010128,47221,93600,AL_R_INDP_AUDIENCE_TOO_HIGH_LIMIT,R_INDP_AUDIENCE_TOO_HIGH_LIMIT,tvset_id,1
Upvotes: 2