Reputation: 316
I have an xml dataset in the below format.
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE population SYSTEM "http://www.matsim.org/files/dtd/population_v5.dtd">
<population>
<!-- ====================================================================== -->
<person id="10000061">
<plan score="219.62581874242716" selected="yes">
<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" />
<leg mode="ride" dep_time="15:07:00" trav_time="00:03:27" arr_time="15:10:27">
<route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route>
</leg>
<act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" />
<leg mode="car" dep_time="16:21:00" trav_time="00:09:44" arr_time="16:30:44">
<route type="links">13904 21207 21208 13980 21187 21188 14148 14144 14130 14129</route>
</leg>
<act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" />
<leg mode="ride" dep_time="17:37:00" trav_time="00:09:46" arr_time="17:46:46">
<route type="links">14129 14143 14147 14161 14171 14189 14195 14120 14106 14051 13941 13938 13976 14044 21259 21258</route>
</leg>
<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" />
</plan>
<plan score="218.9756035020247" selected="no">
<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" />
<leg mode="ride" dep_time="15:07:00" trav_time="00:03:26" arr_time="15:10:26">
<route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route>
</leg>
<act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" />
<leg mode="car" dep_time="16:21:00" trav_time="00:08:46" arr_time="16:29:46">
<route type="links">13904 13905 13891 13855 21239 21240 13887 13885 13869 13870 13920 13974 14070 14075 14103 14109 14123 14129</route>
</leg>
<act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" />
<leg mode="ride" dep_time="17:37:00" trav_time="00:11:06" arr_time="17:48:06">
<route type="links">14129 14143 14147 14161 14150 14098 14094 14095 14113 14106 14051 13941 13938 13976 14044 21259 21258</route>
</leg>
<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" />
</plan>
<plan score="218.5148700010285" selected="no">
<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" />
<leg mode="ride" dep_time="15:07:00" trav_time="00:03:26" arr_time="15:10:26">
<route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route>
</leg>
<act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" />
<leg mode="car" dep_time="16:21:00" trav_time="00:08:15" arr_time="16:29:15">
<route type="links">13904 13905 13906 13980 21187 21188 14148 14144 14130 14129</route>
</leg>
<act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" />
<leg mode="ride" dep_time="17:37:00" trav_time="00:11:18" arr_time="17:48:18">
<route type="links">14129 14130 14124 14110 14104 14077 14071 13975 13921 13871 13868 13884 13886 13888 13894 13904 13918 13924 13938 13976 14044 21259 21258</route>
</leg>
<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" />
</plan>
</person>
<!-- ====================================================================== -->
<person id="10000302">
<plan score="209.66504470021556" selected="yes">
<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" />
<leg mode="car" dep_time="07:56:00" trav_time="00:03:00" arr_time="07:59:00">
<route type="links">21256 13966 14056 14057</route>
</leg>
<act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" />
<leg mode="car" dep_time="10:28:00" trav_time="00:08:20" arr_time="10:36:20">
<route type="links">14057 14049 14045 13977 13939 13925 13919 21207 21208 13980 14046 14095 21191</route>
</leg>
<act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" />
<leg mode="car" dep_time="11:52:00" trav_time="00:08:33" arr_time="12:00:33">
<route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route>
</leg>
<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" />
<leg mode="car" dep_time="12:11:00" trav_time="00:06:35" arr_time="12:17:35">
<route type="links">21256 21257 21258 14045 13977 13939 13925 13919 13905 13906</route>
</leg>
<act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" />
<leg mode="car" dep_time="13:30:00" trav_time="00:05:30" arr_time="13:35:30">
<route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route>
</leg>
<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" />
</plan>
<plan score="205.5456839457717" selected="no">
<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" />
<leg mode="car" dep_time="07:56:00" trav_time="00:02:15" arr_time="07:58:15">
<route type="links">21256 13966 14056 14057</route>
</leg>
<act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" />
<leg mode="car" dep_time="10:28:00" trav_time="00:06:51" arr_time="10:34:51">
<route type="links">14057 14056 14177 14191 14214 14247 14235 14213 14211 14198 14120 14114 21191</route>
</leg>
<act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" />
<leg mode="car" dep_time="11:52:00" trav_time="00:07:45" arr_time="11:59:45">
<route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route>
</leg>
<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" />
<leg mode="car" dep_time="12:11:00" trav_time="00:07:51" arr_time="12:18:51">
<route type="links">21256 13915 13823 13767 13743 13731 13732 13837 13831 13819 13820 13854 13890 13906</route>
</leg>
<act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" />
<leg mode="car" dep_time="13:30:00" trav_time="00:08:54" arr_time="13:38:54">
<route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route>
</leg>
<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" />
</plan>
<plan score="203.4205865037132" selected="no">
<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" />
<leg mode="car" dep_time="07:56:00" trav_time="00:03:15" arr_time="07:59:15">
<route type="links">21256 13966 14056 14057</route>
</leg>
<act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" />
<leg mode="car" dep_time="10:28:00" trav_time="00:06:41" arr_time="10:34:41">
<route type="links">14057 14049 14045 13977 13939 13940 14050 14105 14114 21191</route>
</leg>
<act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" />
<leg mode="car" dep_time="11:52:00" trav_time="00:09:12" arr_time="12:01:12">
<route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route>
</leg>
<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" />
<leg mode="car" dep_time="12:11:00" trav_time="00:05:10" arr_time="12:16:10">
<route type="links">21256 13966 14049 14045 13977 13939 13925 13919 13905 13906</route>
</leg>
<act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" />
<leg mode="car" dep_time="13:30:00" trav_time="00:05:30" arr_time="13:35:30">
<route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route>
</leg>
<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" />
</plan>
</person>
<!-- ====================================================================== -->
</population>
From this XML data, I am trying to generate a dataframe with the following structure.
person score selected act.typ act.x act.y act_start act_end leg.mod leg_dep leg_trav leg_arr
10000061 219.6258 yes home 334867.2 3126571 3:00:00 15:07:00 ride 15:07:00 0:03:27 15:10:27
10000061 219.6258 yes shop 332634.9 3127079 15:12:00 16:21:00 car 16:21:00 0:09:44 16:30:44
10000061 219.6258 yes shop 331666.4 3129306 16:25:00 17:37:00 ride 17:37:00 0:09:46 17:46:46
10000061 219.6258 yes home 334867.2 3126571 17:45:00 26:59:00 NA NA NA NA
10000302 209.665 yes home 334598.4 3126269 3:00:00 7:56:00 car 7:56:00 0:03:00 7:59:00
10000302 209.665 yes work 335957.1 3128105 8:04:00 10:28:00 car 10:28:00 0:08:20 10:36:20
10000302 209.665 yes social 333032.8 3128760 10:33:00 11:52:00 car 11:52:00 0:08:33 12:00:33
10000302 209.665 yes home 334598.4 3126269 11:59:00 12:11:00 car 12:11:00 0:06:35 12:17:35
10000302 209.665 yes social 332302.2 3127536 12:17:00 13:30:00 car 13:30:00 0:05:30 13:35:30
10000302 209.665 yes home 334598.4 3126269 13:36:00 26:59:00 NA NA NA NA
I have looked at the XML library and various functions like XpathApply
and xmlGetAttr
and was able to retrieve individual lists and dataframes at each node level. However, as you can see, I need to connect these different subsets of data into one single dataframe by maintaining the parent-child relation. Additionally, I only want to select those parent and child attributes where the attribute selected value is "yes". I also do not want the values of the final node i.e. route. I have pasted only two individual's information here. The actual data has information on 3 million individuals.
Upvotes: 0
Views: 104
Reputation: 107567
Whenever dealing with complex XML files, consider an XSLT solution. XSLT is a declarative, special-purpose language (same type as SQL) that is designed to transform XML documents. Because datasets require the two dimensions of rows and columns, XSLT can transform nested nodes and attributes to meet this structure.
And while R does not have a universal XSLT library, R can call external programs using system()
to process XSLT transformations. Below is the open-source Python example but practically any general purpose (Java, C#, PHP, Perl, even Excel VBA) language, dedicated executable processors (Xalan and Saxon), or command line programs (PowerShell, Bash) can process XSLT.
XSLT Script (save as .xsl or .xslt)
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output version="1.0" encoding="UTF-8" indent="yes" />
<xsl:strip-space elements="*"/>
<xsl:template match="/">
<data>
<xsl:apply-templates select="*"/>
</data>
</xsl:template>
<xsl:template match="person|plan">
<xsl:apply-templates />
</xsl:template>
<xsl:template match="act">
<person>
<id><xsl:value-of select="ancestor::person/@id"/></id>
<score><xsl:value-of select="ancestor::plan/@score"/></score>
<selected><xsl:value-of select="ancestor::plan/@selected"/></selected>
<type><xsl:value-of select="@type"/></type>
<link><xsl:value-of select="@link"/></link>
<x><xsl:value-of select="@x"/></x>
<y><xsl:value-of select="@y"/></y>
<start_time><xsl:value-of select="@start_time"/></start_time>
<end_time><xsl:value-of select="@end_time"/></end_time>
<mode><xsl:value-of select="following-sibling::leg/@mode"/></mode>
<dep_time><xsl:value-of select="following-sibling::leg/@dep_time"/></dep_time>
<trav_time><xsl:value-of select="following-sibling::leg/@trav_time"/></trav_time>
<arr_time><xsl:value-of select="following-sibling::leg/@arr_time"/></arr_time>
</person>
</xsl:template>
<xsl:template match="route"/>
</xsl:transform>
Python Script (using lxml module, parsing above script)
import lxml.etree as ET
dom = ET.parse('Input.xml'))
xslt = ET.parse('XSLTScript.xsl'))
transform = ET.XSLT(xslt)
newdom = transform(dom)
tree_out = ET.tostring(newdom, encoding='UTF-8', pretty_print=True, xml_declaration=True)
xmlfile = open('Output.xml','wb')
xmlfile.write(tree_out)
xmlfile.close()
XML Transformed Output
<?xml version='1.0' encoding='UTF-8'?>
<data>
<person>
<id>10000061</id>
<score>219.62581874242716</score>
<selected>yes</selected>
<type>home</type>
<link>21258</link>
<x>334867.243653</x>
<y>3126570.70778</y>
<start_time>03:00:00</start_time>
<end_time>15:07:00</end_time>
<mode>ride</mode>
<dep_time>15:07:00</dep_time>
<trav_time>00:03:27</trav_time>
<arr_time>15:10:27</arr_time>
</person>
<person>
<id>10000061</id>
<score>219.62581874242716</score>
<selected>yes</selected>
<type>shop</type>
<link>13904</link>
<x>332634.86999</x>
<y>3127078.96383</y>
<start_time>15:12:00</start_time>
<end_time>16:21:00</end_time>
<mode>car</mode>
<dep_time>16:21:00</dep_time>
<trav_time>00:09:44</trav_time>
<arr_time>16:30:44</arr_time>
</person>
<person>
<id>10000061</id>
<score>219.62581874242716</score>
<selected>yes</selected>
<type>shop</type>
<link>14129</link>
<x>331666.364904</x>
<y>3129306.48785</y>
<start_time>16:25:00</start_time>
<end_time>17:37:00</end_time>
<mode>ride</mode>
<dep_time>17:37:00</dep_time>
<trav_time>00:09:46</trav_time>
<arr_time>17:46:46</arr_time>
</person>
...
R Script (calling above Python script at command line)
library(XML)
system('python "C:/Path/To/Python/Script.py"') # TRANSFORMS INPUT
plans <- xmlParse('C:/Path/To/Transformed/Output.xml') # PARSES OUTPUT
df <- xmlToDataFrame(nodes = getNodeSet(plans, "//person")) # MIGRATES OUTPUT TO DF
Upvotes: 1
Reputation: 316
Since posting this question, I did some research and came across this post and used it to come up with an answer. Below is my code. I am sure, this could be done in a much efficient way. As such, I will not accept this as the final answer until someone takes a look and approves of it or provides a better alternative.
library(XML)
file1 <- "C:/Users/s/Desktop/plans.xml"
plans <- xmlParse(file1)
idNodes <- getNodeSet(plans, "//person[@id]")
ids <- lapply(idNodes, function(x) xmlAttrs(x)['id'])
attribact <- vector("list", length(ids)*14)
k <- 1
for (i in 1:length(ids))
{
act <- xpathApply(idNodes[[i]], path = paste("//person[@id=", as.numeric(ids[[i]]),"]//plan[@selected='yes']//act|leg", sep=""), xmlAttrs)
for (j in 1:length(act))
{
attribact[[k]] <- c(act[[j]],ids[[i]])
k <- k+1
}
}
attribact <- attribact[attribact!='NULL']
k1 <- t(as.data.frame(attribact))
attribleg <- vector("list", length(ids)*10)
k <- 1
for (i in 1:length(ids))
{
leg <- xpathApply(idNodes[[i]], path = paste("//person[@id=", as.numeric(ids[[i]]),"]//plan[@selected='yes']//leg", sep=""), xmlAttrs)
leg[[length(leg)+1]] <- c(NA,NA,NA,NA)
for (j in 1:length(leg))
{
attribleg[[k]] <- c(leg[[j]],ids[[i]])
k <- k+1
}
}
attribleg <- attribleg[attribleg!='NULL']
k2 <- t(as.data.frame(attribleg))
data1 <- cbind(k1,k2)
Upvotes: 0