Reputation: 331
I have xml like this:
<root>
<cards>
<meeting name="Punchestown (IRE)" id="195" diffusion_course_name="PUNCHESTOWN">
<race id="692415" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>12:25</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Adare Manor Opportunity Handicap Chase</title>
<type>C</type>
<distance>2m4f</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>10</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Handicap Chase</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="692416" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>1:00</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Total Event Rental (Kildare) Novice Chase (Grade 3)</title>
<type>C</type>
<distance>2m4f</distance>
<group>Grade 3</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>7</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Novice Chase Grade 3</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="692417" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>1:35</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Connolly's RED MILLS Amateur National (Q.R.) Handicap Chase</title>
<type>C</type>
<distance>3m1f</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>12</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Handicap Chase</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="692418" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>2:10</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Sky Bet Moscow Flyer Novice Hurdle (Grade 2)</title>
<type>H</type>
<distance>2m</distance>
<group>Grade 2</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>7</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Novice Hurdle Grade 2</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="692419" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>2:45</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Sportinglife.com Maiden Hurdle</title>
<type>H</type>
<distance>2m</distance>
<group/>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>17</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Maiden Hurdle</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="692420" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>3:20</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Leinster Leader Mares Handicap Hurdle</title>
<type>H</type>
<distance>2m4f40y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>8</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Handicap Hurdle</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="692421" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>3:50</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>David Trundley Artist At Punchestown Irish Stallion Farms EBF Mares Flat Race</title>
<type>B</type>
<distance>2m</distance>
<group/>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>14</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>NHF</raceDescription>
<tvText>ATR </tvText>
</race>
</meeting>
<meeting name="Warwick" id="85" diffusion_course_name="WARWICK">
<race id="691061" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>12:40</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Betfred Supports Jack Berry House Novices' Handicap Hurdle</title>
<type>H</type>
<distance>2m</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>18</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 4 Novice Handicap Hurdle</raceDescription>
<tvText>RUK </tvText>
<betOffers>
<betOffer>WH</betOffer>
</betOffers>
</race>
<race id="691060" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>1:15</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Betfred Mobile Edward Courage Cup Handicap Chase</title>
<type>C</type>
<distance>2m54y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>7</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 3 Handicap Chase</raceDescription>
<tvText>RUK </tvText>
<betOffers>
<betOffer>LB</betOffer>
<betOffer>WH</betOffer>
</betOffers>
</race>
<race id="691058" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>1:50</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Betfred Home Of Goals Galore Hampton Novices' Chase (Listed Race)</title>
<type>C</type>
<distance>3m</distance>
<group>Listed</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>5</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 1 Novice Chase Listed</raceDescription>
<tvText>ITV4 </tvText>
<betOffers>
<betOffer>Coral</betOffer>
</betOffers>
</race>
<race id="691059" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>2:25</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Pertemps Network Handicap Hurdle (Series Qualifier)</title>
<type>H</type>
<distance>3m1f</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>12</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 2 Handicap Hurdle</raceDescription>
<tvText>ITV4 </tvText>
<betOffers>
<betOffer>LB</betOffer>
<betOffer>WH</betOffer>
<betOffer>Coral</betOffer>
</betOffers>
</race>
<race id="691057" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>3:00</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Ballymore Leamington Novices' Hurdle (Grade 2)</title>
<type>H</type>
<distance>2m5f</distance>
<group>Grade 2</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>6</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 1 Novice Hurdle Grade 2</raceDescription>
<tvText>ITV4 </tvText>
<betOffers>
<betOffer>WH</betOffer>
<betOffer>Coral</betOffer>
</betOffers>
</race>
<race id="691056" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>3:35</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Betfred Classic Handicap Chase (Grade 3)</title>
<type>C</type>
<distance>3m5f54y</distance>
<group>Grade 3 Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>15</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 1 Handicap Chase Grade 3</raceDescription>
<tvText>ITV4 </tvText>
<betOffers>
<betOffer>LB</betOffer>
<betOffer>WH</betOffer>
<betOffer>Coral</betOffer>
<betOffer>PP</betOffer>
</betOffers>
</race>
<race id="691062" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>4:05</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Betfred TV "Newcomers" Standard Open National Hunt Flat Race</title>
<type>B</type>
<distance>2m</distance>
<group/>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>9</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 5 NHF</raceDescription>
<tvText>RUK </tvText>
<betOffers>
<betOffer>WH</betOffer>
</betOffers>
</race>
</meeting>
<meeting name="Wetherby" id="87" diffusion_course_name="WETHERBY">
<race id="691067" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>12:30</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Racing UK Jump To It Novices' Hurdle</title>
<type>H</type>
<distance>2m3f154y</distance>
<group/>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>9</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 4 Novice Hurdle</raceDescription>
<tvText>RUK </tvText>
</race>
<race id="691066" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>1:05</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Racing UK In Stunning HD "Confined" Novices' Chase</title>
<type>C</type>
<distance>2m3f85y</distance>
<group/>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>7</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 4 Novice Chase</raceDescription>
<tvText>RUK </tvText>
</race>
<race id="691068" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>1:40</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Bet At racinguk.com Handicap Hurdle</title>
<type>H</type>
<distance>2m</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>9</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 4 Handicap Hurdle</raceDescription>
<tvText>RUK </tvText>
</race>
<race id="691063" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>2:15</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>totescoop6 Play Today Handicap Chase</title>
<type>C</type>
<distance>1m7f36y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>5</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 2 Handicap Chase</raceDescription>
<tvText>RUK </tvText>
</race>
<race id="691064" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>2:50</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>totescoop6 Results On totepoolliveinfo.com Handicap Hurdle</title>
<type>H</type>
<distance>2m3f154y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>11</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 3 Handicap Hurdle</raceDescription>
<tvText>RUK </tvText>
</race>
<race id="691065" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>3:25</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Book Now For Medieval Day - 3rd February Handicap Chase (Northern Lights Middle Distance Series)</title>
<type>C</type>
<distance>2m3f85y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>7</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 4 Handicap Chase</raceDescription>
<tvText>RUK </tvText>
<betOffers>
<betOffer>LB</betOffer>
</betOffers>
</race>
<race id="691069" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>3:55</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Racing UK On Sky 432 Fillies' "Junior" Standard Open National Hunt Flat Race</title>
<type>B</type>
<distance>1m4f77y</distance>
<group/>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>8</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 5 NHF</raceDescription>
<tvText>RUK </tvText>
</race>
</meeting>
<meeting name="Wolverhampton (AW)" id="513" diffusion_course_name="WOLVERHAMPTON">
<race id="691141" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>5:45</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Bet &amp; Watch At sunbets.co.uk Apprentice Handicap</title>
<type>X</type>
<distance>1m142y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>13</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 6 Handicap</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="691136" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>6:15</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>sunbets.co.uk Handicap</title>
<type>X</type>
<distance>1m142y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>9</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 4 Handicap</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="691140" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>6:45</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Betway Live Casino Handicap</title>
<type>X</type>
<distance>2m120y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>13</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 6 Handicap</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="691138" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>7:15</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Betway Casino Handicap (Div I)</title>
<type>X</type>
<distance>6f20y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>13</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 5 Handicap</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="692653" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>7:45</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Betway Casino Handicap (Div II)</title>
<type>X</type>
<distance>6f20y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>12</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 5 Handicap</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="691139" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>8:15</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>Betway Novice Stakes</title>
<type>X</type>
<distance>5f21y</distance>
<group/>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>6</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 5 Novice</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="691137" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>8:45</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>32Red.com Handicap</title>
<type>X</type>
<distance>5f21y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>7</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 4 Handicap</raceDescription>
<tvText>ATR </tvText>
</race>
<race id="691142" perform_race_id="" perform_race_id_atr="" details_available="1" race_status_code="R">
<time>9:15</time>
<date>2018-01-13</date>
<ampm>pm</ampm>
<title>32Red Casino Handicap</title>
<type>X</type>
<distance>1m1f104y</distance>
<group>Handicap</group>
<tipsAllowed>1</tipsAllowed>
<predictorAllowed>1</predictorAllowed>
<bettingLink>1</bettingLink>
<declaredRunners>11</declaredRunners>
<liveCommentary>1</liveCommentary>
<liveTab>1</liveTab>
<raceDescription>Class 6 Handicap</raceDescription>
<tvText>ATR </tvText>
</race>
</meeting>
</cards>
</root>
I can get the data I want, which is essentially the race data (child nodes), by running this R code:
# Build one row per <race>: three race attributes plus every child element
# of the race as its own column, bound side by side.
# NOTE(review): `date` is not defined in this snippet - presumably assigned
# earlier by the asker; confirm before running.
CardList=cbind(
date,
# race-level attributes, one XPath pass per attribute
data.frame(raceid=xpathSApply(CardList_tmp, "//meeting/race", xmlGetAttr, 'id')),
data.frame(cards=xpathSApply(CardList_tmp, "//meeting/race", xmlGetAttr, 'details_available')),
data.frame(status=xpathSApply(CardList_tmp, "//meeting/race", xmlGetAttr, 'race_status_code')),
# child elements of each <race> converted to data frame columns
xmlToDataFrame(nodes = getNodeSet(CardList_tmp, "//meeting/race"))
)
However it doesn't contain the meeting data which is held at the parent attribute level:
# Meeting-level attributes: these queries return one value per <meeting>,
# not one per <race>, so they do not line up row-for-row with the race data.
course = xpathSApply(CardList_tmp, "//meeting", xmlGetAttr, 'name')
cid = xpathSApply(CardList_tmp, "//meeting", xmlGetAttr, 'id')
Is there a way I can combine the two sets of code to produce one data frame in a single step?
Upvotes: 2
Views: 3277
Reputation: 43334
Here are some options using xml2 for XML handling and the tidyverse for munging. The attributes (xml_attrs
returns a named character vector), node names, and node values can be read into a three-element list that can be coerced to a data frame:
library(tidyverse)
library(xml2)
# Parse the XML file into an xml2 document.
x <- read_xml('races.xml')
# For every <race> node collect three list-columns: its attributes (a named
# character vector), the names of its child elements, and their text values.
races <- x %>%
xml_find_all('//race') %>%
# each piece is wrapped in list() so map_dfr() produces one row per race
map_dfr(~list(attrs = list(xml_attrs(.x)),
variable = list(map(xml_children(.x), xml_name)),
value = list(map(xml_children(.x), xml_text))))
races
#> # A tibble: 29 x 3
#> attrs variable value
#> <list> <list> <list>
#> 1 <chr [5]> <list [15]> <list [15]>
#> 2 <chr [5]> <list [15]> <list [15]>
#> 3 <chr [5]> <list [15]> <list [15]>
#> 4 <chr [5]> <list [15]> <list [15]>
#> 5 <chr [5]> <list [15]> <list [15]>
#> 6 <chr [5]> <list [15]> <list [15]>
#> 7 <chr [5]> <list [15]> <list [15]>
#> 8 <chr [5]> <list [16]> <list [16]>
#> 9 <chr [5]> <list [16]> <list [16]>
#> 10 <chr [5]> <list [16]> <list [16]>
#> # ... with 19 more rows
which can in turn be cleaned up with a lot of tidyr:
# Reshape the list-columns of `races` into one column per attribute and one
# column per child element.
# NOTE(review): this uses the older tidyr interface (multi-column unnest()
# with `.drop`, and spread(), which is superseded by pivot_wider()); confirm
# behaviour against the installed tidyr version.
races_tidy <- races %>%
# pull the attribute names out so they can become column headers
mutate(attr_names = map(attrs, names)) %>%
unnest(attr_names, attrs, .drop = FALSE) %>%
spread(attr_names, attrs) %>%
# variable/value were built with map(), i.e. lists of lists, hence two passes
unnest(variable, value) %>%
unnest(variable, value) %>%
spread(variable, value) %>%
type_convert() # fix variable types
This works, but the unnesting and spreading is fragile. Writing a more robust method is actually not too much more work, though, as you can just arrange the list columns before unnesting:
# Arrange each list-column as a tibble first, so a single unnest() lays
# everything out as columns without any spread() step.
races_tidy2 <- races %>%
# attrs: named character vector -> list -> tibble with one column per attribute
mutate(attrs = map(attrs, ~as_tibble(as.list(.x))),
# data: child values (.y) named by the child element names (.x)
data = map2(variable, value, ~as_tibble(set_names(.y, .x)))) %>%
unnest(attrs, data, .drop = TRUE) %>%
type_convert()
The most direct approach is to do the rearranging while iterating over the nodes. This is the most concise and likely the most efficient approach, but writing it correctly relies on careful manipulation of the data structures, so producing working code may take longer.
# Build one flat named list per <race> - its attributes followed by the text
# of each child element keyed by the element name - and let map_dfr()
# row-bind those lists.
races_tidy3 <- x %>%
xml_find_all('//race') %>%
map_dfr(~flatten(c(xml_attrs(.x),
map(xml_children(.x),
# inside this inner lambda, .x is the child node, not the race
~set_names(as.list(xml_text(.x)), xml_name(.x)))))) %>%
type_convert()
races_tidy3
#> # A tibble: 29 x 21
#> id perf… perf… deta… race… time date ampm title type dist…
#> <int> <chr> <chr> <int> <chr> <tim> <date> <chr> <chr> <chr> <chr>
#> 1 692415 <NA> <NA> 1 R 12:25 2018-01-13 pm Adar… C 2m4f
#> 2 692416 <NA> <NA> 1 R 01:00 2018-01-13 pm Tota… C 2m4f
#> 3 692417 <NA> <NA> 1 R 01:35 2018-01-13 pm Conn… C 3m1f
#> 4 692418 <NA> <NA> 1 R 02:10 2018-01-13 pm Sky … H 2m
#> 5 692419 <NA> <NA> 1 R 02:45 2018-01-13 pm Spor… H 2m
#> 6 692420 <NA> <NA> 1 R 03:20 2018-01-13 pm Lein… H 2m4f…
#> 7 692421 <NA> <NA> 1 R 03:50 2018-01-13 pm Davi… B 2m
#> 8 691061 <NA> <NA> 1 R 12:40 2018-01-13 pm Betf… H 2m
#> 9 691060 <NA> <NA> 1 R 01:15 2018-01-13 pm Betf… C 2m54y
#> 10 691058 <NA> <NA> 1 R 01:50 2018-01-13 pm Betf… C 3m
#> # ... with 19 more rows, and 10 more variables: group <chr>, tipsAllowed
#> # <int>, predictorAllowed <int>, bettingLink <int>, declaredRunners
#> # <int>, liveCommentary <int>, liveTab <int>, raceDescription <chr>,
#> # tvText <chr>, betOffers <chr>
All return the same data, though column order is different for races_tidy
.
# races_tidy has a different column order from the other two results, so it
# is compared with all_equal(), which reports TRUE here despite that.
all_equal(races_tidy, races_tidy2)
#> [1] TRUE
# races_tidy2 and races_tidy3 are checked with the strict identical().
identical(races_tidy2, races_tidy3)
#> [1] TRUE
Upvotes: 5
Reputation: 107567
Alternatively, consider XSLT, the special-purpose language designed specifically to transform XML files, for example into flatter, simpler ones for your R needs. R can run XSLT 1.0 scripts with the xslt
third-party package (extension of xml2
).
But also, XSLT is portable and can be run even outside R with Java, Python, PHP, or dedicated executables such as Saxon and Xalan. Below shows a system
call to Unix's xsltproc. There is a similar batch call available for Windows. Once simplified, pass the new XML to the XML package's xmlToDataFrame
.
Specifically, XSLT below parses down to race level and pulls meeting data from parent node.
XSLT (save as .xsl, a well-formed .xml file)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Flattens the document to one <race> element per race, with the parent
     meeting's id/name and the race's own attributes turned into child
     elements. -->
<xsl:output method="xml" omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*"/>
<!-- Keep the <cards> wrapper and process only its <meeting> children. -->
<xsl:template match="/root/cards">
<xsl:copy>
<xsl:apply-templates select="meeting"/>
</xsl:copy>
</xsl:template>
<!-- A meeting writes no element of its own; only its races are processed,
     which removes the meeting level from the output. -->
<xsl:template match="meeting">
<xsl:apply-templates select="race"/>
</xsl:template>
<!-- Each race is copied with: meeting id and name read from its ancestor
     meeting, then its attributes as elements, then its original children. -->
<xsl:template match="race">
<xsl:copy>
<meeting_id><xsl:value-of select="ancestor::meeting/@id"/></meeting_id>
<meeting_name><xsl:value-of select="ancestor::meeting/@name"/></meeting_name>
<xsl:apply-templates select="@*"/>
<xsl:copy-of select="*"/>
</xsl:copy>
</xsl:template>
<!-- Turn every race attribute into an element with the same name and value. -->
<xsl:template match="race/@*">
<xsl:element name="{name(.)}"><xsl:value-of select="."/></xsl:element>
</xsl:template>
</xsl:stylesheet>
R
library(XML)
library(xml2)   # provides read_xml(), which the xslt package operates on
library(xslt)

# LOAD XML AND XSL
# read_xml() takes the file path directly; `package =` belongs to
# system.file(), not read_xml(), so it has been dropped.
input <- read_xml("/path/to/input.xml")
style <- read_xml("/path/to/xslt_script.xsl")

# TRANSFORM INPUT INTO OUTPUT
new_xml <- xml_xslt(input, style)
output <- as.character(new_xml)

# PARSE OUTPUT FROM STRING
doc <- xmlParse(output, asText = TRUE)

# COMMAND LINE CALL TO UNIX'S XSLTPROC (ALTERNATIVE TO xslt PACKAGE)
# Usage: xsltproc -o <output> <stylesheet> <input>
# -o names the file to WRITE, so it must be the output path; pointing it at
# the input file would overwrite the source document.
system("xsltproc -o /path/to/output.xml /path/to/xslt_script.xsl /path/to/input.xml")
doc <- xmlParse("/path/to/output.xml")

# BUILD DATAFRAME
# One row per flattened <race>; each child element becomes a column.
df <- xmlToDataFrame(doc, nodes = getNodeSet(doc, "//race"))
Upvotes: 0
Reputation: 107567
Consider parsing meeting data by node index and expand it to the number of its child race elements, then column bind with race data:
doc <- xmlParse("/path/to/Source.xml")

# MEETING NODES AND THEIR COUNT
meetings <- getNodeSet(doc, "//meeting")
mtg_num <- length(meetings)

# DATAFRAME LIST OF EXPANDED MEETING ATTRS
# Iterate over the meeting nodes themselves instead of re-querying the whole
# document with "//meeting[i]" for each index: a positional predicate counts
# position within the parent, and seq(0) would yield c(1, 0) for an empty
# document. Each meeting's id/name is repeated once per child <race> so the
# rows line up with the race-level data below.
meeting_list <- lapply(meetings, function(mtg) {
  races_num <- length(getNodeSet(mtg, "./race"))
  data.frame(
    # NA default keeps the column well-formed if an attribute is missing
    meeting_id = rep(xmlGetAttr(mtg, "id", NA_character_), races_num),
    meeting_name = rep(xmlGetAttr(mtg, "name", NA_character_), races_num)
  )
})

# COLUMN BIND MEETING NODES, RACE NODES, AND RACE ATTRS
# All three pieces follow document order of "//meeting/race", so rows align.
# Note: xmlAttrsToDataFrame is not exported by XML, hence the ::: access.
final_df <- cbind(do.call(rbind, meeting_list),
                  xmlToDataFrame(nodes = getNodeSet(doc, "//meeting/race")),
                  XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//meeting/race")))
Output
head(final_df)
# meeting_id meeting_name time date ampm title type distance group tipsAllowed predictorAllowed
# 1 195 Punchestown (IRE) 12:25 2018-01-13 pm Adare Manor Opportunity Handicap Chase C 2m4f Handicap 1 1
# 2 195 Punchestown (IRE) 1:00 2018-01-13 pm Total Event Rental (Kildare) Novice Chase (Grade 3) C 2m4f Grade 3 1 1
# 3 195 Punchestown (IRE) 1:35 2018-01-13 pm Connolly's RED MILLS Amateur National (Q.R.) Handicap Chase C 3m1f Handicap 1 1
# 4 195 Punchestown (IRE) 2:10 2018-01-13 pm Sky Bet Moscow Flyer Novice Hurdle (Grade 2) H 2m Grade 2 1 1
# 5 195 Punchestown (IRE) 2:45 2018-01-13 pm Sportinglife.com Maiden Hurdle H 2m 1 1
# 6 195 Punchestown (IRE) 3:20 2018-01-13 pm Leinster Leader Mares Handicap Hurdle H 2m4f40y Handicap 1 1
# bettingLink declaredRunners liveCommentary liveTab raceDescription tvText betOffers id perform_race_id perform_race_id_atr details_available race_status_code
# 1 1 10 1 1 Handicap Chase ATR <NA> 692415 1 R
# 2 1 7 1 1 Novice Chase Grade 3 ATR <NA> 692416 1 R
# 3 1 12 1 1 Handicap Chase ATR <NA> 692417 1 R
# 4 1 7 1 1 Novice Hurdle Grade 2 ATR <NA> 692418 1 R
# 5 1 17 1 1 Maiden Hurdle ATR <NA> 692419 1 R
# 6 1 8 1 1 Handicap Hurdle ATR <NA> 692420 1 R
Upvotes: 1