Reputation: 31
I have a very big dataset (about 20 GB) in xml format and I need to parse it into tidy tables to make some statistics. I want to save:
- attribute tag for tag controlfield
- attribute tag for tag datafield
- attribute code for tag subfield
- values that correspond to all these attributes/tags
Folowing example from here (R XML - combining parent and child nodes into data frame) I wrote my own script. It works correctly but it is very very slow... I have more than 1 million records so the script is not practical but I don't know how to optimize it.
Is there a way to improve the performance of my code or is there another way to solve the problem. For me it is not important what package to use (xml2 or xml or some other). What I really need is to convert my data into tidy tables as fast as possible.
Here is my data, some comments to data and the code.
Thank you for all possible help.
Data
library(xml2)
library(tidyverse)
df <- read_xml(
'<collection>
<record>
<controlfield tag="001">1</controlfield>
<controlfield tag="005">20180109004353.0</controlfield>
<datafield tag="020" ind1=" " ind2=" ">
<subfield code="a">9780821812419</subfield>
<subfield code="b">print</subfield>
</datafield>
<datafield tag="020" ind1=" " ind2=" ">
<subfield code="a">9780821899854</subfield>
<subfield code="b">online</subfield>
</datafield>
<datafield tag="024" ind1="7" ind2=" ">
<subfield code="2">DOI</subfield>
<subfield code="a">10.1090/memo/0041</subfield>
</datafield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="9">SPIRESTeX</subfield>
<subfield code="a">Wong:1961af</subfield>
</datafield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="9">MSNET</subfield>
<subfield code="a">0383311</subfield>
</datafield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="9">ZBLATT</subfield>
<subfield code="a">0124.13401</subfield>
</datafield>
<datafield tag="650" ind1="1" ind2="7">
<subfield code="2">INSPIRE</subfield>
<subfield code="a">Math and Math Physics</subfield>
</datafield>
<datafield tag="773" ind1=" " ind2=" ">
<subfield code="c">1-112</subfield>
<subfield code="p">Mem.Am.Math.Soc.</subfield>
<subfield code="v">41</subfield>
<subfield code="y">1961</subfield>
<subfield code="1">1214590</subfield>
</datafield>
<datafield tag="970" ind1=" " ind2=" ">
<subfield code="a">SPIRES-60</subfield>
</datafield>
<datafield tag="980" ind1=" " ind2=" ">
<subfield code="a">Book</subfield>
</datafield>
<datafield tag="980" ind1=" " ind2=" ">
<subfield code="a">Citeable</subfield>
</datafield>
<datafield tag="980" ind1=" " ind2=" ">
<subfield code="a">HEP</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">1</subfield>
<subfield code="s">Ann.Math.,43,161</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">2</subfield>
<subfield code="s">Am.Math.Mon.,53,136</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="h">E. Cartan</subfield>
<subfield code="m">Paris</subfield>
<subfield code="o">3</subfield>
<subfield code="t">Lecons sur la theorie des spineurs I</subfield>
<subfield code="y">1938</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">4</subfield>
<subfield code="s">Enseign.Math.,40,26</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="s">Annals Math.,47,510</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">6</subfield>
<subfield code="s">Comment.Math.Helv.,15,358</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">9</subfield>
<subfield code="s">Trans.Am.Math.Soc.,61,193</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">10</subfield>
<subfield code="s">Math.Ann.,88,1</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">12</subfield>
<subfield code="s">Math.Ann.,60,546</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">14</subfield>
<subfield code="s">Comment.Math.Helv.,21,261</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">17</subfield>
<subfield code="s">Abh.Math.Sem.Univ.Hamburg,1,1</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">18</subfield>
<subfield code="s">Am.J.Math.,67,472</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">20</subfield>
<subfield code="s">Comment.Math.Helv.,8,3</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="9">CURATOR</subfield>
<subfield code="o">21</subfield>
<subfield code="s">Tohoku Math.J.,3,322</subfield>
</datafield>
<datafield tag="909" ind1="C" ind2="O">
<subfield code="o">oai:inspirehep.net:1</subfield>
<subfield code="p">INSPIRE:HEP</subfield>
</datafield>
<datafield tag="961" ind1=" " ind2=" ">
<subfield code="x">2001-02-23</subfield>
<subfield code="c">2018-01-09</subfield>
</datafield>
<datafield tag="245" ind1=" " ind2=" ">
<subfield code="a">Isoclinic N planes in Euclidean 2N space, Clifford parallels in elliptic (2N-1) space, and the Hurwitz matrix equations</subfield>
</datafield>
<datafield tag="246" ind1=" " ind2=" ">
<subfield code="9">arXiv</subfield>
<subfield code="a">Isoclinic n-planes in Euclidean 2n-space, Clifford parallels in elliptic (2n-1)-space, and the Hurwitz matrix equations</subfield>
</datafield>
<datafield tag="100" ind1=" " ind2=" ">
<subfield code="a">Wong, Yung-Chow</subfield>
<subfield code="u">Hong Kong U.</subfield>
<subfield code="w">Y.C.Wong.1</subfield>
<subfield code="y">0</subfield>
<subfield code="z">908370</subfield>
</datafield>
<datafield tag="269" ind1=" " ind2=" ">
<subfield code="c">1961</subfield>
</datafield>
<datafield tag="300" ind1=" " ind2=" ">
<subfield code="a">121</subfield>
</datafield>
<datafield tag="500" ind1=" " ind2=" ">
<subfield code="a">Second printing with corrections 1971 (ISBN: 0-8218-1241-6).</subfield>
</datafield>
<datafield tag="520" ind1=" " ind2=" ">
<subfield code="a">Introduction Part I. Isoclinic nn-planes in E2nE2n and Clifford parallel</subfield>
</datafield>
</record>
<record>
<controlfield tag="001">17</controlfield>
<controlfield tag="005">20190325225547.0</controlfield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="9">SPIRESTeX</subfield>
<subfield code="a">Breymayer:1963bh</subfield>
</datafield>
<datafield tag="037" ind1=" " ind2=" ">
<subfield code="a">SLAC-TN-63-023</subfield>
</datafield>
<datafield tag="100" ind1=" " ind2=" ">
<subfield code="a">Breymayer, K.E.</subfield>
<subfield code="u">SLAC</subfield>
<subfield code="w">K.E.Breymayer.1</subfield>
<subfield code="y">0</subfield>
<subfield code="z">903206</subfield>
</datafield>
<datafield tag="245" ind1=" " ind2=" ">
<subfield code="a">SYSTEM INTERCONNECTION AND PROTECTION OF THE MODULATOR</subfield>
</datafield>
<datafield tag="269" ind1=" " ind2=" ">
<subfield code="c">1963-03</subfield>
</datafield>
<datafield tag="300" ind1=" " ind2=" ">
<subfield code="a">21</subfield>
</datafield>
<datafield tag="650" ind1="1" ind2="7">
<subfield code="2">INSPIRE</subfield>
</datafield>
<datafield tag="700" ind1=" " ind2=" ">
<subfield code="a">Mallory, Kenneth B.</subfield>
<subfield code="u">SLAC</subfield>
<subfield code="w">K.B.Mallory.1</subfield>
<subfield code="y">0</subfield>
<subfield code="z">903206</subfield>
</datafield>
<datafield tag="710" ind1=" " ind2=" ">
<subfield code="a">SLAC</subfield>
</datafield>
<datafield tag="856" ind1="4" ind2=" ">
<subfield code="u">http://www-public.slac.stanford.edu/sciDoc/docMeta.aspx?slacPubNumber=slac-tn-63-023</subfield>
<subfield code="w">slac-tn-63-023</subfield>
<subfield code="y">SLAC</subfield>
</datafield>
<datafield tag="856" ind1="4" ind2=" ">
<subfield code="s">1903536</subfield>
<subfield code="u">http://inspirehep.net/record/17/files/slac-tn-63-023.pdf</subfield>
</datafield>
<datafield tag="909" ind1="C" ind2="O">
<subfield code="o">oai:inspirehep.net:17</subfield>
<subfield code="p">INSPIRE:HEP</subfield>
</datafield>
<datafield tag="961" ind1=" " ind2=" ">
<subfield code="x">1963-03-01</subfield>
<subfield code="c">2019-03-25</subfield>
</datafield>
<datafield tag="970" ind1=" " ind2=" ">
<subfield code="a">SPIRES-329</subfield>
</datafield>
<datafield tag="980" ind1=" " ind2=" ">
<subfield code="a">HEP</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="6">
<subfield code="a">0-0-0-0-0-0-1</subfield>
<subfield code="t">2014-08-13 15:34:53</subfield>
<subfield code="v">Invenio/1.1.2.1260-aa76f refextract/1.5.44</subfield>
<subfield code="v">content.pdf;1</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="m">will open, thus o~nlng the ac cQxuit to the -</subfield>
<subfield code="o">1</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="m">2</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="m">Apply the trigger. If the high voltage is present, the klystron is pulsed and draws average current</subfield>
<subfield code="o">3</subfield>
<subfield code="t">relay comes on. In addI.tion, a light at the modulator comes on, indicating Klystron Current</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="m">no recycling</subfield>
<subfield code="o">3</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="m">removes the 260-6~0 vous l (2) Resetting: close circuit breaker only, no further action necessary if fault is cleared.</subfield>
<subfield code="o">3</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="m">% u --L--_ I ‘2 Q-4 -r---.‘-i-’I ___----_.--_.-_. -.L.-</subfield>
<subfield code="o">4</subfield>
</datafield>
</record>
<record>
<controlfield tag="001">18</controlfield>
<controlfield tag="005">20140813111316.0</controlfield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="a">Johnston:1977saa</subfield>
<subfield code="9">INSPIRETeX</subfield>
</datafield>
<datafield tag="037" ind1=" " ind2=" ">
<subfield code="a">SLAC-PUB-0341</subfield>
</datafield>
<datafield tag="100" ind1=" " ind2=" ">
<subfield code="a">Johnston, Lawrence II.</subfield>
<subfield code="u">SLAC</subfield>
<subfield code="w">L.H.Johnston.1</subfield>
<subfield code="y">0</subfield>
<subfield code="z">903206</subfield>
</datafield>
<datafield tag="245" ind1=" " ind2=" ">
<subfield code="a">A target beam monitor using secondary electron emission</subfield>
</datafield>
<datafield tag="269" ind1=" " ind2=" ">
<subfield code="c">1977-08</subfield>
</datafield>
<datafield tag="300" ind1=" " ind2=" ">
<subfield code="a">17</subfield>
</datafield>
<datafield tag="700" ind1=" " ind2=" ">
<subfield code="a">Murray, Joseph J.</subfield>
<subfield code="u">SLAC</subfield>
<subfield code="w">J.J.Murray.1</subfield>
<subfield code="y">0</subfield>
<subfield code="z">903206</subfield>
</datafield>
<datafield tag="700" ind1=" " ind2=" ">
<subfield code="a">Gearhart, Roger A.</subfield>
<subfield code="u">LBL, Berkeley</subfield>
<subfield code="w">R.Gearhart.1</subfield>
<subfield code="y">0</subfield>
<subfield code="z">902953</subfield>
</datafield>
<datafield tag="700" ind1=" " ind2=" ">
<subfield code="a">Hauser, Todd</subfield>
<subfield code="u">LBL, Berkeley</subfield>
<subfield code="w">T.Hauser.2</subfield>
<subfield code="y">0</subfield>
<subfield code="z">902953</subfield>
</datafield>
<datafield tag="700" ind1=" " ind2=" ">
<subfield code="a">Klein, Paul R.</subfield>
<subfield code="u">Purdue U.</subfield>
<subfield code="w">P.Klein.2</subfield>
<subfield code="y">0</subfield>
<subfield code="z">903142</subfield>
</datafield>
<datafield tag="856" ind1="4" ind2=" ">
<subfield code="w">0341</subfield>
<subfield code="y">SLACPUB</subfield>
<subfield code="u">http://www-public.slac.stanford.edu/sciDoc/docMeta.aspx?slacPubNumber=SLAC-PUB-0341</subfield>
</datafield>
<datafield tag="856" ind1="4" ind2=" ">
<subfield code="s">1110230</subfield>
<subfield code="u">http://inspirehep.net/record/18/files/slac-pub-0341.pdf</subfield>
</datafield>
<datafield tag="909" ind1="C" ind2="O">
<subfield code="o">oai:inspirehep.net:18</subfield>
<subfield code="p">INSPIRE:HEP</subfield>
</datafield>
<datafield tag="961" ind1=" " ind2=" ">
<subfield code="x">1977-01-01</subfield>
<subfield code="c">2014-08-13</subfield>
</datafield>
<datafield tag="970" ind1=" " ind2=" ">
<subfield code="a">SPIRES-337</subfield>
</datafield>
<datafield tag="980" ind1=" " ind2=" ">
<subfield code="a">HEP</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="s">Rev.Sci.Instrum.,26,229</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="s">IEEE Trans.Nucl.Sci.,12,935</subfield>
<subfield code="0">48647</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="s">Nucl.Instrum.Meth.,15,63</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="s">Phys.Rev.Lett.,18,366</subfield>
</datafield>
</record>
</collection>')
Comments to tags:
<controlfield tag="001">This is identifier of the record</controlfield>
<controlfield tag="005">This is time of last record update</controlfield>
My code
start_t <- Sys.time()
x1 <- df %>% xml_find_all('.//record')
x2 <- x1 %>%
xml_find_all('.//datafield') %>%
map_df(~list(rec = list(map(xml_find_all(.x, './/preceding-sibling::controlfield'), xml_text)),
attrs = list(xml_attr(.x, attr = 'tag')),
variable = list(map(xml_children(.x), xml_attrs)),
value = list(map(xml_children(.x), xml_text))
))
x2_tidy <- x2 %>% unnest(c(variable, value)) %>% mutate(rec = as.character(rec))
Sys.time() - start_t
Update My result is like this. Sorry, I can't figure out how to add text table here, so I add it like an image. Image with resulting table
Upvotes: 3
Views: 654
Reputation: 2722
Okay, so just because i dove down the rabbit hole here... below is working code. I don't know why you would want to structure your output data this way, though, and just in case you later decide on another formatting, I included a col for the name of the node the record(line/row) is associated with not record node
library(xml2)
for all of this: Here's a function to do the work on the attributes we need:
.attrs_targeted <- function(x){
ats <- xml_attrs(x)
ats_array <- unlist(ats)
hits <- grepl("tag|code", names(ats_array), ignore.case = TRUE, perl = TRUE)
ats <- ats_array[hits]
data.frame(
attrs = names(ats),
variable = ats,
row.names = NULL,
stringsAsFactors = FALSE
)
}
Here are the record nodes:
records <- xml_find_all(df, ".//record")
> records
{xml_nodeset (3)}
[1] <record>\n <controlfield tag="001">1</controlfield>\n <controlfield tag="005">20180109004353.0</controlfield>\n <datafield tag="020" ind1=" " ind2=" ">\n <subfield code="a">9780821812419</subfiel ...
[2] <record>\n <controlfield tag="001">17</controlfield>\n <controlfield tag="005">20190325225547.0</controlfield>\n <datafield tag="035" ind1=" " ind2=" ">\n <subfield code="9">SPIRESTeX</subfield>\ ...
[3] <record>\n <controlfield tag="001">18</controlfield>\n <controlfield tag="005">20140813111316.0</controlfield>\n <datafield tag="035" ind1=" " ind2=" ">\n <subfield code="a">Johnston:1977saa</sub ...
>
And here is our iterator:
library(dplyr)
library(jsonlite)
results <- rbind_pages(lapply(records, function(i){
# Controlfield nodes
controls <- xml_find_all(i, ".//controlfield")
# control attributes
ats <- .attrs_targeted(controls)
# control values
c_vals <- xml_text(controls)
# store on a data frame
d <- data.frame(
node_name = xml_name(controls),
record_id = c_vals[[1]],
edit_time = c_vals[[2]],
ats,
value = c_vals
)
# now to the datafields
dfields <- xml_find_all(i, ".//datafield")
# loop through the datafield nodes, and then the subfield nodes, which have code/values we want, but build and
# bind data.frames along the way
d_loop <- rbind_pages(lapply(dfields, function(j){
d_ats <- .attrs_targeted(j)
d2 <- data.frame(
node_name = xml_name(j),
d_ats,
value = NA
)
# Now target the subfields
sfields <- xml_find_all(j, ".//subfield")
d3 <- rbind_pages(Map(function(k){
data.frame(
node_name = xml_name(k),
.attrs_targeted(k),
value = xml_text(k)
)
}, sfields))
rbind_pages(list(d2, d3))
})) %>% mutate(record_id = c_vals[[1]], edit_time = c_vals[[2]]) # add the records back in
rbind_pages(list(d, d_loop))
}))
> head(results)
node_name record_id edit_time attrs variable value
1 controlfield 1 20180109004353.0 tag 001 1
2 controlfield 1 20180109004353.0 tag 005 20180109004353.0
3 datafield 1 20180109004353.0 tag 020 <NA>
4 subfield 1 20180109004353.0 code a 9780821812419
5 subfield 1 20180109004353.0 code b print
6 datafield 1 20180109004353.0 tag 020 <NA>
To peek in at the data:
results %>% filter(attrs == "code") %>% data.table::dcast(record_id ~ variable)
record_id 0 1 2 9 a b c h m o p s t u v w x y z
1 1 0 1 2 18 17 2 3 1 1 14 2 13 1 1 1 1 1 3 1
2 17 0 0 1 1 10 0 2 0 6 6 1 1 2 4 2 3 1 3 2
3 18 1 0 0 1 11 0 2 0 0 1 1 5 0 7 0 6 1 6 5
Upvotes: 1