StackQuestions
StackQuestions

Reputation: 31

Parsing xml data with R. The code is very slow. How to make it faster?

I have a very big dataset (about 20 GB) in xml format and I need to parse it into tidy tables to make some statistics. I want to save:
- attribute tag for tag controlfield
- attribute tag for tag datafield
- attribute code for tag subfield
- values that correspond to all these attributes/tags

Folowing example from here (R XML - combining parent and child nodes into data frame) I wrote my own script. It works correctly but it is very very slow... I have more than 1 million records so the script is not practical but I don't know how to optimize it.
Is there a way to improve the performance of my code or is there another way to solve the problem. For me it is not important what package to use (xml2 or xml or some other). What I really need is to convert my data into tidy tables as fast as possible. Here is my data, some comments to data and the code.

Thank you for all possible help.

Data

library(xml2)
library(tidyverse)
df <- read_xml(
'<collection>
<record>
  <controlfield tag="001">1</controlfield>
  <controlfield tag="005">20180109004353.0</controlfield>
  <datafield tag="020" ind1=" " ind2=" ">
    <subfield code="a">9780821812419</subfield>
    <subfield code="b">print</subfield>
  </datafield>
  <datafield tag="020" ind1=" " ind2=" ">
    <subfield code="a">9780821899854</subfield>
    <subfield code="b">online</subfield>
  </datafield>
  <datafield tag="024" ind1="7" ind2=" ">
    <subfield code="2">DOI</subfield>
    <subfield code="a">10.1090/memo/0041</subfield>
  </datafield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="9">SPIRESTeX</subfield>
    <subfield code="a">Wong:1961af</subfield>
  </datafield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="9">MSNET</subfield>
    <subfield code="a">0383311</subfield>
  </datafield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="9">ZBLATT</subfield>
    <subfield code="a">0124.13401</subfield>
  </datafield>
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="2">INSPIRE</subfield>
    <subfield code="a">Math and Math Physics</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="c">1-112</subfield>
    <subfield code="p">Mem.Am.Math.Soc.</subfield>
    <subfield code="v">41</subfield>
    <subfield code="y">1961</subfield>
    <subfield code="1">1214590</subfield>
  </datafield>
  <datafield tag="970" ind1=" " ind2=" ">
    <subfield code="a">SPIRES-60</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">Book</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">Citeable</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">HEP</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">1</subfield>
    <subfield code="s">Ann.Math.,43,161</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">2</subfield>
    <subfield code="s">Am.Math.Mon.,53,136</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="h">E. Cartan</subfield>
    <subfield code="m">Paris</subfield>
    <subfield code="o">3</subfield>
    <subfield code="t">Lecons sur la theorie des spineurs I</subfield>
    <subfield code="y">1938</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">4</subfield>
    <subfield code="s">Enseign.Math.,40,26</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="s">Annals Math.,47,510</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">6</subfield>
    <subfield code="s">Comment.Math.Helv.,15,358</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">9</subfield>
    <subfield code="s">Trans.Am.Math.Soc.,61,193</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">10</subfield>
    <subfield code="s">Math.Ann.,88,1</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">12</subfield>
    <subfield code="s">Math.Ann.,60,546</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">14</subfield>
    <subfield code="s">Comment.Math.Helv.,21,261</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">17</subfield>
    <subfield code="s">Abh.Math.Sem.Univ.Hamburg,1,1</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">18</subfield>
    <subfield code="s">Am.J.Math.,67,472</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">20</subfield>
    <subfield code="s">Comment.Math.Helv.,8,3</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="9">CURATOR</subfield>
    <subfield code="o">21</subfield>
    <subfield code="s">Tohoku Math.J.,3,322</subfield>
  </datafield>
  <datafield tag="909" ind1="C" ind2="O">
    <subfield code="o">oai:inspirehep.net:1</subfield>
    <subfield code="p">INSPIRE:HEP</subfield>
  </datafield>
  <datafield tag="961" ind1=" " ind2=" ">
    <subfield code="x">2001-02-23</subfield>
    <subfield code="c">2018-01-09</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">Isoclinic N planes in Euclidean 2N space, Clifford parallels in elliptic (2N-1) space, and the Hurwitz matrix equations</subfield>
  </datafield>
  <datafield tag="246" ind1=" " ind2=" ">
    <subfield code="9">arXiv</subfield>
    <subfield code="a">Isoclinic n-planes in Euclidean 2n-space, Clifford parallels in elliptic (2n-1)-space, and the Hurwitz matrix equations</subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Wong, Yung-Chow</subfield>
    <subfield code="u">Hong Kong U.</subfield>
    <subfield code="w">Y.C.Wong.1</subfield>
    <subfield code="y">0</subfield>
    <subfield code="z">908370</subfield>
  </datafield>
  <datafield tag="269" ind1=" " ind2=" ">
    <subfield code="c">1961</subfield>
  </datafield>
  <datafield tag="300" ind1=" " ind2=" ">
    <subfield code="a">121</subfield>
  </datafield>
  <datafield tag="500" ind1=" " ind2=" ">
    <subfield code="a">Second printing with corrections 1971 (ISBN: 0-8218-1241-6).</subfield>
  </datafield>
  <datafield tag="520" ind1=" " ind2=" ">
    <subfield code="a">Introduction Part I. Isoclinic nn-planes in E2nE2n and Clifford parallel</subfield>
  </datafield>
</record>

<record>
  <controlfield tag="001">17</controlfield>
  <controlfield tag="005">20190325225547.0</controlfield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="9">SPIRESTeX</subfield>
    <subfield code="a">Breymayer:1963bh</subfield>
  </datafield>
  <datafield tag="037" ind1=" " ind2=" ">
    <subfield code="a">SLAC-TN-63-023</subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Breymayer, K.E.</subfield>
    <subfield code="u">SLAC</subfield>
    <subfield code="w">K.E.Breymayer.1</subfield>
    <subfield code="y">0</subfield>
    <subfield code="z">903206</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">SYSTEM INTERCONNECTION AND PROTECTION OF THE MODULATOR</subfield>
  </datafield>
  <datafield tag="269" ind1=" " ind2=" ">
    <subfield code="c">1963-03</subfield>
  </datafield>
  <datafield tag="300" ind1=" " ind2=" ">
    <subfield code="a">21</subfield>
  </datafield>
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="2">INSPIRE</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Mallory, Kenneth B.</subfield>
    <subfield code="u">SLAC</subfield>
    <subfield code="w">K.B.Mallory.1</subfield>
    <subfield code="y">0</subfield>
    <subfield code="z">903206</subfield>
  </datafield>
  <datafield tag="710" ind1=" " ind2=" ">
    <subfield code="a">SLAC</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="u">http://www-public.slac.stanford.edu/sciDoc/docMeta.aspx?slacPubNumber=slac-tn-63-023</subfield>
    <subfield code="w">slac-tn-63-023</subfield>
    <subfield code="y">SLAC</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">1903536</subfield>
    <subfield code="u">http://inspirehep.net/record/17/files/slac-tn-63-023.pdf</subfield>
  </datafield>
  <datafield tag="909" ind1="C" ind2="O">
    <subfield code="o">oai:inspirehep.net:17</subfield>
    <subfield code="p">INSPIRE:HEP</subfield>
  </datafield>
  <datafield tag="961" ind1=" " ind2=" ">
    <subfield code="x">1963-03-01</subfield>
    <subfield code="c">2019-03-25</subfield>
  </datafield>
  <datafield tag="970" ind1=" " ind2=" ">
    <subfield code="a">SPIRES-329</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">HEP</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="6">
    <subfield code="a">0-0-0-0-0-0-1</subfield>
    <subfield code="t">2014-08-13 15:34:53</subfield>
    <subfield code="v">Invenio/1.1.2.1260-aa76f refextract/1.5.44</subfield>
    <subfield code="v">content.pdf;1</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="m">will open, thus o~nlng the ac cQxuit to the -</subfield>
    <subfield code="o">1</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="m">2</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="m">Apply the trigger. If the high voltage is present, the klystron is pulsed and draws average current</subfield>
    <subfield code="o">3</subfield>
    <subfield code="t">relay comes on. In addI.tion, a light at the modulator comes on, indicating Klystron Current</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="m">no recycling</subfield>
    <subfield code="o">3</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="m">removes the 260-6~0 vous l (2) Resetting: close circuit breaker only, no further action necessary if fault is cleared.</subfield>
    <subfield code="o">3</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="m">% u --L--_ I ‘2 Q-4 -r---.‘-i-’I ___----_.--_.-_. -.L.-</subfield>
    <subfield code="o">4</subfield>
  </datafield>
</record>

<record>
  <controlfield tag="001">18</controlfield>
  <controlfield tag="005">20140813111316.0</controlfield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="a">Johnston:1977saa</subfield>
    <subfield code="9">INSPIRETeX</subfield>
  </datafield>
  <datafield tag="037" ind1=" " ind2=" ">
    <subfield code="a">SLAC-PUB-0341</subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Johnston, Lawrence II.</subfield>
    <subfield code="u">SLAC</subfield>
    <subfield code="w">L.H.Johnston.1</subfield>
    <subfield code="y">0</subfield>
    <subfield code="z">903206</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">A target beam monitor using secondary electron emission</subfield>
  </datafield>
  <datafield tag="269" ind1=" " ind2=" ">
    <subfield code="c">1977-08</subfield>
  </datafield>
  <datafield tag="300" ind1=" " ind2=" ">
    <subfield code="a">17</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Murray, Joseph J.</subfield>
    <subfield code="u">SLAC</subfield>
    <subfield code="w">J.J.Murray.1</subfield>
    <subfield code="y">0</subfield>
    <subfield code="z">903206</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Gearhart, Roger A.</subfield>
    <subfield code="u">LBL, Berkeley</subfield>
    <subfield code="w">R.Gearhart.1</subfield>
    <subfield code="y">0</subfield>
    <subfield code="z">902953</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Hauser, Todd</subfield>
    <subfield code="u">LBL, Berkeley</subfield>
    <subfield code="w">T.Hauser.2</subfield>
    <subfield code="y">0</subfield>
    <subfield code="z">902953</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Klein, Paul R.</subfield>
    <subfield code="u">Purdue U.</subfield>
    <subfield code="w">P.Klein.2</subfield>
    <subfield code="y">0</subfield>
    <subfield code="z">903142</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="w">0341</subfield>
    <subfield code="y">SLACPUB</subfield>
    <subfield code="u">http://www-public.slac.stanford.edu/sciDoc/docMeta.aspx?slacPubNumber=SLAC-PUB-0341</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">1110230</subfield>
    <subfield code="u">http://inspirehep.net/record/18/files/slac-pub-0341.pdf</subfield>
  </datafield>
  <datafield tag="909" ind1="C" ind2="O">
    <subfield code="o">oai:inspirehep.net:18</subfield>
    <subfield code="p">INSPIRE:HEP</subfield>
  </datafield>
  <datafield tag="961" ind1=" " ind2=" ">
    <subfield code="x">1977-01-01</subfield>
    <subfield code="c">2014-08-13</subfield>
  </datafield>
  <datafield tag="970" ind1=" " ind2=" ">
    <subfield code="a">SPIRES-337</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">HEP</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="s">Rev.Sci.Instrum.,26,229</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="s">IEEE Trans.Nucl.Sci.,12,935</subfield>
    <subfield code="0">48647</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="s">Nucl.Instrum.Meth.,15,63</subfield>
  </datafield>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="s">Phys.Rev.Lett.,18,366</subfield>
  </datafield>
</record>
</collection>')

Comments to tags:
<controlfield tag="001">This is identifier of the record</controlfield>
<controlfield tag="005">This is time of last record update</controlfield>

My code

start_t <- Sys.time()
x1 <- df %>% xml_find_all('.//record')

x2 <- x1 %>% 
    xml_find_all('.//datafield') %>% 
    map_df(~list(rec      = list(map(xml_find_all(.x, './/preceding-sibling::controlfield'), xml_text)),
                 attrs    = list(xml_attr(.x, attr = 'tag')), 
                 variable = list(map(xml_children(.x), xml_attrs)), 
                 value    = list(map(xml_children(.x), xml_text))
                 ))

x2_tidy <- x2 %>% unnest(c(variable, value)) %>% mutate(rec = as.character(rec))
Sys.time() - start_t

Update My result is like this. Sorry, I can't figure out how to add text table here, so I add it like an image. Image with resulting table

Upvotes: 3

Views: 654

Answers (1)

Carl Boneri
Carl Boneri

Reputation: 2722

Okay, so just because i dove down the rabbit hole here... below is working code. I don't know why you would want to structure your output data this way, though, and just in case you later decide on another formatting, I included a col for the name of the node the record(line/row) is associated with not record node

  • Note i am using library(xml2) for all of this:

Here's a function to do the work on the attributes we need:

.attrs_targeted <- function(x){
  ats <- xml_attrs(x)
  ats_array <- unlist(ats)
  hits <- grepl("tag|code", names(ats_array), ignore.case = TRUE, perl = TRUE)
  ats <- ats_array[hits]
  data.frame(
    attrs = names(ats),
    variable = ats,
    row.names = NULL,
    stringsAsFactors = FALSE
  )
}

Here are the record nodes:

records <- xml_find_all(df, ".//record")
> records
{xml_nodeset (3)}
[1] <record>\n  <controlfield tag="001">1</controlfield>\n  <controlfield tag="005">20180109004353.0</controlfield>\n  <datafield tag="020" ind1=" " ind2=" ">\n    <subfield code="a">9780821812419</subfiel ...
[2] <record>\n  <controlfield tag="001">17</controlfield>\n  <controlfield tag="005">20190325225547.0</controlfield>\n  <datafield tag="035" ind1=" " ind2=" ">\n    <subfield code="9">SPIRESTeX</subfield>\ ...
[3] <record>\n  <controlfield tag="001">18</controlfield>\n  <controlfield tag="005">20140813111316.0</controlfield>\n  <datafield tag="035" ind1=" " ind2=" ">\n    <subfield code="a">Johnston:1977saa</sub ...
> 

And here is our iterator:

library(dplyr)
library(jsonlite)

results <- rbind_pages(lapply(records, function(i){
  # Controlfield nodes
  controls <- xml_find_all(i, ".//controlfield")
  # control attributes
  ats <- .attrs_targeted(controls)
  # control values
  c_vals <- xml_text(controls)
  # store on a data frame
  d <- data.frame(
    node_name = xml_name(controls),
    record_id = c_vals[[1]], 
    edit_time = c_vals[[2]],
    ats, 
    value = c_vals
  )
  # now to the datafields
  dfields <- xml_find_all(i, ".//datafield")
  # loop through the datafield nodes, and then the subfield nodes, which have code/values we want, but build and 
  # bind data.frames along the way
  d_loop <- rbind_pages(lapply(dfields, function(j){
    d_ats <- .attrs_targeted(j)
    d2 <- data.frame(
      node_name = xml_name(j),
      d_ats,
      value = NA
    )
    # Now target the subfields
    sfields <- xml_find_all(j, ".//subfield")
    d3 <- rbind_pages(Map(function(k){
      data.frame(
        node_name = xml_name(k),
        .attrs_targeted(k),
        value = xml_text(k)
      )
    }, sfields))
    rbind_pages(list(d2, d3))
  })) %>% mutate(record_id = c_vals[[1]], edit_time = c_vals[[2]]) # add the records back in

  rbind_pages(list(d, d_loop))
}))

> head(results)
     node_name record_id        edit_time attrs variable            value
1 controlfield         1 20180109004353.0   tag      001                1
2 controlfield         1 20180109004353.0   tag      005 20180109004353.0
3    datafield         1 20180109004353.0   tag      020             <NA>
4     subfield         1 20180109004353.0  code        a    9780821812419
5     subfield         1 20180109004353.0  code        b            print
6    datafield         1 20180109004353.0   tag      020             <NA>

To peek in at the data:

 results %>% filter(attrs == "code") %>% data.table::dcast(record_id ~ variable)
  record_id 0 1 2  9  a b c h m  o p  s t u v w x y z
1         1 0 1 2 18 17 2 3 1 1 14 2 13 1 1 1 1 1 3 1
2        17 0 0 1  1 10 0 2 0 6  6 1  1 2 4 2 3 1 3 2
3        18 1 0 0  1 11 0 2 0 0  1 1  5 0 7 0 6 1 6 5

Upvotes: 1

Related Questions