Glomek
Glomek

Reputation: 71

Retrieving the parent attribute of every child in XML with xpathSApply

I'm trying to retrieve two vectors of the same length: one with the attributes of the children, and a second with the attributes of the corresponding parents. Example file:

# Sample document held as a string: a <country> root with two <city> elements,
# each containing three <place> children; every <city> and <place> has an id.
countries.xml <- "<country>
              <city id='1'>
                <place id='1.1'> xxx </place>
                <place id='1.2'> xxx </place>
                <place id='1.3'> xxx </place>
              </city>
              <city id='2'>
                <place id='2.1'> xxx </place>
                <place id='2.2'> xxx </place>
                <place id='2.3'> xxx </place>
              </city>
           </country>"

My code so far

library("XML")
# Parse the XML string into an internal document that XPath can query.
doc <- xmlTreeParse(countries.xml, useInternalNodes = TRUE)
# id attribute of every <place> that sits directly under a <city>.
xpathSApply(doc, path = "//city/place/@id")
# Attempt to get the parent's id for every <place>. The parent step selects
# the parent nodes themselves, so each <city> is returned only once.
xpathSApply(doc, path = "//city/place/parent::*/@id")

I was hoping to end up with such vectors (named)

"1.1" "1.2" "1.3" "2.1" "2.2" "2.3"
"1" "1" "1" "2" "2" "2"

but instead the second path produces

"1" "2" 

I could get what I wanted with a loop

library(glue)
# id of every <place>, in document order.
place_id <- unname(xpathSApply(doc, path = "//city/place/@id"))
# Preallocate one slot per place instead of growing the result with c().
city_id <- vector("list", length(place_id))
for (i in seq_along(place_id)) {
  # Quote the id so XPath compares it as a string. Unquoted, '1.1' and '1.10'
  # would compare equal as numbers and non-numeric ids would never match.
  query <- glue("//city/place[@id='{place_id[i]}']/parent::*/@id")
  # Still one full XPath query per place, which is what makes this slow.
  # Assign via list() so an empty result cannot delete the slot.
  city_id[i] <- list(unname(xpathSApply(doc, path = query)))
}
city_id <- unlist(city_id)
city_id
"1" "1" "1" "2" "2" "2"

but it is very inefficient and takes ages with the large XML file I'm dealing with. I'm sure there is a way to get what I need with the right path in xpathSApply, but I couldn't find it, so could someone please enlighten me :)?

UPDATE: @Wietze314's solution works great on my simple example, but I can't adapt it to a more complicated XML file. I did manage to change his code to deal with the example below

# Three-level sample held as a string: <continent> -> <country> -> <city> ->
# <place>, with an id on every country, city and place.
# NOTE(review): the result table shown further down lists city ids 3 and 4 for
# the second country, whereas this sample reuses 1 and 2 - confirm which is
# intended.
countries.xml <- "<continent>
          <country id='c1'>
          <city id='1'>
            <place id='1.1'> xxx </place>
            <place id='1.2'> xxx </place>
            <place id='1.3'> xxx </place>
          </city>
          <city id='2'>
            <place id='2.1'> xxx </place>
            <place id='2.2'> xxx </place>
            <place id='2.3'> xxx </place>
          </city>
       </country>
       <country id='c2'>
          <city id='1'>
            <place id='1.1'> xxx </place>
            <place id='1.2'> xxx </place>
            <place id='1.3'> xxx </place>
          </city>
          <city id='2'>
            <place id='2.1'> xxx </place>
            <place id='2.2'> xxx </place>
            <place id='2.3'> xxx </place>
          </city>
       </country>
        </continent>"

this code

    # Pair every city id with the ids of its places and stack the pairs.
    # First list: one single-row tibble per city (grandchildren of the root).
    # Second list: one tibble per city holding the ids of that city's places.
    # cbind() recycles the single city row across the rows of its places.
    pmap_df(
      list(
        xml_children(cntry) %>%
          map(xml_children) %>%
          map(xml_attr, 'id') %>%
          unlist() %>%
          as.list() %>%
          map(~ as_tibble(.) %>% select(city = value)),
        xml_children(cntry) %>%
          xml_children() %>%
          map(xml_children) %>%
          map(xml_attr, 'id') %>%
          map(~ as_tibble(.) %>% select(place = value))
      ),
      cbind
    )

returns this

    city place
1     1   1.1
2     1   1.2
3     1   1.3
4     2   2.1
5     2   2.2
6     2   2.3
7     3   3.1
8     3   3.2
9     3   3.3
10    4   4.1
11    4   4.2
12    4   4.3

but the same code applied to the file I'm actually interested in fails :( Any suggestions?

# Fetch and parse the nextbike feed with xml2.
# NOTE(review): useInternalNodes is an XML::xmlTreeParse() argument;
# read_xml() presumably ignores it - confirm.
pfile <- read_xml("http://nextbike.net/maps/nextbike-official.xml",
                  useInternalNodes = TRUE)
# Same pairing as for the sample document, keyed on the 'uid' attribute:
# first list = one single-row tibble per grandchild of the root,
# second list = one tibble per grandchild holding the uids of its children.
# cbind() of a 1-row and a 0-row frame raises the error reported below;
# presumably some node has no children - confirm against the feed.
pmap_df(
  list(
    xml_children(pfile) %>%
      map(xml_children) %>%
      map(xml_attr, 'uid') %>%
      unlist() %>%
      as.list() %>%
      map(~ as_tibble(.) %>% select(city = value)),
    xml_children(pfile) %>%
      xml_children() %>%
      map(xml_children) %>%
      map(xml_attr, 'uid') %>%
      map(~ as_tibble(.) %>% select(place = value))
  ),
  cbind
)

Error in data.frame(..., check.names = FALSE) : 
      arguments imply differing number of rows: 1, 0

Upvotes: 0

Views: 255

Answers (1)

Wietze314
Wietze314

Reputation: 6020

A solution with tidyverse and xml2

# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later.
library(xml2)
library(tidyverse)

# Parse the sample string; the root is <country>, its children are <city>.
cntry <- read_xml(countries.xml)

# Pair each city's id with the ids of its places and stack the pairs.
# First list: one single-row tibble per <city>.
# Second list: one tibble per <city> holding the ids of its <place> children.
# cbind() recycles the single city row across the rows of its places.
pmap_df(
  list(
    xml_children(cntry) %>%
      map(xml_attr, 'id') %>%
      # These are the ids of the <city> nodes, so label the column as such.
      map(~ as_tibble(.) %>% select(city = value)),
    xml_children(cntry) %>%
      map(xml_children) %>%
      map(xml_attr, 'id') %>%
      map(~ as_tibble(.) %>% select(place = value))
  ),
  cbind
)

EDIT:

I have tried to get this to work with more than 2 levels, but did not succeed. This is what I have come up with so far:

# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later.
library(xml2)
library(tidyverse)

# Parse the XML string into an xml2 document.
parsedxml <- read_xml(countries.xml)

# Read the 'id' attribute of the given node(s) and wrap each id in its own
# tibble. Returns a list with one tibble per id.
get_ids <- function(xml) {
  ids <- xml_attr(xml, 'id')
  map(ids, function(id) as_tibble(id))
}

# ids collected level by level; each deeper level adds one layer of nesting.
# country: ids of the root's children.
country <- map(xml_children(parsedxml), get_ids)
# city: for every child of the root, the ids of its children.
city <- map(
  xml_children(parsedxml),
  function(country_node) map(xml_children(country_node), get_ids)
)
# place: for every grandchild of the root, the ids of its children.
place <- map(
  xml_children(parsedxml),
  function(country_node) {
    map(
      xml_children(country_node),
      function(city_node) map(xml_children(city_node), get_ids)
    )
  }
)

# Preview for the first city of the first country: stack the three levels,
# then unlist row by row.
apply(
  rbind(country[[1]], rbind(city[[1]][[1]], place[[1]][[1]])),
  1,
  unlist
)

result for one city

      [,1] [,2] [,3] 
value "c1" "1"  "1.1"
value "c1" "1"  "1.2"
value "c1" "1"  "1.3"

this ugly code brings it all together:

# Build the country / city / place matrix for every city of every country and
# stack the pieces. The index ranges come from the data rather than a
# hard-coded 1:2, so any number of countries and cities is handled.
# NOTE(review): assumes every city has at least one place - confirm for real
# data.
do.call(rbind, lapply(seq_along(country), function(x) {
  per_city <- lapply(seq_along(city[[x]]), function(y) {
    rbind(country[[x]], rbind(city[[x]][[y]], place[[x]][[y]])) %>%
      apply(1, unlist)
  })
  # Stack the matrices of all cities belonging to country x.
  do.call(rbind, per_city)
}))

Hopefully someone else has a better solution for this last part.

Upvotes: 1

Related Questions