Reputation: 3650
I have a large XML document with PropertyItemData nodes that can contain an ObjectList node. I want to get a list/vector of the ObjectList sub-node count for each PropertyItemData.
Generation of example data:
# Example data: each <PropertyItemData> holds 1, 2 and 0 ObjectData
# sub-nodes respectively, so the expected counts repeat as 1, 2, 0.
doc <-
'
<a>
<PropertyItemData>
<ObjectList>
<ObjectData><z>1</z></ObjectData>
</ObjectList>
</PropertyItemData>
<PropertyItemData>
<ObjectList>
<ObjectData><z>1</z></ObjectData>
<ObjectData><z>1</z></ObjectData>
</ObjectList>
</PropertyItemData>
<PropertyItemData>
</PropertyItemData>
</a>
'
n <- 300 * 1000
# strrep() repeats the fragment in C code; this replaces
# paste(lapply(1:n, function(x) doc), collapse = ''), which materialised
# a 300k-element list of identical strings just to concatenate them.
doc2 <- strrep(doc, n)
# Wrap everything in a single root node so the document is valid XML.
doc2 <- sprintf('<b>%s</b>', doc2)
Current approach:
# Parse once with the XML package, then count the ObjectList sub-nodes
# for every PropertyItemData node.
library(XML)
xx <- xmlParse(doc2)
b <- getNodeSet(xx, "//PropertyItemData") # get all PropertyItemData
# NOTE(review): sapply() returns a list here because per-node results
# have different lengths (1, 2 and empty) -- hence the fix-ups below.
s2 <- sapply(b, xpathSApply, ".//ObjectList", xmlSize) # for each count ObjectList sub-nodes
s2[lengths(s2) == 0L] <- 0L # if no subnodes = 0
s2 <- unlist(s2)
head(s2)
# [1] 1 2 0 1 2 0
Is there a faster way to obtain the desired result? Can this be done with XPath alone?
Upvotes: 4
Views: 266
Reputation: 3650
ne2_sol seems even faster for this specific data:
system.time(v1 <- old_sol()) # 125.35
system.time(v3 <- new_sol()) # 28.91
system.time(v2 <- ne2_sol()) # 18.98
where:
#' Count ObjectList sub-nodes per PropertyItemData with the XML package.
#'
#' @param xml_text XML source string to parse. Defaults to the global
#'   `doc2` built above, so the original zero-argument call still works.
#' @return Integer vector with one count per PropertyItemData node.
old_sol <- function(xml_text = doc2) {
  xx <- xmlParse(xml_text)
  b <- getNodeSet(xx, "//PropertyItemData")
  # sapply() yields a list here because per-node results differ in
  # length; empty elements mark nodes with no ObjectList at all.
  test <- sapply(b, xpathSApply, ".//ObjectList", xmlSize)
  test[lengths(test) == 0L] <- 0L
  # Explicit (visible) return; the original ended on an assignment,
  # which returned the value invisibly.
  unlist(test)
}
#' Count all children of each PropertyItemData node using xml2.
#'
#' NOTE(review): this counts grandchildren of PropertyItemData, which
#' matches the OP's ObjectList counts only for this data's shape --
#' verify against `s2` on real data.
#'
#' @param xml_text XML source string to parse. Defaults to the global
#'   `doc2`, so the original zero-argument call still works.
#' @return Counts, one element per PropertyItemData node.
new_sol <- function(xml_text = doc2) {
  xx2 <- read_xml(xml_text)
  b2 <- xml_find_all(xx2, "//PropertyItemData")
  # sapply() is kept deliberately: its simplification rules reproduce
  # the original return shape exactly.
  sapply(b2, function(y) xml_length(xml_children(y)))
}
#' Count ObjectData descendants per PropertyItemData node using xml2.
#'
#' @param xml_text XML source string to parse. Defaults to the global
#'   `doc2`, so the original zero-argument call still works.
#' @return Integer vector of descendant counts, one per node.
ne2_sol <- function(xml_text = doc2) {
  xx2 <- read_xml(xml_text)
  b2 <- xml_find_all(xx2, "//PropertyItemData")
  # flatten = FALSE keeps one nodeset per PropertyItemData, so lengths()
  # gives per-node counts. Spelled out FALSE: `F` can be reassigned.
  s3 <- xml_find_all(b2, './/ObjectData', flatten = FALSE)
  # Explicit (visible) return; the original ended on an assignment.
  lengths(s3)
}
Upvotes: 0
Reputation: 5958
This alternative delivers a time improvement of over 80% with your data.
library(xml2)
n <- 300 * 1000
# strrep() is the idiomatic (and much cheaper) way to repeat a string;
# it replaces paste(lapply(1:n, function(x) doc), collapse = '').
doc2 <- strrep(doc, n)
doc2 <- sprintf('<b>%s</b>', doc2)
xx2 <- read_xml(doc2)
b2 <- xml_find_all(xx2, "//PropertyItemData")
# Count the children of each PropertyItemData's children (i.e. the
# ObjectList contents); nodes without an ObjectList contribute 0.
result <- sapply(b2, function(y) xml_length(xml_children(y)))
identical(result, s2)
# [1] TRUE
Benchmarking
#' OP's original XML-package approach, wrapped for benchmarking.
#'
#' @param xml_text XML source string to parse. Defaults to the global
#'   `doc2`, so the original zero-argument call still works.
#' @return Integer vector of ObjectList sub-node counts per node.
original_sol <- function(xml_text = doc2) {
  xx <- xmlParse(xml_text)
  b <- getNodeSet(xx, "//PropertyItemData")
  # List result (ragged lengths); empty elements become 0 below.
  s2 <- sapply(b, xpathSApply, ".//ObjectList", xmlSize)
  s2[lengths(s2) == 0L] <- 0L
  # Explicit (visible) return; the original ended on an assignment.
  unlist(s2)
}
#' xml2-based rewrite, wrapped for benchmarking.
#'
#' @param xml_text XML source string to parse. Defaults to the global
#'   `doc2`, so the original zero-argument call still works.
#' @return Per-node child counts of each PropertyItemData's children.
new_sol <- function(xml_text = doc2) {
  xx2 <- read_xml(xml_text)
  b2 <- xml_find_all(xx2, "//PropertyItemData")
  # sapply() kept deliberately to reproduce the original return shape.
  sapply(b2, function(y) xml_length(xml_children(y)))
}
# Time both strategies once each; with data this large a single
# iteration is representative enough.
library(microbenchmark)
microbenchmark(
  original_solution = original_sol(),
  new_solution = new_sol(),
  times = 1
)
Unit: seconds
expr min lq mean median uq max neval
original_solution 120.47773 120.47773 120.47773 120.47773 120.47773 120.47773 1
new_solution 25.35973 25.35973 25.35973 25.35973 25.35973 25.35973 1
EDIT: if you need methods more specific than counting all children of the PropertyItemData
nodes, here are two other approaches: counting the children of the ObjectList
nodes (the OP's original algorithm), and counting the ObjectData
nodes directly.
# same method as OP's: length of children of ObjectList nodes -------------
# For each PropertyItemData, count the children of its ObjectList
# node(s); sum() turns the no-ObjectList case into 0.
objectlist <- lapply(b2, function(x) xml_find_all(x, "ObjectList"))
result_same_algorithm <- sapply(objectlist, function(x) sum(xml_length(xml_children(x))))
identical(result_same_algorithm, s2)
# third possibility: length of Object data children nodes --------------------------
# NOTE(review): xml_length() of each ObjectData gives its own child
# count (1 here: the <z> node), so the sum equals the ObjectData count
# only while every ObjectData has exactly one child -- verify on real data.
res_objectdata_children <- sapply(b2, function(x) sum(xml_length(xml_find_all(x, ".//ObjectData"))))
identical(res_objectdata_children, s2)
Upvotes: 4