Reputation: 327
Is there a way to parse strings representing groups into a hierarchical structure in R?
Say I have groups structured as follows:
"1", "1.1", "1.1.1", "1.1.1.1", "1.1.2", "1.1.3", "1.1.3.1", "1.1.3.2", "1.1.3.3", "1.2",
"1.2.1", "1.2.1.1", "1.2.1.2", "1.2.1.2.1", "1.2.2", "1.2.2.1", "1.2.2.2"
We can naturally see that the 'highest' level is '1', followed by the two main splits '1.1' and '1.2', and so on.
Can this be parsed in R into a hierarchical structure, and the 'levels' easily retrieved (e.g. as above - if I want the second-highest level then R returns '1.1' and '1.2')
Upvotes: 2
Views: 613
Reputation: 270010
We can convert x
shown in the Note at the end to an igraph g
and then igraph provides many operations on it.
library(igraph)
DF <- data.frame(parent = sub("\\.\\d+$", "", x), name = x, stringsAsFactors = FALSE)
DF <- subset(DF, parent != name) # remove loop in root to itself
g <- graph.data.frame(DF)
node <- "1.1"
setdiff(names(subcomponent(g, node, mode = "in")), node)
## [1] "1"
node <- "1.2"
setdiff(names(neighborhood(g, nodes = node, mode = "out")[[1]]), node)
## [1] "1.2.1" "1.2.2"
node <- "1.2"
setdiff(names(subcomponent(g, node, mode = "out")), node)
## [1] "1.2.1" "1.2.2" "1.2.1.2" "1.2.1.1" "1.2.2.1" "1.2.2.2"
## [7] "1.2.1.2.1"
dfs(g, root = "1", dist = TRUE)$dist
## 1 1.1 1.1.1 1.1.3 1.2 1.2.1 1.2.1.2 1.2.2
## 0 1 2 2 1 2 3 2
## 1.1.1.1 1.1.2 1.1.3.1 1.1.3.2 1.1.3.3 1.2.1.1 1.2.1.2.1 1.2.2.1
## 3 2 3 3 3 3 4 3
## 1.2.2.2
## 3
names(which(degree(g, mode = "out") == 0))
## [1] "1.1.1.1" "1.1.2" "1.1.3.1" "1.1.3.2" "1.1.3.3" "1.2.1.1"
## [7] "1.2.1.2.1" "1.2.2.1" "1.2.2.2"
plot(g, layout = layout_as_tree(g))
(continued after plot)
We can also use package rviewgraph:
library(rviewgraph)
rViewGraph(g)
Using x
from the Note at the end, we can apply ordinary string operations to it directly so it might not be necessary to transform the data at all.
parent <- "1"
pat <- sprintf("^%s\\.\\d+$", parent)
grep(pat, x, value = TRUE)
## [1] "1.1" "1.2"
depth <- 2
pat2 <- sprintf("^\\d+(\\.\\d+){%d}$", depth-1)
grep(pat2, x, value = TRUE)
[1] "1.1" "1.2"
or
depth <- 2
pat2 <- sprintf("^%s$", paste(rep("\\d+", depth), collapse = "\\."))
grep(pat2, x, value = TRUE)
## [1] "1.1" "1.2"
Each component lists all the nodes in of that depth.
split(x, nchar(gsub("\\d", "", x)) + 1)
## $`1`
## [1] "1"
##
## $`2`
## [1] "1.1" "1.2"
##
## $`3`
## [1] "1.1.1" "1.1.2" "1.1.3" "1.2.1" "1.2.2"
##
## $`4`
## [1] "1.1.1.1" "1.1.3.1" "1.1.3.2" "1.1.3.3" "1.2.1.1" "1.2.1.2" "1.2.2.1"
## [8] "1.2.2.2"
##
## $`5`
## [1] "1.2.1.2.1"
We can omit the line marked ## if it is ok to have the root node be its own parent.
node <- "1.1"
parent <- sub("\\.\\d+$", "", node)
parent <- setdiff(parent, node) ##
parent
## [1] "1"
Get children of parent and then remove input node from that result.
node <- "1.1"
parent <- sub("\\.\\d+$", "", node)
pat <- sprintf("^%s\\.\\d+$", parent)
setdiff(grep(pat, x, value = TRUE), node)
## [1] "1.2"
node <- "1.2"
x[startsWith(x, paste0(node, "."))]
## [1] "1.2.1" "1.2.1.1" "1.2.1.2" "1.2.1.2.1" "1.2.2" "1.2.2.1"
## [7] "1.2.2.2"
node <- "1.2.1"
x[startsWith(node, paste0(x, "."))]
## [1] "1" "1.2"
leaf <- x[sapply(x, function(st) sum(startsWith(x, st))) == 1]
leaf
## [1] "1.1.1.1" "1.1.2" "1.1.3.1" "1.1.3.2" "1.1.3.3" "1.2.1.1"
## [7] "1.2.1.2.1" "1.2.2.1" "1.2.2.2"
setdiff(x, leaf)
## [1] "1" "1.1" "1.1.1" "1.1.3" "1.2" "1.2.1" "1.2.1.2"
## [8] "1.2.2"
x <- c("1", "1.1", "1.1.1", "1.1.1.1", "1.1.2", "1.1.3", "1.1.3.1",
"1.1.3.2", "1.1.3.3", "1.2",
"1.2.1", "1.2.1.1", "1.2.1.2", "1.2.1.2.1", "1.2.2", "1.2.2.1", "1.2.2.2")
Upvotes: 2
Reputation: 24848
One option might be to use str_split
to identify the depth level.
library(stringr)
library(dplyr)
library(purrr)
strings <- c("1", "1.1", "1.1.1", "1.1.1.1", "1.1.2", "1.1.3", "1.1.3.1", "1.1.3.2", "1.1.3.3", "1.2","1.2.1", "1.2.1.1", "1.2.1.2", "1.2.1.2.1", "1.2.2", "1.2.2.1", "1.2.2.2")
strings %>%
strsplit("\\.") %>%
map(~set_names(.x,paste0("DepthLevel",seq_along(.x)))) %>%
bind_rows
## A tibble: 17 x 5
# DepthLevel1 DepthLevel2 DepthLevel3 DepthLevel4 DepthLevel5
# <chr> <chr> <chr> <chr> <chr>
# 1 1 NA NA NA NA
# 2 1 1 NA NA NA
# 3 1 1 1 NA NA
# 4 1 1 1 1 NA
# 5 1 1 2 NA NA
# 6 1 1 3 NA NA
# 7 1 1 3 1 NA
# 8 1 1 3 2 NA
# 9 1 1 3 3 NA
#10 1 2 NA NA NA
#11 1 2 1 NA NA
#12 1 2 1 1 NA
#13 1 2 1 2 NA
#14 1 2 1 2 1
#15 1 2 2 NA NA
#16 1 2 2 1 NA
#17 1 2 2 2 NA
Upvotes: 1