Reputation: 1007
Here's a simple taxonomy (labels and IDs):
test_data <- data.frame(
cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
cat_h1 = c(rep("Animals", 5), rep("Plants", 3)),
cat_h2 = c(rep("Mammals", 3), rep("Birds", 2), c("Wheat", "Grass", "Other")),
cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other", rep(NA, 3)),
cat_h4 = c("Big", "Little", rep(NA, 6)))
The parsed structure should match the following:
list(
Animals = list(Mammals = list(Dogs = list(Big = 661, Little = 197), Other = 228),
Birds = list(Hawks = 650, Other = 126)),
Plants = list(Wheat = 912, Grass = 949, Other = 428))
Upvotes: 3
Views: 489
Reputation: 4024
I would avoid list structures in preference to tidy data. Here is a way to reduce the redundancy in the data.
library(dplyr)
h1_h2 =
test_data %>%
select(cat_h1, cat_h2) %>%
distinct %>%
filter(cat_h2 %>% is.na %>% `!`)
h2_h3 =
test_data %>%
select(cat_h2, cat_h3) %>%
distinct %>%
filter(cat_h3 %>% is.na %>% `!`)
h3_h4 =
test_data %>%
select(cat_h3, cat_h4) %>%
distinct %>%
filter(cat_h4 %>% is.na %>% `!`)
The original can be easily reconstituted:
h1_h2 %>%
left_join(h2_h3) %>%
left_join(h3_h4)
Edit: And here's a way to automate the whole process.
library(dplyr)
library(lazyeval)
adjacency = function(data) {
adjacency_table = function(data, larger_name, smaller_name)
lazy(data %>%
select(larger_name, smaller_name) %>%
distinct %>%
filter(smaller_name %>% is.na %>% `!`) ) %>%
interp(larger_name = larger_name %>% as.name,
smaller_name = smaller_name %>% as.name) %>%
lazy_eval %>%
setNames(c("larger", "smaller"))
data_frame(smaller_name = data %>% names) %>%
mutate(larger_name = smaller_name %>% lag) %>%
slice(-1) %>%
group_by(larger_name, smaller_name) %>%
do(adjacency_table(data, .$larger_name, .$smaller_name) )
}
result =
test_data %>%
select(-cat_id) %>%
adjacency
Upvotes: 1
Reputation: 9696
If you are OK with the order changing slightly, this is a recursive solution that processes by column:
f <- function(x, d=cbind(x,NA)) {
c(
# call f by branch
if(ncol(d) > 3) local({
x <- d[!is.na(d[[3]]),]
by( x[-2], droplevels(x[2]), f, x=NA, simplify=FALSE)
}),
# leaf nodes
setNames(as.list(d[[1]]), d[[2]])[is.na(d[[3]])]
)
}
which will give this:
> str(f(test_data))
List of 2
$ Animals:List of 2
..$ Birds :List of 2
.. ..$ Hawks: num 650
.. ..$ Other: num 126
..$ Mammals:List of 2
.. ..$ Dogs :List of 2
.. .. ..$ Big : num 661
.. .. ..$ Little: num 197
.. ..$ Other: num 228
$ Plants :List of 3
..$ Wheat: num 912
..$ Grass: num 949
..$ Other: num 428
Upvotes: 7
Reputation: 226881
Maybe not the most efficient, but not too hard:
Create data:
test_data <- data.frame(
cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
cat_h1 = c(rep("Animals", 5), rep("Plants", 3)),
cat_h2 = c(rep("Mammals", 3), rep("Birds", 2), c("Wheat", "Grass", "Other")),
cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other", rep(NA, 3)),
cat_h4 = c("Big", "Little", rep(NA, 6)))
Loop through the data frame and build the list/tree:
tax <- list() ## initialize
for (i in 1:nrow(test_data)) {
## convert data.frame row to character vector
taxdat <- sapply(test_data[i,-1],as.character)
taxstr <- character(0) ## initialize taxon string
ntax <- length(na.omit(taxdat))
for (j in 1:ntax) {
taxstr <- c(taxstr,taxdat[j]) ## build string
if (is.null(tax[[taxstr]])) {
tax[[taxstr]] <- list() ## initialize if necessary
}
}
tax[[taxstr]] <- test_data$cat_id[i] ## assign value to tip
}
Compare result to desired:
res <- list(
Animals = list(Mammals = list(Dogs = list(Big = 661, Little = 197),
Other = 228),
Birds = list(Hawks = 650, Other = 126)),
Plants = list(Wheat = 912, Grass = 949, Other = 428))
all.equal(res,tax) ## TRUE
Upvotes: 3