Reputation: 2890
I have a list of named vectors. I am trying to sum their values. But some of the names within a vector have reversed equivalents. For example, if I have some data that looks like this:
myList <- list(`1` = c('x1:x2' = 2, 'x2:x1' = 1, 'x3:x4' = 1),
`2` = c('x1:x2' = 3, 'x6:x1' = 2, 'x1:x1' = 1, 'x4:x3' = 1),
`3` = c('x3:x4' = 2, 'x1:x2' = 1, 'x4:x3' = 4),
`4` = c('x5:x2' = 1, 'x2:x5' = 1)
)
> myList
$`1`
x1:x2 x2:x1 x3:x4
2 1 1
$`2`
x1:x2 x6:x1 x1:x1 x4:x3
3 2 1 1
$`3`
x3:x4 x1:x2 x4:x3
2 1 4
$`4`
x5:x2 x2:x5
1 1
Here, we can see that in myList[[1]]
we have x1:x2 = 2
and x2:x1 = 1
. As these are the reverse of each other, they are equivalent, so, essentially, x1:x2 = 3
.
I am trying to sum the values for each named element (including the reverse) over each list element.
My desired output would look something like this:
var count listNo
1 x1:x2 3 1
2 x3:x4 1 1
3 x1:x2 3 2
4 x6:x1 2 2
5 x1:x1 1 2
6 x4:x3 1 2
7 x3:x4 6 3
8 x1:x2 1 3
9 x5:x2 2 4
Upvotes: 24
Views: 1288
Reputation: 7534
Base R. Similar but different.
l = unlist(myList)
l = data.frame(strsplit(names(l), "\\.") |>
do.call(what="rbind"), unname(l)) |>
transform(X2=vapply(strsplit(X2, "\\:"), \(i)
paste(sort(i), collapse=":"), character(1))) |>
setNames(c("listNo", "var", "count")) |>
aggregate(count~var+listNo, x=_, sum)
> l
var listNo count
1 x1:x2 1 3
2 x3:x4 1 1
3 x1:x1 2 1
4 x1:x2 2 3
5 x1:x6 2 2
6 x3:x4 2 1
7 x1:x2 3 1
8 x3:x4 3 6
9 x2:x5 4 2
Upvotes: 2
Reputation: 8506
Just for fun, here is a data.table
solution. Would be interesting to see how well the different approaches scale with larger lists.
library(data.table)
rowSort <- \(a) matrix(a[order(row(a), a)], nrow=dim(a)[1], byrow=TRUE, dimnames=dimnames(a))
res <- rbindlist(lapply(myList, \(count) as.data.table(count, keep.rownames="var")), idcol="listNo")
res[, c("p1", "p2"):= tstrsplit(var, ":")]
res[, var:=apply(rowSort(as.matrix(res[,.(p1,p2)])), 1, paste, collapse=":")][,
.(count=sum(count)), .(listNo, var)]
#> listNo var count
#> 1: 1 x1:x2 3
#> 2: 1 x3:x4 1
#> 3: 2 x1:x2 3
#> 4: 2 x1:x6 2
#> 5: 2 x1:x1 1
#> 6: 2 x3:x4 1
#> 7: 3 x3:x4 6
#> 8: 3 x1:x2 1
#> 9: 4 x2:x5 2
For the provided (small) example list, this modification seems to be slightly faster (and narrowly beats the others on my machine):
tsrt <- \(x) vapply(strsplit(x, ":"), \(y) paste0(sort(y), collapse=":"), FUN.VALUE = NA_character_)
res <- rbindlist(lapply(myList, \(count) as.data.table(count, keep.rownames="var")), idcol="listNo")
res[, var:= tsrt(var)][, .(count=sum(count)), .(listNo, var)]
Upvotes: 1
Reputation: 19107
Here is a base R approach mixed with dplyr::bind_rows()
:
tmp <- lapply(1:length(myList), function(i) {
tapply(setNames(myList[[i]],
sapply(strsplit(names(myList[[i]]), ":"),
function(x) paste0(sort(x), collapse = ":"))),
sapply(strsplit(names(myList[[i]]), ":"),
function(x) paste0(sort(x), collapse = ":")), sum)
})
bind_rows(tmp, .id = "listNo") |>
pivot_longer(!listNo, names_to = "var", values_to = "count", values_drop_na = T)
# A tibble: 9 x 3
listNo var count
<chr> <chr> <dbl>
1 1 x1:x2 3
2 1 x3:x4 1
3 2 x1:x2 3
4 2 x3:x4 1
5 2 x1:x1 1
6 2 x1:x6 2
7 3 x1:x2 1
8 3 x3:x4 6
9 4 x2:x5 2
Out of curiosity, I've run microbenchmark
on existing answers, and it seems like @ThomasIsCoding's solution has beaten @AllanCameron to be the best in terms of time:
microbenchmark::microbenchmark(
Allan = {
`row.names<-`(do.call(rbind, Map(function(vec, name) {
x <- names(vec)
l <- sapply(strsplit(x, ":"), function(y) {
paste0("x", sort(as.numeric(sub("\\D", "", y))), collapse = ":")
})
df <- setNames(as.data.frame(table(rep(l, vec))), c("var", "count"))
df$listNo <- name
df
}, vec = myList, name = names(myList))), NULL)
},
benson23 = {
tmp <- lapply(1:length(myList), function(i) {
tapply(setNames(myList[[i]],
sapply(strsplit(names(myList[[i]]), ":"),
function(x) paste0(sort(x), collapse = ":"))),
sapply(strsplit(names(myList[[i]]), ":"),
function(x) paste0(sort(x), collapse = ":")), sum)
})
bind_rows(tmp, .id = "listNo") |>
pivot_longer(!listNo, names_to = "var", values_to = "count", values_drop_na = T)
},
tmfmnk = {
map_dfr(myList, enframe, .id = "listNo") %>%
mutate(var = map_chr(str_split(name, ":"), ~ str_c(sort(.), collapse = ":"))) %>%
group_by(listNo, var) %>%
summarise(count = sum(value))
},
zephryl = {
tibble(count = myList, listNo = names(myList)) %>%
unnest_longer(count, indices_to = "var") %>%
mutate(
var = str_extract_all(var, "\\d+"),
var = map_chr(var, ~ str_glue("x{sort(.x)[[1]]}:x{sort(.x)[[2]]}"))
) %>%
group_by(listNo, var) %>%
summarize(count = sum(count), .groups = "drop")
},
PaulS = {
map_dfr(myList, identity, .id = "listNo") %>%
pivot_longer(cols = -listNo, values_drop_na = T) %>%
rowwise %>%
mutate(name = str_split(name, ":", simplify = T) %>% sort %>%
str_c(collapse = ":")) %>%
group_by(name, listNo) %>%
summarise(count = sum(value), .groups = "drop")
},
TIC1 = {
aggregate(
count ~ .,
transform(
cbind(
setNames(do.call(rbind, Map(stack, myList)), c("count", "var")),
listNo = rep(seq_along(myList), lengths(myList))
),
var = sapply(
strsplit(as.character(var), ":"),
function(x) paste0(sort(x), collapse = ":")
)
),
sum
)
},
TIC2 = {
aggregate(
count ~ .,
cbind(
var = unlist(sapply(
myList,
function(x) {
sapply(
strsplit(names(x), ":"),
function(v) paste0(sort(v), collapse = ":")
)
}
)),
setNames(stack(myList), c("count", "listNo"))
),
sum
)
},
Maël = {
myList %>%
imap(~ .x %>%
enframe() %>%
separate(name, into = c("c1", "c2")) %>%
graph.data.frame(., directed = F) %>%
get.data.frame() %>%
group_by(from, to) %>%
summarise(count = sum(value)) %>%
unite(c("from","to"), col = "var", sep = ":") %>%
mutate(listNo = .y)) %>%
bind_rows()
})
Unit: milliseconds
expr min lq mean median uq max neval cld
Allan 2.1327 2.25920 2.475978 2.33445 2.45270 12.3697 100 a
benson23 3.5083 3.80855 4.150929 4.03700 4.27685 13.3313 100 a
tmfmnk 5.4928 5.88520 6.324940 6.24190 6.66975 8.1777 100 ab
zephryl 10.1629 10.89110 14.813878 11.58475 12.14085 221.0931 100 c
PaulS 7.7565 8.44360 11.402325 9.10860 9.47480 124.1965 100 bc
TIC1 3.5233 3.88805 8.240207 4.06640 4.26765 202.9082 100 a c
TIC2 1.8722 2.03240 2.247993 2.13230 2.24045 10.7320 100 a
Maël 35.3066 39.52920 44.456091 40.96870 42.39480 170.8322 100 d
Upvotes: 11
Reputation: 52089
Using the igraph
library, you can create an undirected graph and sum the values. Even though a bit longer, this solution uses functions that I think are easier to understand.
library(tidyverse)
library(igraph)
myList %>%
imap(~ .x %>%
enframe() %>%
separate(name, into = c("c1", "c2")) %>%
graph.data.frame(., directed = F) %>%
get.data.frame() %>%
group_by(from, to) %>%
summarise(count = sum(value)) %>%
unite(c("from","to"), col = "var", sep = ":") %>%
mutate(listNo = .y)) %>%
bind_rows()
output
# A tibble: 9 x 3
var count listNo
<chr> <dbl> <chr>
1 x1:x2 3 1
2 x3:x4 1 1
3 x1:x1 1 2
4 x1:x2 3 2
5 x1:x6 2 2
6 x4:x3 1 2
7 x1:x2 1 3
8 x3:x4 6 3
9 x5:x2 2 4
Upvotes: 1
Reputation: 101848
Two base R options using aggregate
+ stack
+ strsplit
TIC1 <- function() {
aggregate(
count ~ .,
transform(
cbind(
setNames(do.call(rbind, Map(stack, myList)), c("count", "var")),
listNo = rep(seq_along(myList), lengths(myList))
),
var = sapply(
strsplit(as.character(var), ":"),
function(x) paste0(sort(x), collapse = ":")
)
),
sum
)
}
or
TIC2 <- function() {
aggregate(
count ~ .,
cbind(
var = unlist(sapply(
myList,
function(x) {
sapply(
strsplit(names(x), ":"),
function(v) paste0(sort(v), collapse = ":")
)
}
)),
setNames(stack(myList), c("count", "listNo"))
),
sum
)
}
give
> TIC1()
var listNo count
1 x1:x2 1 3
2 x3:x4 1 1
3 x1:x1 2 1
4 x1:x2 2 3
5 x1:x6 2 2
6 x3:x4 2 1
7 x1:x2 3 1
8 x3:x4 3 6
9 x2:x5 4 2
> TIC2()
var listNo count
1 x1:x2 1 3
2 x3:x4 1 1
3 x1:x1 2 1
4 x1:x2 2 3
5 x1:x6 2 2
6 x3:x4 2 1
7 x1:x2 3 1
8 x3:x4 3 6
9 x2:x5 4 2
microbenchmark(
TIC1(),
TIC2(),
Allan()
)
shows
Unit: milliseconds
expr min lq mean median uq max neval
TIC1() 4.2614 4.43265 4.902491 4.68145 4.89585 13.2152 100
TIC2() 2.2116 2.29665 2.707671 2.51175 2.63690 10.3980 100
Allan() 2.4817 2.59040 3.006702 2.71535 2.91005 16.6410 100
Upvotes: 4
Reputation: 25333
Another possible solution, tidyverse
-based:
library(tidyverse)
map_dfr(myList, identity, .id = "listNo") %>%
pivot_longer(cols = -listNo, values_drop_na = T) %>%
rowwise %>%
mutate(name = str_split(name, ":", simplify = T) %>% sort %>%
str_c(collapse = ":")) %>%
group_by(name, listNo) %>%
summarise(count = sum(value), .groups = "drop")
#> # A tibble: 9 × 3
#> name listNo count
#> <chr> <chr> <dbl>
#> 1 x1:x1 2 1
#> 2 x1:x2 1 3
#> 3 x1:x2 2 3
#> 4 x1:x2 3 1
#> 5 x1:x6 2 2
#> 6 x2:x5 4 2
#> 7 x3:x4 1 1
#> 8 x3:x4 2 1
#> 9 x3:x4 3 6
Upvotes: 8
Reputation: 39868
Another tidyverse
option could be:
map_dfr(myList, enframe, .id = "listNo") %>%
mutate(var = map_chr(str_split(name, ":"), ~ str_c(sort(.), collapse = ":"))) %>%
group_by(listNo, var) %>%
summarise(count = sum(value))
listNo var count
<chr> <chr> <dbl>
1 1 x1:x2 3
2 1 x3:x4 1
3 2 x1:x1 1
4 2 x1:x2 3
5 2 x1:x6 2
6 2 x3:x4 1
7 3 x1:x2 1
8 3 x3:x4 6
9 4 x2:x5 2
Upvotes: 11
Reputation: 17184
A {tidyverse} solution:
library(tidyverse)
tibble(count = myList, listNo = names(myList)) %>%
unnest_longer(count, indices_to = "var") %>%
mutate(
var = str_extract_all(var, "\\d+"),
var = map_chr(var, ~ str_glue("x{sort(.x)[[1]]}:x{sort(.x)[[2]]}"))
) %>%
group_by(listNo, var) %>%
summarize(count = sum(count), .groups = "drop")
# # A tibble: 9 x 3
# listNo var count
# <chr> <chr> <dbl>
# 1 1 x1:x2 3
# 2 1 x3:x4 1
# 3 2 x1:x1 1
# 4 2 x1:x2 3
# 5 2 x1:x6 2
# 6 2 x3:x4 1
# 7 3 x1:x2 1
# 8 3 x3:x4 6
# 9 4 x2:x5 2
Upvotes: 8
Reputation: 174008
This is tricky. I'd be interested to see a more elegant solution
`row.names<-`(do.call(rbind, Map(function(vec, name) {
x <- names(vec)
l <- sapply(strsplit(x, ":"), function(y) {
paste0("x", sort(as.numeric(sub("\\D", "", y))), collapse = ":")
})
df <- setNames(as.data.frame(table(rep(l, vec))), c("var", "count"))
df$listNo <- name
df
}, vec = myList, name = names(myList))), NULL)
#> var count listNo
#> 1 x1:x2 3 1
#> 2 x3:x4 1 1
#> 3 x1:x1 1 2
#> 4 x1:x2 3 2
#> 5 x1:x6 2 2
#> 6 x3:x4 1 2
#> 7 x1:x2 1 3
#> 8 x3:x4 6 3
#> 9 x2:x5 2 4
Created on 2022-03-06 by the reprex package (v2.0.1)
Upvotes: 12