Electrino
Electrino

Reputation: 2890

Sum named vector values where the names are reversed in R

I have a list of named vectors. I am trying to sum their values. But some of the names within a vector have reversed equivalents. For example, if I have some data that looks like this:

myList <- list(`1` = c('x1:x2' = 2, 'x2:x1' = 1, 'x3:x4' = 1),
               `2` = c('x1:x2' = 3, 'x6:x1' = 2, 'x1:x1' = 1, 'x4:x3' = 1),
               `3` = c('x3:x4' = 2, 'x1:x2' = 1, 'x4:x3' = 4),
               `4` = c('x5:x2' = 1, 'x2:x5' = 1)
               )
> myList
$`1`
x1:x2 x2:x1 x3:x4
    2     1     1

$`2`
x1:x2 x6:x1 x1:x1 x4:x3
    3     2     1     1

$`3`
x3:x4 x1:x2 x4:x3
    2     1     4

$`4`
x5:x2 x2:x5
    1     1

Here, we can see that in myList[[1]] we have x1:x2 = 2 and x2:x1 = 1. As these are the reverse of each other, they are equivalent, so, essentially, x1:x2 = 3.

I am trying to sum the values for each named element (including the reverse) over each list element.

My desired output would look something like this:

    var count listNo
1 x1:x2     3      1
2 x3:x4     1      1
3 x1:x2     3      2
4 x6:x1     2      2
5 x1:x1     1      2
6 x4:x3     1      2
7 x3:x4     6      3
8 x1:x2     1      3
9 x5:x2     2      4

Upvotes: 24

Views: 1288

Answers (9)

Friede
Friede

Reputation: 7534

Base R. Similar but different.

l = unlist(myList)
l = data.frame(strsplit(names(l), "\\.") |> 
                 do.call(what="rbind"), unname(l)) |>
  transform(X2=vapply(strsplit(X2, "\\:"), \(i) 
                      paste(sort(i), collapse=":"), character(1))) |>
  setNames(c("listNo", "var", "count")) |>
  aggregate(count~var+listNo, x=_, sum)
> l
    var listNo count
1 x1:x2      1     3
2 x3:x4      1     1
3 x1:x1      2     1
4 x1:x2      2     3
5 x1:x6      2     2
6 x3:x4      2     1
7 x1:x2      3     1
8 x3:x4      3     6
9 x2:x5      4     2

Upvotes: 2

user12728748
user12728748

Reputation: 8506

Just for fun, here is a data.table solution. Would be interesting to see how well the different approaches scale with larger lists.

library(data.table)
rowSort <- \(a) matrix(a[order(row(a), a)], nrow=dim(a)[1], byrow=TRUE, dimnames=dimnames(a))
res <- rbindlist(lapply(myList, \(count) as.data.table(count, keep.rownames="var")), idcol="listNo")
res[, c("p1", "p2"):= tstrsplit(var, ":")]
res[, var:=apply(rowSort(as.matrix(res[,.(p1,p2)])), 1, paste, collapse=":")][,
    .(count=sum(count)), .(listNo, var)]
#>    listNo   var count
#> 1:      1 x1:x2     3
#> 2:      1 x3:x4     1
#> 3:      2 x1:x2     3
#> 4:      2 x1:x6     2
#> 5:      2 x1:x1     1
#> 6:      2 x3:x4     1
#> 7:      3 x3:x4     6
#> 8:      3 x1:x2     1
#> 9:      4 x2:x5     2

For the provided (small) example list, this modification seems to be slightly faster (and narrowly beats the others on my machine):

tsrt <- \(x) vapply(strsplit(x, ":"), \(y) paste0(sort(y), collapse=":"), FUN.VALUE = NA_character_)
    res <- rbindlist(lapply(myList, \(count) as.data.table(count, keep.rownames="var")), idcol="listNo")
    res[, var:= tsrt(var)][, .(count=sum(count)), .(listNo, var)]

Upvotes: 1

benson23
benson23

Reputation: 19107

Solution

Here is a base R approach mixed with dplyr::bind_rows():

tmp <- lapply(1:length(myList), function(i) {
  tapply(setNames(myList[[i]], 
                  sapply(strsplit(names(myList[[i]]), ":"), 
                         function(x) paste0(sort(x), collapse = ":"))), 
         sapply(strsplit(names(myList[[i]]), ":"), 
                function(x) paste0(sort(x), collapse = ":")), sum)
})

bind_rows(tmp, .id = "listNo") |> 
  pivot_longer(!listNo, names_to = "var", values_to = "count", values_drop_na = T)

# A tibble: 9 x 3
  listNo var   count
  <chr>  <chr> <dbl>
1 1      x1:x2     3
2 1      x3:x4     1
3 2      x1:x2     3
4 2      x3:x4     1
5 2      x1:x1     1
6 2      x1:x6     2
7 3      x1:x2     1
8 3      x3:x4     6
9 4      x2:x5     2

microbenchmark

Out of curiosity, I've run microbenchmark on existing answers, and it seems like @ThomasIsCoding's solution has beaten @AllanCameron to be the best in terms of time:

microbenchmark::microbenchmark(
  Allan = {
    `row.names<-`(do.call(rbind, Map(function(vec, name) {  
      x <- names(vec)
      l <- sapply(strsplit(x, ":"), function(y) {
        paste0("x", sort(as.numeric(sub("\\D", "", y))), collapse = ":")
      })
      df <- setNames(as.data.frame(table(rep(l, vec))), c("var", "count"))
      df$listNo <- name
      df
    }, vec = myList, name = names(myList))), NULL)
    },
  benson23 = {
    tmp <- lapply(1:length(myList), function(i) {
      tapply(setNames(myList[[i]], 
                      sapply(strsplit(names(myList[[i]]), ":"), 
                             function(x) paste0(sort(x), collapse = ":"))), 
             sapply(strsplit(names(myList[[i]]), ":"), 
                    function(x) paste0(sort(x), collapse = ":")), sum)
    })
    
    bind_rows(tmp, .id = "listNo") |> 
      pivot_longer(!listNo, names_to = "var", values_to = "count", values_drop_na = T)
    
  },
  tmfmnk = {
    map_dfr(myList, enframe, .id = "listNo") %>%
      mutate(var = map_chr(str_split(name, ":"), ~ str_c(sort(.), collapse = ":"))) %>%
      group_by(listNo, var) %>%
      summarise(count = sum(value))
  },
  zephryl = {
    tibble(count = myList, listNo = names(myList)) %>%
      unnest_longer(count, indices_to = "var") %>% 
      mutate(
        var = str_extract_all(var, "\\d+"),
        var = map_chr(var, ~ str_glue("x{sort(.x)[[1]]}:x{sort(.x)[[2]]}"))
      ) %>% 
      group_by(listNo, var) %>%
      summarize(count = sum(count), .groups = "drop")
  },
  PaulS = {
    map_dfr(myList, identity, .id = "listNo") %>%
      pivot_longer(cols = -listNo, values_drop_na = T) %>% 
      rowwise %>%
      mutate(name = str_split(name, ":", simplify = T) %>% sort %>% 
               str_c(collapse = ":")) %>% 
      group_by(name, listNo) %>% 
      summarise(count = sum(value), .groups = "drop") 
  },
  TIC1 = {
    aggregate(
      count ~ .,
      transform(
        cbind(
          setNames(do.call(rbind, Map(stack, myList)), c("count", "var")),
          listNo = rep(seq_along(myList), lengths(myList))
        ),
        var = sapply(
          strsplit(as.character(var), ":"),
          function(x) paste0(sort(x), collapse = ":")
        )
      ),
      sum
    )
  },
  TIC2 = {
    aggregate(
      count ~ .,
      cbind(
        var = unlist(sapply(
          myList,
          function(x) {
            sapply(
              strsplit(names(x), ":"),
              function(v) paste0(sort(v), collapse = ":")
            )
          }
        )),
        setNames(stack(myList), c("count", "listNo"))
      ),
      sum
    )
  },
  Maël = {
    myList %>% 
      imap(~ .x %>% 
             enframe() %>% 
             separate(name, into = c("c1", "c2")) %>% 
             graph.data.frame(., directed = F) %>% 
             get.data.frame() %>% 
             group_by(from, to) %>% 
             summarise(count = sum(value)) %>% 
             unite(c("from","to"), col = "var", sep = ":") %>% 
             mutate(listNo = .y)) %>%
      bind_rows()
  })

Unit: milliseconds
     expr     min       lq      mean   median       uq      max neval  cld
    Allan  2.1327  2.25920  2.475978  2.33445  2.45270  12.3697   100 a   
 benson23  3.5083  3.80855  4.150929  4.03700  4.27685  13.3313   100 a   
   tmfmnk  5.4928  5.88520  6.324940  6.24190  6.66975   8.1777   100 ab  
  zephryl 10.1629 10.89110 14.813878 11.58475 12.14085 221.0931   100   c 
    PaulS  7.7565  8.44360 11.402325  9.10860  9.47480 124.1965   100  bc 
     TIC1  3.5233  3.88805  8.240207  4.06640  4.26765 202.9082   100 a c 
     TIC2  1.8722  2.03240  2.247993  2.13230  2.24045  10.7320   100 a   
     Maël 35.3066 39.52920 44.456091 40.96870 42.39480 170.8322   100    d

Upvotes: 11

Ma&#235;l
Ma&#235;l

Reputation: 52089

Using the igraph library, you can create an undirected graph and sum the values. Even though a bit longer, this solution uses functions that I think are easier to understand.

library(tidyverse)
library(igraph)

myList %>% 
  imap(~ .x %>% 
        enframe() %>% 
        separate(name, into = c("c1", "c2")) %>% 
        graph.data.frame(., directed = F) %>% 
        get.data.frame() %>% 
        group_by(from, to) %>% 
        summarise(count = sum(value)) %>% 
        unite(c("from","to"), col = "var", sep = ":") %>% 
        mutate(listNo = .y)) %>%
  bind_rows()

output

# A tibble: 9 x 3
  var   count listNo
  <chr> <dbl> <chr> 
1 x1:x2     3 1     
2 x3:x4     1 1     
3 x1:x1     1 2     
4 x1:x2     3 2     
5 x1:x6     2 2     
6 x4:x3     1 2     
7 x1:x2     1 3     
8 x3:x4     6 3     
9 x5:x2     2 4     

Upvotes: 1

ThomasIsCoding
ThomasIsCoding

Reputation: 101848

Two base R options using aggregate + stack + strsplit

TIC1 <- function() {
  aggregate(
    count ~ .,
    transform(
      cbind(
        setNames(do.call(rbind, Map(stack, myList)), c("count", "var")),
        listNo = rep(seq_along(myList), lengths(myList))
      ),
      var = sapply(
        strsplit(as.character(var), ":"),
        function(x) paste0(sort(x), collapse = ":")
      )
    ),
    sum
  )
}

or

TIC2 <- function() {
  aggregate(
    count ~ .,
    cbind(
      var = unlist(sapply(
        myList,
        function(x) {
          sapply(
            strsplit(names(x), ":"),
            function(v) paste0(sort(v), collapse = ":")
          )
        }
      )),
      setNames(stack(myList), c("count", "listNo"))
    ),
    sum
  )
}

give

> TIC1()
    var listNo count
1 x1:x2      1     3
2 x3:x4      1     1
3 x1:x1      2     1
4 x1:x2      2     3
5 x1:x6      2     2
6 x3:x4      2     1
7 x1:x2      3     1
8 x3:x4      3     6
9 x2:x5      4     2

> TIC2()
    var listNo count
1 x1:x2      1     3
2 x3:x4      1     1
3 x1:x1      2     1
4 x1:x2      2     3
5 x1:x6      2     2
6 x3:x4      2     1
7 x1:x2      3     1
8 x3:x4      3     6
9 x2:x5      4     2

Benchmark

microbenchmark(
  TIC1(),
  TIC2(),
  Allan()
)

shows

Unit: milliseconds
    expr    min      lq     mean  median      uq     max neval
  TIC1() 4.2614 4.43265 4.902491 4.68145 4.89585 13.2152   100
  TIC2() 2.2116 2.29665 2.707671 2.51175 2.63690 10.3980   100
 Allan() 2.4817 2.59040 3.006702 2.71535 2.91005 16.6410   100

Upvotes: 4

PaulS
PaulS

Reputation: 25333

Another possible solution, tidyverse-based:

library(tidyverse)

map_dfr(myList, identity, .id = "listNo") %>%
  pivot_longer(cols = -listNo, values_drop_na = T) %>% 
  rowwise %>%
  mutate(name = str_split(name, ":", simplify = T) %>% sort %>% 
         str_c(collapse = ":")) %>% 
  group_by(name, listNo) %>% 
  summarise(count = sum(value), .groups = "drop") 

#> # A tibble: 9 × 3
#>   name  listNo count
#>   <chr> <chr>  <dbl>
#> 1 x1:x1 2          1
#> 2 x1:x2 1          3
#> 3 x1:x2 2          3
#> 4 x1:x2 3          1
#> 5 x1:x6 2          2
#> 6 x2:x5 4          2
#> 7 x3:x4 1          1
#> 8 x3:x4 2          1
#> 9 x3:x4 3          6

Upvotes: 8

tmfmnk
tmfmnk

Reputation: 39868

Another tidyverse option could be:

map_dfr(myList, enframe, .id = "listNo") %>%
    mutate(var = map_chr(str_split(name, ":"), ~ str_c(sort(.), collapse = ":"))) %>%
    group_by(listNo, var) %>%
    summarise(count = sum(value))

  listNo var   count
  <chr>  <chr> <dbl>
1 1      x1:x2     3
2 1      x3:x4     1
3 2      x1:x1     1
4 2      x1:x2     3
5 2      x1:x6     2
6 2      x3:x4     1
7 3      x1:x2     1
8 3      x3:x4     6
9 4      x2:x5     2

Upvotes: 11

zephryl
zephryl

Reputation: 17184

A {tidyverse} solution:

library(tidyverse)
               
tibble(count = myList, listNo = names(myList)) %>%
  unnest_longer(count, indices_to = "var") %>% 
  mutate(
    var = str_extract_all(var, "\\d+"),
    var = map_chr(var, ~ str_glue("x{sort(.x)[[1]]}:x{sort(.x)[[2]]}"))
  ) %>% 
  group_by(listNo, var) %>%
  summarize(count = sum(count), .groups = "drop")

# # A tibble: 9 x 3
#   listNo var   count
#   <chr>  <chr> <dbl>
# 1 1      x1:x2     3
# 2 1      x3:x4     1
# 3 2      x1:x1     1
# 4 2      x1:x2     3
# 5 2      x1:x6     2
# 6 2      x3:x4     1
# 7 3      x1:x2     1
# 8 3      x3:x4     6
# 9 4      x2:x5     2

Upvotes: 8

Allan Cameron
Allan Cameron

Reputation: 174008

This is tricky. I'd be interested to see a more elegant solution

`row.names<-`(do.call(rbind, Map(function(vec, name) {  
    x <- names(vec)
    l <- sapply(strsplit(x, ":"), function(y) {
      paste0("x", sort(as.numeric(sub("\\D", "", y))), collapse = ":")
      })
    df <- setNames(as.data.frame(table(rep(l, vec))), c("var", "count"))
    df$listNo <- name
    df
  }, vec = myList, name = names(myList))), NULL)

#>     var count listNo
#> 1 x1:x2     3      1
#> 2 x3:x4     1      1
#> 3 x1:x1     1      2
#> 4 x1:x2     3      2
#> 5 x1:x6     2      2
#> 6 x3:x4     1      2
#> 7 x1:x2     1      3
#> 8 x3:x4     6      3
#> 9 x2:x5     2      4

Created on 2022-03-06 by the reprex package (v2.0.1)

Upvotes: 12

Related Questions