mehmety
mehmety

Reputation: 53

How to get the row means of values of dataframe elements in a list?

I have a list object in R called list_df with a length of 4 . Each 4 element contains one matrix, with one column and four rows and row names as characters. What I want to do is I would like to get means and upper and lower bounds of these means in confidence interval 95% for every row in dataframe that matches with the names of other rows in other dataframes or the same dataframe within the list. I want to put this information to a dataframe where there are the rownames are still the same, and there are three columns with mean, lower bound of CI and upper bound of CI. If a rowname occurs only in one dataframe, I would like the final mean output to be that value. Lets say if the row name is missing, I would like those na's to have no effect on the means. So if say row "blue" occurs only in element 1 and element 5 and I want to mean(blue) to be average of these two. So if the first matrix is row.names= a b c d with values 1 2 3 4 and second one is a b c c 1 ,2,3,3 I want the final df ouput to be row.names = a b c d values 1,2,3,4 Here is a reproducable example:

    df1 <- data.frame(
      var1 = c(1, 2, 3, 4),
      var2 = c(5, 6, 7, 8)
    )

m1 <- as.matrix(df1)

    rownames(m1) <-  c("red", "green", "black", "white")
    
    df2 <- data.frame(
      var1 = c(9, 10, 11, 12),
      var2 = c(13, 14, 15, 16)
    )


m2 <- as.matrix(df2)



    rownames(m2) <- c("red", "green", "black", "white")
    
    
    df3 <- data.frame(
      var1 = c(17, 18, 19, 20),
      var2 = c(21, 22, 23, 24)
    )


m3 <- as.matrix(df3)

    rownames(m3) <- c("red", "green", "white", "white")
    
    
    df4 <- data.frame(
      var1 = c(25, 26, 27, 28),
      var2 = c(29, 30, 31, 32)
    )

m4 <- as.matrix(df4)


    rownames(m4) <- c("black", "green", "white", "white")
    
    # Combine the matrices into a list
    list_df <- list(m1, m2, m3, m4)


I want the output to be :
    
     
        dffinal <- data.frame(
          mean = c(mean(red), mean(green), mean(black), mean(white)),
          lower = c(lowerCI(red), lowerCI(green), lowerCI(black), lowerCI(white))
           upper  c(upperCI(red), upperCI(green), upperCI(black), lowerCI(white))
        )
    
    #with rownames being like this
        rownames(dffinal) <-  c("red", "green", "black", "white")

Upvotes: 0

Views: 75

Answers (2)

TarJae
TarJae

Reputation: 79184

Something like this:

library(dplyr)
library(tidyr)
library(tibble)

bind_rows(list(df1 = df1, df2 = df2, df3 = df3, df4=df4), .id = 'id') %>% 
  rownames_to_column("colors") %>% 
  mutate(colors = gsub("([A-Za-z]+).*", "\\1", colors)) %>% 
  pivot_longer(starts_with("var")) %>% 
  group_by(id, colors) %>% 
  # group_by(colors) %>% 
  summarise( n = sum(value),
             mean_value = mean(value, na.rm = TRUE),
             se = sd(value, na.rm = TRUE) / sqrt(n),
             ci = 1.96 * se,
             lb = mean_value - ci,
             ub = mean_value + ci)
             )
  id    colors     n mean_value    se    ci     lb    ub
   <chr> <chr>  <dbl>      <dbl> <dbl> <dbl>  <dbl> <dbl>
 1 df1   black     10          5 0.894 1.75   3.25   6.75
 2 df1   blue       6          3 1.15  2.26   0.737  5.26
 3 df1   purple     8          4 1     1.96   2.04   5.96
 4 df1   white     12          6 0.816 1.60   4.40   7.60
 5 df2   black     26         13 0.555 1.09  11.9   14.1 
 6 df2   green     24         12 0.577 1.13  10.9   13.1 
 7 df2   red       22         11 0.603 1.18   9.82  12.2 
 8 df2   white     28         14 0.535 1.05  13.0   15.0 
 9 df3   orange    42         21 0.436 0.855 20.1   21.9 
10 df3   purple    38         19 0.459 0.899 18.1   19.9 
11 df3   white     44         22 0.426 0.836 21.2   22.8 
12 df3   yellow    40         20 0.447 0.877 19.1   20.9 
13 df4   black     56         28 0.378 0.741 27.3   28.7 
14 df4   blue      54         27 0.385 0.754 26.2   27.8 
15 df4   pink      58         29 0.371 0.728 28.3   29.7 
16 df4   white     60         30 0.365 0.716 29.3   30.7

Upvotes: 1

akrun
akrun

Reputation: 887681

We could do

library(purrr)
library(dplyr) #version >= 1.1.0
library(qwraps2)
library(tidyr)
map_dfr(list_df, ~ as_tibble(.x) %>%
     mutate(color = row.names(.x), .before = 1) %>%
     mutate(rn = row_number())) %>%
  pivot_longer(cols = c(var1, var2)) %>% 
 group_by(color, rn) %>% 
 reframe(value = mean(value)) %>%
 group_by(color) %>% 
 reframe(out = list(mean_ci(value))) %>%
 unnest_wider(out)

-output

# A tibble: 4 × 4
  color mean       lcl        ucl       
  <chr> <qwrps2__> <qwrps2__> <qwrps2__>
1 black 18.0        0.3603241 35.63968  
2 green 16.0               NA       NA  
3 red   11.0               NA       NA  
4 white 21.5       14.6401261 28.35987  

data

list_df <- list(structure(c(1, 2, 3, 4, 5, 6, 7, 8),
 dim = c(4L, 2L), dimnames = list(
    c("red", "green", "black", "white"), c("var1", "var2"))), 
    structure(c(9, 10, 11, 12, 13, 14, 15, 16), dim = c(4L, 2L
    ), dimnames = list(c("red", "green", "black", "white"), c("var1", 
    "var2"))), structure(c(17, 18, 19, 20, 21, 22, 23, 24), dim = c(4L, 
    2L), dimnames = list(c("red", "green", "white", "white"), 
        c("var1", "var2"))), structure(c(25, 26, 27, 28, 29, 
    30, 31, 32), dim = c(4L, 2L), dimnames = list(c("black", 
    "green", "white", "white"), c("var1", "var2"))))

Upvotes: 3

Related Questions