StLouisO
StLouisO

Reputation: 71

Compare one group to the rest of the groups as a whole in R

Here is some sample data:

# Sample data: one row per movie outing. The three *_type columns are the
# categorical grouping variables; total_cost is the numeric outcome.
movie_df <- data.frame(
  ID = as.numeric(1:10),
  movie_type = c(
    "Action", "Horror", "Comedy", "Thriller", "Comedy",
    "Action", "Thriller", "Horror", "Action", "Comedy"
  ),
  snack_type = c(
    "Chocolate", "Popcorn", "Candy", "Popcorn", "Popcorn",
    "Candy", "Chocolate", "Candy", "Popcorn", "Chocolate"
  ),
  event_type = c(
    "Solo", "Family", "Date", "Friends", "Solo",
    "Family", "Date", "Date", "Friends", "Friends"
  ),
  total_cost = c(50, 35, 20, 50, 30, 60, 25, 35, 20, 50)
)

What I want to do is go through each column and compare each group to the rest of the groups on total_cost. For example, I want to see how movie_type == 'Action' compares to movie_type != 'Action' for total_cost. I want to do that for every type in movie_type then every type in snack_type and event_type.

Ultimately, I want to end up with the table below, where sd = standard deviation. Ideally this would be done with a tidyverse approach in R (e.g. dplyr or tidyr):

> results_df
# A tibble: 11 x 11
   Group      Grp_1     Grp_2         Grp_1_mean Grp_2_mean Grp_1_sd Grp_2_sd Grp_1_n Grp_2_n Mean_Diff `t-test`
   <chr>      <chr>     <chr>              <dbl>      <dbl>    <dbl>    <dbl>   <dbl>   <dbl>     <dbl>    <dbl>
 1 movie_type Action    Rest of group       43.3       35      20.8      11.5       3       7      8.33    2.84 
 2 movie_type Horror    Rest of group       35         38.1     0        16.0       2       8     -3.12   -2.21 
 3 movie_type Thriller  Rest of group       37.5       37.5    17.7      14.6       2       8      0       0    
 4 movie_type Comedy    Rest of group       33.3       39.3    15.3      14.6       3       7     -5.95   -2.22 
 5 snack_type Chocolate Rest of group       41.7       35.7    14.4      14.8       3       7      5.95    2.26 
 6 snack_type Candy     Rest of group       38.3       37.1    20.2      12.9       3       7      1.19    0.407
 7 snack_type Popcorn   Rest of group       33.8       40      12.5      15.8       4       6     -6.25   -2.60 
 8 event_type Date      Rest of group       26.7       42.1     7.64     14.1       3       7    -15.5    -7.25 
 9 event_type Family    Rest of group       47.5       35      17.7      13.4       2       8     12.5     3.86 
10 event_type Friends   Rest of group       40         36.4    17.3      14.1       3       7      3.57    1.28 
11 event_type Solo      Rest of group       40         36.9    14.1      15.1       2       8      3.12    1.04 

Upvotes: 0

Views: 262

Answers (2)

det
det

Reputation: 5232

It's the same logic as Daniel's answer, but using purrr::map and purrr::map2.

library(dplyr)
library(tibble)
library(purrr)
library(stringr)

# Grouping columns whose levels are each compared against "the rest".
needed_cols <- c("movie_type", "snack_type", "event_type")

# Names for the two halves of each comparison row:
# group_1, mean_1, sd_1, n_1, group_2, mean_2, sd_2, n_2.
new_names <- 1:2 %>%
  map(~ str_c(c("group", "mean", "sd", "n"), "_", .x)) %>%
  unlist()

# Summarise total_cost for the rows where `col` equals `value` versus all
# remaining rows.
#
# data:      data frame containing `col` and a numeric `total_cost` column.
# col:       name of the grouping column, as a string.
# value:     the level of `col` to single out.
# col_names: the eight output column names (focal group first, then rest).
#
# Returns a one-row tibble named by `col_names`.
compare_to_rest <- function(data, col, value, col_names = new_names) {
  value <- as.character(value)

  stats <- data %>%
    # .data[[col]] looks the column up by name inside the data frame only,
    # so it cannot accidentally resolve to an unrelated object like get() can.
    mutate(group = ifelse(.data[[col]] == value, value, "rest_of_group")) %>%
    group_by(group) %>%
    summarize(mean = mean(total_cost), sd = sd(total_cost), n = n())

  # summarize() sorts groups alphabetically; look up the wanted order in the
  # summary so the focal level is always row 1 and the rest is row 2.
  stats <- stats[match(c(value, "rest_of_group"), stats$group), ]

  # Both halves share column names; skip name repair (and its messages)
  # because the names are overwritten on the next line.
  wide <- bind_cols(stats[1, ], stats[2, ], .name_repair = "minimal")
  names(wide) <- col_names
  wide
}

my_data <- needed_cols %>%
  map(function(df_c) {
    # One comparison row per distinct level, labelled with the source column.
    unique(movie_df[[df_c]]) %>%
      map(function(v) compare_to_rest(movie_df, df_c, v)) %>%
      bind_rows() %>%
      mutate(group = df_c)
  }) %>%
  bind_rows() %>%
  # Prefix-based helpers instead of loose regexes: a bare "n" pattern would
  # also match the mean_* columns.
  select(
    group,
    starts_with("group_"),
    starts_with("mean_"),
    starts_with("sd_"),
    starts_with("n_")
  ) %>%
  mutate(mean_diff = mean_1 - mean_2)

Upvotes: 1

Daniel O
Daniel O

Reputation: 4358

Sorry it's not written with pipes, but in base R we can do:

# Columns whose levels are each compared against all remaining rows.
group_cols <- c("movie_type", "snack_type", "event_type")

# For every grouping column and every distinct level in it, build a one-row
# data frame with the mean and sd of total_cost inside vs. outside the level.
# lapply() over the columns (rather than apply() over the data frame) keeps
# each column's own type instead of coercing everything to a character matrix.
results_df <- do.call(rbind, unlist(
  lapply(movie_df[group_cols], function(u) {
    lapply(unique(u), function(x) {
      in_group <- u == x
      data.frame(
        group1 = as.character(x),
        group2 = "rest",
        grp1_mean = mean(movie_df$total_cost[in_group]),
        grp2_mean = mean(movie_df$total_cost[!in_group]),
        grp1_sd = sd(movie_df$total_cost[in_group]),
        grp2_sd = sd(movie_df$total_cost[!in_group])
      )
    })
  }),
  # Flatten only one level, so the result is a flat list of data frames named
  # <column><index> (e.g. movie_type1); rbind() uses those as row names.
  recursive = FALSE
))


# Add the difference between the group mean and the rest-of-group mean
results_df$meandiff <- with(results_df, grp1_mean - grp2_mean)




> results_df
               group1 group2 grp1_mean grp2_mean   grp1_sd  grp2_sd   meandiff
movie_type1    Action   rest  43.33333  35.00000 20.816660 11.54701   8.333333
movie_type2    Horror   rest  35.00000  38.12500  0.000000 16.02175  -3.125000
movie_type3    Comedy   rest  33.33333  39.28571 15.275252 14.55695  -5.952381
movie_type4  Thriller   rest  37.50000  37.50000 17.677670 14.63850   0.000000
snack_type1 Chocolate   rest  41.66667  35.71429 14.433757 14.84042   5.952381
snack_type2   Popcorn   rest  33.75000  40.00000 12.500000 15.81139  -6.250000
snack_type3     Candy   rest  38.33333  37.14286 20.207259 12.86375   1.190476
event_type1      Solo   rest  40.00000  36.87500 14.142136 15.10381   3.125000
event_type2    Family   rest  47.50000  35.00000 17.677670 13.36306  12.500000
event_type3      Date   rest  26.66667  42.14286  7.637626 14.09998 -15.476190
event_type4   Friends   rest  40.00000  36.42857 17.320508 14.05770   3.571429

Upvotes: 0

Related Questions