Rasmus Larsen
Rasmus Larsen

Reputation: 6137

Nest a dataframe by group, but include extra rows within each group

Using the diamonds dataset, I am trying to run regression models comparing the color "D" to each of the other colors, within each level of cut (without specifying a interaction in the regression model).

For this purpose I am trying to create nested dataframes grouped by cut and color, but within each group I want the reference color "D" of the appropriate cut.

The following code does not do what I want, because each group of color does not contain the color "D":

    library(tidyverse)

> diamonds %>% 
+     group_by(cut, color) %>% 
+     nest() %>% arrange(cut, color)
# A tibble: 35 x 3
   cut   color data              
   <ord> <ord> <list>            
 1 Fair  D     <tibble [163 x 8]>
 2 Fair  E     <tibble [224 x 8]>
 3 Fair  F     <tibble [312 x 8]>
 4 Fair  G     <tibble [314 x 8]>
 5 Fair  H     <tibble [303 x 8]>
 6 Fair  I     <tibble [175 x 8]>
 7 Fair  J     <tibble [119 x 8]>
 8 Good  D     <tibble [662 x 8]>
 9 Good  E     <tibble [933 x 8]>
10 Good  F     <tibble [909 x 8]>
# ... with 25 more rows

The code below does the job, but I am looking for a tidyverse version:

data_fair_de          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "E"))           %>%    mutate( grouping_var  =  "data_fair_de" )    
data_fair_df          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "F"))           %>%    mutate( grouping_var  =  "data_fair_df" )    
data_fair_dg          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "G"))           %>%    mutate( grouping_var  =  "data_fair_dg" )    
data_fair_dh          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "H"))           %>%    mutate( grouping_var  =  "data_fair_dh" )    
data_fair_di          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "I"))           %>%    mutate( grouping_var  =  "data_fair_di" )    
data_fair_dj          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "J"))           %>%    mutate( grouping_var  =  "data_fair_dj" )    
data_good_de          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "E"))           %>%    mutate( grouping_var  =  "data_good_de "    )    
data_good_df          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "F"))           %>%    mutate( grouping_var  =  "data_good_df "    )    
data_good_dg          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "G"))           %>%    mutate( grouping_var  =  "data_good_dg "    )    
data_good_dh          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "H"))           %>%    mutate( grouping_var  =  "data_good_dh "    )    
data_good_di          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "I"))           %>%    mutate( grouping_var  =  "data_good_di "    )    
data_good_dj          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "J"))           %>%    mutate( grouping_var  =  "data_good_dj "    )    
data_very_de          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "E"))      %>%    mutate( grouping_var  =  "data_very_de "    )    
data_very_df          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "F"))      %>%    mutate( grouping_var  =  "data_very_df "    )    
data_very_dg          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "G"))      %>%    mutate( grouping_var  =  "data_very_dg "    )    
data_very_dh          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "H"))      %>%    mutate( grouping_var  =  "data_very_dh "    )    
data_very_di          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "I"))      %>%    mutate( grouping_var  =  "data_very_di "    )    
data_very_dj          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "J"))      %>%    mutate( grouping_var  =  "data_very_dj "    )    
data_premium_de       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "E"))        %>%    mutate( grouping_var  =  "data_premium_de " )    
data_premium_df       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "F"))        %>%    mutate( grouping_var  =  "data_premium_df " )    
data_premium_dg       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "G"))        %>%    mutate( grouping_var  =  "data_premium_dg " )    
data_premium_dh       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "H"))        %>%    mutate( grouping_var  =  "data_premium_dh " )    
data_premium_di       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "I"))        %>%    mutate( grouping_var  =  "data_premium_di " )    
data_premium_dj       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "J"))        %>%    mutate( grouping_var  =  "data_premium_dj " )    
data_ideal_de         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "E"))          %>%    mutate( grouping_var  =  "data_ideal_de "   )    
data_ideal_df         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "F"))          %>%    mutate( grouping_var  =  "data_ideal_df "   )    
data_ideal_dg         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "G"))          %>%    mutate( grouping_var  =  "data_ideal_dg "   )    
data_ideal_dh         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "H"))          %>%    mutate( grouping_var  =  "data_ideal_dh "   )    
data_ideal_di         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "I"))          %>%    mutate( grouping_var  =  "data_ideal_di "   )    
data_ideal_dj         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "J"))          %>%    mutate( grouping_var  =  "data_ideal_dj "   )    

bind_rows( 
 data_fair_de   ,  data_fair_df   ,  data_fair_dg   ,  data_fair_dh   ,  data_fair_di   ,  data_fair_dj   , 
 data_good_de   ,  data_good_df   ,  data_good_dg   ,  data_good_dh   ,  data_good_di   ,  data_good_dj   , 
 data_very_de   ,  data_very_df   ,  data_very_dg   ,  data_very_dh   ,  data_very_di   ,  data_very_dj   , 
 data_premium_de,  data_premium_df,  data_premium_dg,  data_premium_dh,  data_premium_di,  data_premium_dj, 
 data_ideal_de  ,  data_ideal_df  ,  data_ideal_dg  ,  data_ideal_dh  ,  data_ideal_di  ,  data_ideal_dj  ) %>% 
    group_by(grouping_var) %>% 
    nest()

Upvotes: 0

Views: 401

Answers (3)

AntoniosK
AntoniosK

Reputation: 16121

library(tidyverse)

# function to get data based on your filter
f = function(xx,yy,zz) {diamonds %>% filter(cut==xx & color %in% c(yy,zz))}

expand(diamonds, cut, color, color_D="D") %>%              # create all combinations of interest
  mutate_all(as.character) %>%                             # update to character variables
  rowwise() %>%                                            # for each row
  mutate(data = list(f(cut, color, color_D))) %>%          # apply your function
  ungroup() %>%                                            # forget the grouping
  filter(color != color_D)                                 # exclude cases where pair of colours is {D,D}

# # A tibble: 30 x 4
#     cut   color color_D data                 
#     <chr> <chr> <chr>   <list>               
#   1 Fair  E     D       <tibble [387 x 10]>  
#   2 Fair  F     D       <tibble [475 x 10]>  
#   3 Fair  G     D       <tibble [477 x 10]>  
#   4 Fair  H     D       <tibble [466 x 10]>  
#   5 Fair  I     D       <tibble [338 x 10]>  
#   6 Fair  J     D       <tibble [282 x 10]>  
#   7 Good  E     D       <tibble [1,595 x 10]>
#   8 Good  F     D       <tibble [1,571 x 10]>
#   9 Good  G     D       <tibble [1,533 x 10]>
#  10 Good  H     D       <tibble [1,364 x 10]>
#   # ... with 20 more rows

Upvotes: 1

moodymudskipper
moodymudskipper

Reputation: 47350

We can follow your idea and loop on the nested data.frames afterwards, adding rows of color D by group, and then removing the D rows:

diamonds %>%
  nest(-cut,-color) %>%
  group_by(cut) %>%
  mutate(data = map(data, ~ bind_rows(data[[which(color=="D")]], .x))) %>%
  ungroup %>%
  filter(color != "D") %>%
  arrange(cut, color)
# # A tibble: 30 x 3
#       cut color                 data
#     <ord> <ord>               <list>
#   1  Fair     E   <tibble [387 x 8]>
#   2  Fair     F   <tibble [475 x 8]>
#   3  Fair     G   <tibble [477 x 8]>
#   4  Fair     H   <tibble [466 x 8]>
#   5  Fair     I   <tibble [338 x 8]>
#   6  Fair     J   <tibble [282 x 8]>
#   7  Good     E <tibble [1,595 x 8]>
#   8  Good     F <tibble [1,571 x 8]>
#   9  Good     G <tibble [1,533 x 8]>
#  10  Good     H <tibble [1,364 x 8]>
#   # ... with 20 more rows

Upvotes: 1

AndS.
AndS.

Reputation: 8120

If you want to add D to every group you could just append each group with the appropriate "D"

df <- diamonds %>% 
    group_by(cut, color) %>% 
    nest() %>% 
    arrange(cut, color) %>%
    ungroup()%>%
    group_by(cut) %>%
    mutate(append_data = map(data, ~rbind(.x, data[[which(row_number() == 1)]])))

df
#    cut   color data               append_data         
#  1 Fair  D     <tibble [163 × 8]> <tibble [326 × 8]>  
#  2 Fair  E     <tibble [224 × 8]> <tibble [387 × 8]>  
#  3 Fair  F     <tibble [312 × 8]> <tibble [475 × 8]>  
#  4 Fair  G     <tibble [314 × 8]> <tibble [477 × 8]>  
#  5 Fair  H     <tibble [303 × 8]> <tibble [466 × 8]> 
#  # ... with 30 more rows

That should give you a new nested column that also has D per group.

Upvotes: 1

Related Questions