Michael Zhao
Michael Zhao

Reputation: 31

R data imputation from group_by table based on count

group = c(1,1,4,4,4,5,5,6,1,4,6,1,1,1,1,6,4,4,4,4,1,4,5,6)
animal = c('a','b','c','c','d','a','b','c','b','d','c','a','a','a','a','c','c','c','c','c','a','c','a','c')
sleep = c('y','n','y','y','y','n','n','y','n','y','n','y','y','n','m','y','n','n','n','n',NA, NA, NA, NA)

test = data.frame(group, animal, sleep)
print(test)

group_animal = test %>% group_by(`group`, `animal`) %>% count(sleep)
print(group_animal)

I would like to replace the NA values in the test df's sleep column by the highest count of sleep answer based on group and animal.

Such that Group 1, Animal a with NAs in the sleep column should have a sleep value of 'y' because that is the value with the highest count among Group 1 Animal a.

Group 4 animal c with NAs for sleep should have 'n' as the sleep value as well.

Upvotes: 1

Views: 62

Answers (3)

TarJae
TarJae

Reputation: 78927

Update now with group_by(group, animal) thnx @Quinten, removed prior answer:

  1. group by animal

  2. use replace_na with the replace argument as sleep[n==max(n)]

  3. new: in case of ties like in group 5 add !is.na(sleep) to avoid conflicts:

library(dplyr)
library(tidyr)

group_animal %>%
    group_by(group, animal) %>% 
    arrange(desc(sleep), .by_group = TRUE) %>% 
    mutate(sleep = replace_na(sleep, sleep[n==max(n) & !is.na(sleep)])) 
  group animal sleep     n
   <dbl> <chr>  <chr> <int>
 1     1 a      y         3
 2     1 a      n         1
 3     1 a      m         1
 4     1 a      y         1
 5     1 b      n         2
 6     4 c      y         2
 7     4 c      n         4
 8     4 c      n         1
 9     4 d      y         2
10     5 a      n         1
11     5 a      n         1
12     5 b      n         1
13     6 c      y         2
14     6 c      n         1
15     6 c      y         1

Upvotes: 3

user6836753
user6836753

Reputation:

Try this.

This method essential creates a custom column to coalesce with sleep, it subsets sleep based on the max count values obtained from str_count

library(dplyr)

test |> 
  group_by(group, animal) |> 
  mutate(sleep = coalesce(sleep, sleep[max(stringr::str_count(paste(sleep, collapse = ""), pattern = sleep), na.rm = TRUE)])) |>
  ungroup()
   group animal sleep
1      1      a     y
2      1      b     n
3      4      c     y
4      4      c     y
5      4      d     y
6      5      a     n
7      5      b     n
8      6      c     y
9      1      b     n
10     4      d     y
11     6      c     n
12     1      a     y
13     1      a     y
14     1      a     n
15     1      a     m
16     6      c     y
17     4      c     n
18     4      c     n
19     4      c     n
20     4      c     n
21     1      a     y
22     4      c     n
23     5      a     n
24     6      c     n

Upvotes: 1

Quinten
Quinten

Reputation: 41285

Another option is replacing the NAs with the Mode. You can use the Mode function from this post in the na.aggregate function from zoo to replace these NAs like this:

Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

group = c(1,1,4,4,4,5,5,6,1,4,6,1,1,1,1,6,4,4,4,4,1,4,5,6)
animal = c('a','b','c','c','d','a','b','c','b','d','c','a','a','a','a','c','c','c','c','c','a','c','a','c')
sleep = c('y','n','y','y','y','n','n','y','n','y','n','y','y','n','m','y','n','n','n','n',NA, NA, NA, NA)
test = data.frame(group, animal, sleep)

library(dplyr)
library(zoo)
test %>%
  group_by(group, animal) %>%
  mutate(sleep = na.aggregate(sleep , FUN=Mode)) %>%
  ungroup()
#> # A tibble: 24 × 3
#>    group animal sleep
#>    <dbl> <chr>  <chr>
#>  1     1 a      y    
#>  2     1 b      n    
#>  3     4 c      y    
#>  4     4 c      y    
#>  5     4 d      y    
#>  6     5 a      n    
#>  7     5 b      n    
#>  8     6 c      y    
#>  9     1 b      n    
#> 10     4 d      y    
#> # … with 14 more rows
#> # ℹ Use `print(n = ...)` to see more rows

Created on 2022-07-26 by the reprex package (v2.0.1)

Here is tail of output:

> tail(test)
# A tibble: 6 × 3
  group animal sleep
  <dbl> <chr>  <chr>
1     4 c      n    
2     4 c      n    
3     1 a      y    
4     4 c      n    
5     5 a      n    
6     6 c      y 

Upvotes: 3

Related Questions