Josh J
Josh J

Reputation: 475

Replace NA with sample() by group

I would like to gapfill missing data (NA) by sampling from non-NA values from the group that the missing data belongs to.

group = Outlet

Also, if nrows(group) == 1 then draw random values from all groups (e.g. Outlet == ESO0244476).

Here's what I've tried so far:

Sample data

> dput(droplevels((example)))
structure(list(LENGTH = c(NA, 45447.4157838775, 161037.71538108, 
78147.8550470324, 7193.48815617057, 1571.95459212405, 18191.381972185, 
20366.2132412031, 10014.987524596, 1403.72511829297, 5651.17842991513, 
6848.03271105711, 8043.32937011393, 8926.65133418451, 5808.44456603825, 
2208.14264175252, 1797.4936747033, 5325.76651327694, 2660.66730207955, 
5844.07912541444, 3956.40473896271, 959.873314407621, 3294.01472360025, 
5221.94864001864, 3781.51913857335, 7811.83819953768, 3387.20323328623, 
5514.92099458441, 5792.54371531706, 5643.98385143961, 15478.916809379, 
8401.66533205217, 7046.25074819247, 2734.73639821402, NA, 62332.3343404513, 
NA, 46563.1214718113, 25590.4020105238, 13015.3682275862, 4984.80432801441, 
NA), NUMPOINTS = c(NA, 0, 8, 5, 2, 0, 9, 0, 0, 0, 3, 1, 0, 6, 
1, 1, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, NA, 
10, NA, 19, 6, 5, 0, NA), CTRY_ = structure(c(1L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 
3L, 2L, 2L, 2L, 2L, 1L), .Label = c("WCY_____ES", "WCY_____FR", 
"WCY_____IT"), class = "factor"), Outlet = structure(c(4L, 3L, 
3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("DSO0045543", "DSO0208527", 
"DSO0213034", "ESO0244476"), class = "factor")), row.names = c(NA, 
-42L), class = "data.frame")

Sample code

exam %>%
  mutate(nuLENGTH = if_else(CTRY_ == 'WCY_____FR', LENGTH, sample(LENGTH[!is.na(LENGTH) & Outlet == Outlet], 1, TRUE)),
         nuNUMPOINTS = if_else(CTRY_ == 'WCY_____FR', NUMPOINTS, sample(NUMPOINTS[!is.na(NUMPOINTS) & Outlet == Outlet], 1, TRUE))) 

Within each group I expect the values for nuLENGTH and nuNUMPOINTS where CTRY_ != 'WCY_____FR' to be different between groups as well as within a group, not the same e.g.

        LENGTH NUMPOINTS      CTRY_     Outlet    nuLENGTH nuNUMPOINTS
1           NA        NA WCY_____ES ESO0244476   1000.0000           0
2   45447.4158         0 WCY_____FR DSO0213034  45447.4158           0
3  161037.7154         8 WCY_____FR DSO0213034 161037.7154           8
4   78147.8550         5 WCY_____FR DSO0213034  78147.8550           5
5    7193.4882         2 WCY_____FR DSO0045543   7193.4882           2
6    1571.9546         0 WCY_____FR DSO0045543   1571.9546           0
7   18191.3820         9 WCY_____FR DSO0045543  18191.3820           9
8   20366.2132         0 WCY_____FR DSO0045543  20366.2132           0
9   10014.9875         0 WCY_____FR DSO0045543  10014.9875           0
10   1403.7251         0 WCY_____FR DSO0045543   1403.7251           0
11   5651.1784         3 WCY_____FR DSO0045543   5651.1784           3
12   6848.0327         1 WCY_____FR DSO0045543   6848.0327           1
13   8043.3294         0 WCY_____FR DSO0045543   8043.3294           0
14   8926.6513         6 WCY_____FR DSO0045543   8926.6513           6
15   5808.4446         1 WCY_____FR DSO0045543   5808.4446           1
16   2208.1426         1 WCY_____FR DSO0045543   2208.1426           1
17   1797.4937         0 WCY_____FR DSO0045543   1797.4937           0
18   5325.7665         0 WCY_____FR DSO0045543   5325.7665           0
19   2660.6673         1 WCY_____FR DSO0045543   2660.6673           1
20   5844.0791         0 WCY_____FR DSO0045543   5844.0791           0
21   3956.4047         0 WCY_____FR DSO0045543   3956.4047           0
22    959.8733         0 WCY_____FR DSO0045543    959.8733           0
23   3294.0147         1 WCY_____FR DSO0045543   3294.0147           1
24   5221.9486         2 WCY_____FR DSO0045543   5221.9486           2
25   3781.5191         0 WCY_____FR DSO0045543   3781.5191           0
26   7811.8382         0 WCY_____FR DSO0045543   7811.8382           0
27   3387.2032         0 WCY_____FR DSO0045543   3387.2032           0
28   5514.9210         0 WCY_____FR DSO0045543   5514.9210           0
29   5792.5437         1 WCY_____FR DSO0045543   5792.5437           1
30   5643.9839         0 WCY_____FR DSO0045543   5643.9839           0
31  15478.9168         0 WCY_____FR DSO0045543  15478.9168           0
32   8401.6653         0 WCY_____FR DSO0045543   8401.6653           0
33   7046.2507         1 WCY_____FR DSO0045543   7046.2507           1
34   2734.7364         0 WCY_____FR DSO0045543   2734.7364           0
35          NA        NA WCY_____IT DSO0045543   7046.2507           0
36  62332.3343        10 WCY_____FR DSO0045543  62332.3343          10
37          NA        NA WCY_____IT DSO0045543   3000.0000           0
38  46563.1215        19 WCY_____FR DSO0208527  46563.1215          19
39  25590.4020         6 WCY_____FR DSO0208527  25590.4020           6
40  13015.3682         5 WCY_____FR DSO0208527  13015.3682           5
41   4984.8043         0 WCY_____FR DSO0208527   4984.8043           0
42          NA        NA WCY_____ES DSO0208527   2000.0000          0

Upvotes: 2

Views: 370

Answers (2)

Ronak Shah
Ronak Shah

Reputation: 389135

We can write a function if there is NA value in LENGTH and number of rows in group is more than 1, select random value from the group excluding NA values. If the number of values in group is just 1 then select random value from the entire data excluding NA values. We apply the function for both the columns LENGTH and NUMPOINTS.

replace_func <- function(x, y) {
    inds <- is.na(x)
    if (length(x) > 1 & any(inds)) {
     x[inds] <- sample(x[!inds], sum(inds))
     x
    }
    else if(any(inds)) {
       x[inds] <- sample(y[!is.na(y)], 1)
       x
    } else x
}

library(dplyr) 

exam %>%
  group_by(Outlet) %>%
  mutate(nuLENGTH = replace_func(LENGTH, .$LENGTH), 
         nuNUMPOINTS = replace_func(NUMPOINTS, .$NUMPOINTS)) 

#  LENGTH NUMPOINTS CTRY_      Outlet     nuLENGTH nuNUMPOINTS
#     <dbl>     <dbl> <fct>      <fct>         <dbl>       <dbl>
# 1     NA         NA WCY_____ES ESO0244476    7046.          19
# 2  45447.         0 WCY_____FR DSO0213034   45447.           0
# 3 161038.         8 WCY_____FR DSO0213034  161038.           8
# 4  78148.         5 WCY_____FR DSO0213034   78148.           5
# 5   7193.         2 WCY_____FR DSO0045543    7193.           2
# 6   1572.         0 WCY_____FR DSO0045543    1572.           0
# 7  18191.         9 WCY_____FR DSO0045543   18191.           9
# 8  20366.         0 WCY_____FR DSO0045543   20366.           0
# 9  10015.         0 WCY_____FR DSO0045543   10015.           0
#10   1404.         0 WCY_____FR DSO0045543    1404.           0
# … with 32 more rows

Upvotes: 1

Fino
Fino

Reputation: 1784

Here's an option with dplyr

But as @Ronak said, there is no other observations in the same group to sample from in your first line.

example %>% 
  group_by(Outlet) %>% 

  mutate(

    nuLENGTH = case_when(
      is.na(LENGTH) ~ sample(LENGTH,size=1),
      !is.na(LENGTH) ~ LENGTH),

    nuNUMPOINTS = case_when(
      is.na(NUMPOINTS) ~ sample(NUMPOINTS,size=1),
      !is.na(NUMPOINTS) ~ NUMPOINTS))

Upvotes: 1

Related Questions