Reputation: 475
I would like to gapfill missing data (NA) by sampling from non-NA values from the group that the missing data belongs to.
group = Outlet
Also, if nrows(group) == 1 then draw random values from all groups (e.g. Outlet == ESO0244476).
Here's what I've tried so far:
Sample data
> dput(droplevels((example)))
structure(list(LENGTH = c(NA, 45447.4157838775, 161037.71538108,
78147.8550470324, 7193.48815617057, 1571.95459212405, 18191.381972185,
20366.2132412031, 10014.987524596, 1403.72511829297, 5651.17842991513,
6848.03271105711, 8043.32937011393, 8926.65133418451, 5808.44456603825,
2208.14264175252, 1797.4936747033, 5325.76651327694, 2660.66730207955,
5844.07912541444, 3956.40473896271, 959.873314407621, 3294.01472360025,
5221.94864001864, 3781.51913857335, 7811.83819953768, 3387.20323328623,
5514.92099458441, 5792.54371531706, 5643.98385143961, 15478.916809379,
8401.66533205217, 7046.25074819247, 2734.73639821402, NA, 62332.3343404513,
NA, 46563.1214718113, 25590.4020105238, 13015.3682275862, 4984.80432801441,
NA), NUMPOINTS = c(NA, 0, 8, 5, 2, 0, 9, 0, 0, 0, 3, 1, 0, 6,
1, 1, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, NA,
10, NA, 19, 6, 5, 0, NA), CTRY_ = structure(c(1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L,
3L, 2L, 2L, 2L, 2L, 1L), .Label = c("WCY_____ES", "WCY_____FR",
"WCY_____IT"), class = "factor"), Outlet = structure(c(4L, 3L,
3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("DSO0045543", "DSO0208527",
"DSO0213034", "ESO0244476"), class = "factor")), row.names = c(NA,
-42L), class = "data.frame")
Sample code
exam %>%
mutate(nuLENGTH = if_else(CTRY_ == 'WCY_____FR', LENGTH, sample(LENGTH[!is.na(LENGTH) & Outlet == Outlet], 1, TRUE)),
nuNUMPOINTS = if_else(CTRY_ == 'WCY_____FR', NUMPOINTS, sample(NUMPOINTS[!is.na(NUMPOINTS) & Outlet == Outlet], 1, TRUE)))
Within each group I expect the values for nuLENGTH
and nuNUMPOINTS
where CTRY_ != 'WCY_____FR'
to be different between groups as well as within a group, not the same e.g.
LENGTH NUMPOINTS CTRY_ Outlet nuLENGTH nuNUMPOINTS
1 NA NA WCY_____ES ESO0244476 1000.0000 0
2 45447.4158 0 WCY_____FR DSO0213034 45447.4158 0
3 161037.7154 8 WCY_____FR DSO0213034 161037.7154 8
4 78147.8550 5 WCY_____FR DSO0213034 78147.8550 5
5 7193.4882 2 WCY_____FR DSO0045543 7193.4882 2
6 1571.9546 0 WCY_____FR DSO0045543 1571.9546 0
7 18191.3820 9 WCY_____FR DSO0045543 18191.3820 9
8 20366.2132 0 WCY_____FR DSO0045543 20366.2132 0
9 10014.9875 0 WCY_____FR DSO0045543 10014.9875 0
10 1403.7251 0 WCY_____FR DSO0045543 1403.7251 0
11 5651.1784 3 WCY_____FR DSO0045543 5651.1784 3
12 6848.0327 1 WCY_____FR DSO0045543 6848.0327 1
13 8043.3294 0 WCY_____FR DSO0045543 8043.3294 0
14 8926.6513 6 WCY_____FR DSO0045543 8926.6513 6
15 5808.4446 1 WCY_____FR DSO0045543 5808.4446 1
16 2208.1426 1 WCY_____FR DSO0045543 2208.1426 1
17 1797.4937 0 WCY_____FR DSO0045543 1797.4937 0
18 5325.7665 0 WCY_____FR DSO0045543 5325.7665 0
19 2660.6673 1 WCY_____FR DSO0045543 2660.6673 1
20 5844.0791 0 WCY_____FR DSO0045543 5844.0791 0
21 3956.4047 0 WCY_____FR DSO0045543 3956.4047 0
22 959.8733 0 WCY_____FR DSO0045543 959.8733 0
23 3294.0147 1 WCY_____FR DSO0045543 3294.0147 1
24 5221.9486 2 WCY_____FR DSO0045543 5221.9486 2
25 3781.5191 0 WCY_____FR DSO0045543 3781.5191 0
26 7811.8382 0 WCY_____FR DSO0045543 7811.8382 0
27 3387.2032 0 WCY_____FR DSO0045543 3387.2032 0
28 5514.9210 0 WCY_____FR DSO0045543 5514.9210 0
29 5792.5437 1 WCY_____FR DSO0045543 5792.5437 1
30 5643.9839 0 WCY_____FR DSO0045543 5643.9839 0
31 15478.9168 0 WCY_____FR DSO0045543 15478.9168 0
32 8401.6653 0 WCY_____FR DSO0045543 8401.6653 0
33 7046.2507 1 WCY_____FR DSO0045543 7046.2507 1
34 2734.7364 0 WCY_____FR DSO0045543 2734.7364 0
35 NA NA WCY_____IT DSO0045543 7046.2507 0
36 62332.3343 10 WCY_____FR DSO0045543 62332.3343 10
37 NA NA WCY_____IT DSO0045543 3000.0000 0
38 46563.1215 19 WCY_____FR DSO0208527 46563.1215 19
39 25590.4020 6 WCY_____FR DSO0208527 25590.4020 6
40 13015.3682 5 WCY_____FR DSO0208527 13015.3682 5
41 4984.8043 0 WCY_____FR DSO0208527 4984.8043 0
42 NA NA WCY_____ES DSO0208527 2000.0000 0
Upvotes: 2
Views: 370
Reputation: 389135
We can write a function if there is NA
value in LENGTH
and number of rows in group is more than 1, select random value from the group excluding NA
values. If the number of values in group is just 1 then select random value from the entire data excluding NA
values. We apply the function for both the columns LENGTH
and NUMPOINTS
.
replace_func <- function(x, y) {
inds <- is.na(x)
if (length(x) > 1 & any(inds)) {
x[inds] <- sample(x[!inds], sum(inds))
x
}
else if(any(inds)) {
x[inds] <- sample(y[!is.na(y)], 1)
x
} else x
}
library(dplyr)
exam %>%
group_by(Outlet) %>%
mutate(nuLENGTH = replace_func(LENGTH, .$LENGTH),
nuNUMPOINTS = replace_func(NUMPOINTS, .$NUMPOINTS))
# LENGTH NUMPOINTS CTRY_ Outlet nuLENGTH nuNUMPOINTS
# <dbl> <dbl> <fct> <fct> <dbl> <dbl>
# 1 NA NA WCY_____ES ESO0244476 7046. 19
# 2 45447. 0 WCY_____FR DSO0213034 45447. 0
# 3 161038. 8 WCY_____FR DSO0213034 161038. 8
# 4 78148. 5 WCY_____FR DSO0213034 78148. 5
# 5 7193. 2 WCY_____FR DSO0045543 7193. 2
# 6 1572. 0 WCY_____FR DSO0045543 1572. 0
# 7 18191. 9 WCY_____FR DSO0045543 18191. 9
# 8 20366. 0 WCY_____FR DSO0045543 20366. 0
# 9 10015. 0 WCY_____FR DSO0045543 10015. 0
#10 1404. 0 WCY_____FR DSO0045543 1404. 0
# … with 32 more rows
Upvotes: 1
Reputation: 1784
Here's an option with dplyr
But as @Ronak said, there is no other observations in the same group to sample from in your first line.
example %>%
group_by(Outlet) %>%
mutate(
nuLENGTH = case_when(
is.na(LENGTH) ~ sample(LENGTH,size=1),
!is.na(LENGTH) ~ LENGTH),
nuNUMPOINTS = case_when(
is.na(NUMPOINTS) ~ sample(NUMPOINTS,size=1),
!is.na(NUMPOINTS) ~ NUMPOINTS))
Upvotes: 1