Omry Atia
Omry Atia

Reputation: 2443

in r keep group elements starting from first non-missing until last

I would like to retain, for each group of a data frame, the first non-missing element until the last element (even if the last is missing). For example:

df <- structure(list(Ind = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), result = c(81, 
90, NA, NA, 84, 86, 101, 134, NA, 78, NA, 103, NA, 112, NA, 111, 
NA, 110, 256, 123, NA, NA, 194, 301, 98.5, 151, NA, 105, NA, 
158, NA, 190, NA, 167, NA, 190, NA, NA, 113, 157, NA, 197, 155, 
NA, 178, 146, NA, 153, 153, 147, NA, 170, NA, 157)), row.names = c(NA, 
-54L), class = c("tbl_df", "tbl", "data.frame"))

In this example, group 3 should start from the 3rd observation and 1 and 2 should remain the same.

Upvotes: 3

Views: 141

Answers (2)

Ma&#235;l
Ma&#235;l

Reputation: 51974

dplyr

You can use dplyr::cumany, which will set to TRUE the first and subsequent values that are not NAs (!is.na(result)).

df %>% 
  group_by(Ind) %>% 
  filter(cumany(!is.na(result)))

base R

Use Reduce with accumulate = T to mimic dplyr::cumany(!is.na(result)) and ave to do it by group.

df[as.logical(ave(df$result, df$Ind, FUN = function(x) Reduce(any, !is.na(x), accumulate = TRUE))),]

output

# A tibble: 52 x 2
# Groups:   Ind [3]
     Ind result
   <int>  <dbl>
 1     1   81  
 2     1   90  
 3     1   NA  
 4     1   NA  
 5     1   84  
 6     1   86  
 7     1  101  
 8     1  134  
 9     1   NA  
10     1   78  
11     1   NA  
12     1  103  
13     1   NA  
14     1  112  
15     1   NA  
16     1  111  
17     1   NA  
18     1  110  
19     2  256  
20     2  123  
21     2   NA  
22     2   NA  
23     2  194  
24     2  301  
25     2   98.5
26     2  151  
27     2   NA  
28     2  105  
29     2   NA  
30     2  158  
31     2   NA  
32     2  190  
33     2   NA  
34     2  167  
35     2   NA  
36     2  190  
37     3  113  
38     3  157  
39     3   NA  
40     3  197  
41     3  155  
42     3   NA  
43     3  178  
44     3  146  
45     3   NA  
46     3  153  
47     3  153  
48     3  147  
49     3   NA  
50     3  170  
51     3   NA  
52     3  157

Upvotes: 1

PaulS
PaulS

Reputation: 25323

Another possible solution:

library(tidyverse)

df <- structure(list(Ind = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), result = c(81, 
90, NA, NA, 84, 86, 101, 134, NA, 78, NA, 103, NA, 112, NA, 111, 
NA, 110, 256, 123, NA, NA, 194, 301, 98.5, 151, NA, 105, NA, 
158, NA, 190, NA, 167, NA, 190, NA, NA, 113, 157, NA, 197, 155, 
NA, 178, 146, NA, 153, 153, 147, NA, 170, NA, 157)), row.names = c(NA, 
-54L), class = c("tbl_df", "tbl", "data.frame"))

df %>% 
  group_by(Ind) %>% 
  mutate(aux = which.min(is.na(result))) %>% 
  slice(unique(aux):n()) %>% select(-aux) %>% 
  ungroup 

#>    Ind result
#> 1    1   81.0
#> 2    1   90.0
#> 3    1     NA
#> 4    1     NA
#> 5    1   84.0
#> 6    1   86.0
#> 7    1  101.0
#> 8    1  134.0
#> 9    1     NA
#> 10   1   78.0
#> 11   1     NA
#> 12   1  103.0
#> 13   1     NA
#> 14   1  112.0
#> 15   1     NA
#> 16   1  111.0
#> 17   1     NA
#> 18   1  110.0
#> 19   2  256.0
#> 20   2  123.0
#> 21   2     NA
#> 22   2     NA
#> 23   2  194.0
#> 24   2  301.0
#> 25   2   98.5
#> 26   2  151.0
#> 27   2     NA
#> 28   2  105.0
#> 29   2     NA
#> 30   2  158.0
#> 31   2     NA
#> 32   2  190.0
#> 33   2     NA
#> 34   2  167.0
#> 35   2     NA
#> 36   2  190.0
#> 37   3  113.0
#> 38   3  157.0
#> 39   3     NA
#> 40   3  197.0
#> 41   3  155.0
#> 42   3     NA
#> 43   3  178.0
#> 44   3  146.0
#> 45   3     NA
#> 46   3  153.0
#> 47   3  153.0
#> 48   3  147.0
#> 49   3     NA
#> 50   3  170.0
#> 51   3     NA
#> 52   3  157.0

Upvotes: 1

Related Questions