rjuri
rjuri

Reputation: 133

How to filter out a row if there are two consecutive instances of the same value?

I have a data frame with multiple similar sequences in which column Z has a string pattern containing "VALUE1" and "VALUE2" (only these two patterns matter) and I'd like to filter out all the rows with consecutive values of "VALUE2" so, that only the first instance of "VALUE2" in the sequence will be left. For example the data before filtering:

weight | height | Z
---------------------------
62      100      NA
65      89       NA
59      88       randomnumbersVALUE1randomtext
66      92       NA
64      90       NA
64      87       randomnumbersVALUE2randomtext
57      84       NA
68      99       randomnumbersVALUE2randomtext
59      82       NA
60      87       srebmunmodnarVALUE1txetmodnar
61      86       NA
63      84       srebmunmodnarVALUE2txetmodnar
56      79       srebmunmodnarVALUE2txetmodnar

..and the data after filtering:

weight | height | Z
---------------------------
62      100      NA
65      89       NA
59      88       randomnumbersVALUE1randomtext
66      92       NA
64      90       NA
64      87       randomnumbersVALUE2randomtext
57      84       NA
59      82       NA
60      87       srebmunmodnarVALUE1txetmodnar
61      86       NA
63      84       srebmunmodnarVALUE2txetmodnar

Basically afterwards there would always be an even number of rows with "VALUE1" and "VALUE2" after the filtering. How could I do this in R?

EDIT: Added a dput example of an actual data below!

df <- structure(list(time = c(2.407351, 2.509896, 2.611125, 2.713461, 
                              2.817164, 2.919744, 3.021705, 3.123486, 3.224377, 3.328928, 3.429269, 
                              3.53005, 3.633245, 3.733464, 3.833739, 3.935982, 4.040218, 4.140593, 
                              4.240596, 4.341586, 4.442056, 4.545733, 4.647588, 4.749775, 4.850202, 
                              4.954296, 5.057959, 5.158889, 5.261604, 5.361756, 5.464473, 5.567976, 
                              5.668781, 5.769724, 5.873468, 5.976534, 6.079786, 6.181607, 6.283529, 
                              6.387882, 6.490218, 6.593208, 6.695848, 6.797832, 6.899988, 7.002706, 
                              7.10434, 7.205279, 7.308736, 7.41244, 7.515162, 7.617599, 7.719701, 
                              7.822705, 7.923404, 8.02708, 8.127296, 8.229541, 8.332182, 8.434361, 
                              8.534611, 8.635554, 8.738974, 8.839022, 8.940971, 9.041932, 9.145604, 
                              9.245739, 9.346298, 9.447318, 9.548908, 9.651034, 9.751712, 9.852877, 
                              9.954411, 10.057554, 10.158954, 10.260013, 10.361611, 10.465738
), speed = c(-0.072754, -0.08009, -0.087517, 
             -0.09302, -0.096584, -0.100493, -0.102846, -0.103459, -0.105445, 
             -0.107495, -0.109174, -0.112349, -0.117944, -0.125746, -0.133584, 
             -0.141625, -0.151666, -0.160364, -0.167835, -0.177867, -0.185668, 
             -0.194588, -0.201844, -0.20854, -0.215367, -0.220329, -0.22669, 
             -0.230995, -0.23234, -0.234036, -0.234469, -0.232677, -0.231263, 
             -0.227353, -0.22052, -0.213042, -0.202084, -0.191173, -0.176828, 
             -0.158778, -0.138877, -0.116335, -0.092598, -0.065483, -0.038687, 
             -0.010695, 0.016711, 0.044115, 0.072605, 0.099908, 0.127516, 
             0.154357, 0.179351, 0.204669, 0.228667, 0.25158, 0.272296, 0.291927, 
             0.309133, 0.322194, 0.332866, 0.340746, 0.347923, 0.352051, 0.356205, 
             0.360027, 0.367622, 0.373352, 0.383057, 0.395343, 0.409964, 0.429023, 
             0.448791, 0.471514, 0.494944, 0.519479, 0.545365, 0.572908, 0.599505, 
             0.629952), Z = c("{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":2.32564,\"dist\":0.00353785}", 
                              "", "", "", "", "", "", "", "", "", "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":3.80562,\"dist\":0.00674181}", 
                              "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":4.30772,\"dist\":0.00873242}", 
                              "", "", "", "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":5.1461,\"dist\":0.0137308}", 
                              "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":5.64915,\"dist\":0.0178199}", 
                              "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":5.96298,\"dist\":0.020795}", 
                              "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":6.37879,\"dist\":0.0252221}", 
                              "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":6.87991,\"dist\":0.0312599}", 
                              "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":7.39267,\"dist\":0.038172}", 
                              "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":7.89454,\"dist\":0.0454437},{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":7.91958,\"dist\":0.0458214}", 
                              "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":8.42293,\"dist\":0.0536579}", 
                              "", "", "", "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":9.2023,\"dist\":0.0661186}", 
                              "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":9.70275,\"dist\":0.0742744}", 
                              "", "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":10.356,\"dist\":0.0850383}", 
                              "")), row.names = c(NA, 80L), class = "data.frame")

For some reason with the script I only get rows with either VALUE1 or VALUE2 and the rows with "NA" are filtered out?

Upvotes: 0

Views: 689

Answers (1)

Andy Baxter
Andy Baxter

Reputation: 7626

This might be a roundabout way of doing it but should work. It extracts the value to test for, fills down to 'group' all following rows by that value, then notes which one have a value in Z which matches the value inherited from above and filters out these rows:

library(tidyverse)

df <- tribble(~weight,  ~height,   ~Z,
               62,       100,       NA,
               65,       89 ,       NA,
               59,       88 ,       "randomnumbersVALUE1randomtext",
               66,       92 ,       NA,
               64,       90 ,       NA,
               64,       87 ,       "randomnumbersVALUE2randomtext",
               57,       84 ,       NA,
               68,       99 ,       "randomnumbersVALUE2randomtext",
               59,       82 ,       NA,
               60,       87 ,       "srebmunmodnarVALUE1txetmodnar",
               61,       86 ,       NA,
               63,       84 ,       "srebmunmodnarVALUE2txetmodnar",
               56,       79 ,       "srebmunmodnarVALUE2txetmodnar")

df |> 
  mutate(value = str_extract(Z, "VALUE\\d")) |> 
  fill(value, .direction = "down") |> 
  mutate(new = value != lag(value)) |> 
  filter(new | is.na(Z) | is.na(new)) |> 
  select(-value, -new)

#> # A tibble: 11 × 3
#>    weight height Z                            
#>     <dbl>  <dbl> <chr>                        
#>  1     62    100 <NA>                         
#>  2     65     89 <NA>                         
#>  3     59     88 randomnumbersVALUE1randomtext
#>  4     66     92 <NA>                         
#>  5     64     90 <NA>                         
#>  6     64     87 randomnumbersVALUE2randomtext
#>  7     57     84 <NA>                         
#>  8     59     82 <NA>                         
#>  9     60     87 srebmunmodnarVALUE1txetmodnar
#> 10     61     86 <NA>                         
#> 11     63     84 srebmunmodnarVALUE2txetmodnar

Created on 2022-05-06 by the reprex package (v2.0.1)

Upvotes: 2

Related Questions