Reputation: 133
I have a data frame with multiple similar sequences in which column Z has a string pattern containing "VALUE1" and "VALUE2" (only these two patterns matter) and I'd like to filter out all the rows with consecutive values of "VALUE2" so, that only the first instance of "VALUE2" in the sequence will be left. For example the data before filtering:
weight | height | Z
---------------------------
62 100 NA
65 89 NA
59 88 randomnumbersVALUE1randomtext
66 92 NA
64 90 NA
64 87 randomnumbersVALUE2randomtext
57 84 NA
68 99 randomnumbersVALUE2randomtext
59 82 NA
60 87 srebmunmodnarVALUE1txetmodnar
61 86 NA
63 84 srebmunmodnarVALUE2txetmodnar
56 79 srebmunmodnarVALUE2txetmodnar
..and the data after filtering:
weight | height | Z
---------------------------
62 100 NA
65 89 NA
59 88 randomnumbersVALUE1randomtext
66 92 NA
64 90 NA
64 87 randomnumbersVALUE2randomtext
57 84 NA
59 82 NA
60 87 srebmunmodnarVALUE1txetmodnar
61 86 NA
63 84 srebmunmodnarVALUE2txetmodnar
Basically afterwards there would always be an even number of rows with "VALUE1" and "VALUE2" after the filtering. How could I do this in R?
EDIT: Added a dput example of an actual data below!
df <- structure(list(time = c(2.407351, 2.509896, 2.611125, 2.713461,
2.817164, 2.919744, 3.021705, 3.123486, 3.224377, 3.328928, 3.429269,
3.53005, 3.633245, 3.733464, 3.833739, 3.935982, 4.040218, 4.140593,
4.240596, 4.341586, 4.442056, 4.545733, 4.647588, 4.749775, 4.850202,
4.954296, 5.057959, 5.158889, 5.261604, 5.361756, 5.464473, 5.567976,
5.668781, 5.769724, 5.873468, 5.976534, 6.079786, 6.181607, 6.283529,
6.387882, 6.490218, 6.593208, 6.695848, 6.797832, 6.899988, 7.002706,
7.10434, 7.205279, 7.308736, 7.41244, 7.515162, 7.617599, 7.719701,
7.822705, 7.923404, 8.02708, 8.127296, 8.229541, 8.332182, 8.434361,
8.534611, 8.635554, 8.738974, 8.839022, 8.940971, 9.041932, 9.145604,
9.245739, 9.346298, 9.447318, 9.548908, 9.651034, 9.751712, 9.852877,
9.954411, 10.057554, 10.158954, 10.260013, 10.361611, 10.465738
), speed = c(-0.072754, -0.08009, -0.087517,
-0.09302, -0.096584, -0.100493, -0.102846, -0.103459, -0.105445,
-0.107495, -0.109174, -0.112349, -0.117944, -0.125746, -0.133584,
-0.141625, -0.151666, -0.160364, -0.167835, -0.177867, -0.185668,
-0.194588, -0.201844, -0.20854, -0.215367, -0.220329, -0.22669,
-0.230995, -0.23234, -0.234036, -0.234469, -0.232677, -0.231263,
-0.227353, -0.22052, -0.213042, -0.202084, -0.191173, -0.176828,
-0.158778, -0.138877, -0.116335, -0.092598, -0.065483, -0.038687,
-0.010695, 0.016711, 0.044115, 0.072605, 0.099908, 0.127516,
0.154357, 0.179351, 0.204669, 0.228667, 0.25158, 0.272296, 0.291927,
0.309133, 0.322194, 0.332866, 0.340746, 0.347923, 0.352051, 0.356205,
0.360027, 0.367622, 0.373352, 0.383057, 0.395343, 0.409964, 0.429023,
0.448791, 0.471514, 0.494944, 0.519479, 0.545365, 0.572908, 0.599505,
0.629952), Z = c("{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":2.32564,\"dist\":0.00353785}",
"", "", "", "", "", "", "", "", "", "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":3.80562,\"dist\":0.00674181}",
"", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":4.30772,\"dist\":0.00873242}",
"", "", "", "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":5.1461,\"dist\":0.0137308}",
"", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":5.64915,\"dist\":0.0178199}",
"", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":5.96298,\"dist\":0.020795}",
"", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":6.37879,\"dist\":0.0252221}",
"", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":6.87991,\"dist\":0.0312599}",
"", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":7.39267,\"dist\":0.038172}",
"", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":7.89454,\"dist\":0.0454437},{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":7.91958,\"dist\":0.0458214}",
"", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":8.42293,\"dist\":0.0536579}",
"", "", "", "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":9.2023,\"dist\":0.0661186}",
"", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE1\",\"time\":9.70275,\"dist\":0.0742744}",
"", "", "", "", "", "{\"type\":\"M\",\"msg\":\"VALUE2\",\"time\":10.356,\"dist\":0.0850383}",
"")), row.names = c(NA, 80L), class = "data.frame")
For some reason with the script I only get rows with either VALUE1 or VALUE2 and the rows with "NA" are filtered out?
Upvotes: 0
Views: 689
Reputation: 7626
This might be a roundabout way of doing it but should work. It extracts the value to test for, fills down to 'group' all following rows by that value, then notes which one have a value in Z which matches the value inherited from above and filters out these rows:
library(tidyverse)
df <- tribble(~weight, ~height, ~Z,
62, 100, NA,
65, 89 , NA,
59, 88 , "randomnumbersVALUE1randomtext",
66, 92 , NA,
64, 90 , NA,
64, 87 , "randomnumbersVALUE2randomtext",
57, 84 , NA,
68, 99 , "randomnumbersVALUE2randomtext",
59, 82 , NA,
60, 87 , "srebmunmodnarVALUE1txetmodnar",
61, 86 , NA,
63, 84 , "srebmunmodnarVALUE2txetmodnar",
56, 79 , "srebmunmodnarVALUE2txetmodnar")
df |>
mutate(value = str_extract(Z, "VALUE\\d")) |>
fill(value, .direction = "down") |>
mutate(new = value != lag(value)) |>
filter(new | is.na(Z) | is.na(new)) |>
select(-value, -new)
#> # A tibble: 11 × 3
#> weight height Z
#> <dbl> <dbl> <chr>
#> 1 62 100 <NA>
#> 2 65 89 <NA>
#> 3 59 88 randomnumbersVALUE1randomtext
#> 4 66 92 <NA>
#> 5 64 90 <NA>
#> 6 64 87 randomnumbersVALUE2randomtext
#> 7 57 84 <NA>
#> 8 59 82 <NA>
#> 9 60 87 srebmunmodnarVALUE1txetmodnar
#> 10 61 86 <NA>
#> 11 63 84 srebmunmodnarVALUE2txetmodnar
Created on 2022-05-06 by the reprex package (v2.0.1)
Upvotes: 2