Reputation: 323
I am trying to remove the zeros that come after the first zero of a trailing run, i.e. when all later values are 0. Eventually I would love to do this per group (`group_by(species)`), but baby steps. Here's an example:
# Sample
library(tidyverse)
# Build the three sample columns, then combine them into the example data frame.
id <- letters[1:10]
time <- as.numeric(1:10)
value <- c(90, 50, 40, 0, 30, 30, 0, 10, 0, 0)
df <- data.frame(id = id, time = time, value = value)
df
id time value
1 a 1 90
2 b 2 50
3 c 3 40
4 d 4 0
5 e 5 30
6 f 6 30
7 g 7 0
8 h 8 10
9 i 9 0
10 j 10 0
I would like to see observation id "j" and only observation id "j" removed. I am not even sure where to start. Any suggestions are much appreciated!
Upvotes: 0
Views: 138
Reputation: 27732
Tidyverse solution that also works with groups
Based on the sample data (without grouping) the code can be shortened, but this version is very readable ;-)
df %>%
  # arrange by id
  arrange(id) %>%
  # no grouping variable in the sample data, so don't use group_by() here
  # group_by(group) %>%
  # helper columns: position of each row, and position of the last non-zero
  # value. max(..., 0L) yields 0 when every value is zero, which avoids the
  # -Inf (and warning) that max() returns for an empty which() result.
  mutate(
    pos_in_group = row_number(),
    pos_last_not_zero = max(which(value != 0), 0L)
  ) %>%
  # keep every row up to the last non-zero value, plus the single zero that
  # directly follows it; any later rows are redundant trailing zeros
  filter(pos_in_group <= pos_last_not_zero + 1) %>%
  # throw away the helper columns
  select(-c(pos_in_group, pos_last_not_zero))
# id time value
# 1 a 1 90
# 2 b 2 50
# 3 c 3 40
# 4 d 4 0
# 5 e 5 30
# 6 f 6 30
# 7 g 7 0
# 8 h 8 10
# 9 i 9 0
Example with some grouping involved
# Sample
library(tidyverse)
# Sample data with a grouping column: six rows in group 1, five in group 2.
id <- letters[1:11]
group <- rep(c(1, 2), times = c(6, 5))
time <- as.numeric(1:11)
value <- c(90, 0, 0, 40, 0, 0, 30, 30, 0, 0, 0)
df <- data.frame(id = id, group = group, time = time, value = value)
df
# id group time value
# 1 a 1 1 90
# 2 b 1 2 0
# 3 c 1 3 0
# 4 d 1 4 40
# 5 e 1 5 0
# 6 f 1 6 0
# 7 g 2 7 30
# 8 h 2 8 30
# 9 i 2 9 0
# 10 j 2 10 0
# 11 k 2 11 0
df %>%
  # arrange by id
  arrange(id) %>%
  # group
  group_by(group) %>%
  # helper columns, computed within each group: position of each row, and
  # position of the last non-zero value. max(..., 0L) yields 0 for a group
  # that is all zeros, which avoids the -Inf (and warning) that max() returns
  # for an empty which() result.
  mutate(
    pos_in_group = row_number(),
    pos_last_not_zero = max(which(value != 0), 0L)
  ) %>%
  # keep every row up to the group's last non-zero value, plus the single zero
  # that directly follows it; any later rows are redundant trailing zeros
  filter(pos_in_group <= pos_last_not_zero + 1) %>%
  # throw away the helper columns
  select(-c(pos_in_group, pos_last_not_zero))
# # A tibble: 8 x 4
# # Groups: group [2]
# id group time value
# <fct> <dbl> <dbl> <dbl>
# 1 a 1 1 90
# 2 b 1 2 0
# 3 c 1 3 0
# 4 d 1 4 40
# 5 e 1 5 0
# 6 g 2 7 30
# 7 h 2 8 30
# 8 i 2 9 0
Upvotes: 0
Reputation: 76402
Base R only. It uses `rle` to get the number of trailing zeros, if any, and then subsets the data frame with `head`.
# Run-length encode the "is zero" indicator; the last run describes the tail.
r <- rle(df$value == 0)
n_runs <- length(r$values)
# Trim only when the tail is a run of two or more zeros. With exactly one
# trailing zero, head(df, -0) would return zero rows, so df is left as is.
if (n_runs > 0 && isTRUE(r$values[n_runs]) && r$lengths[n_runs] > 1) {
  head(df, -(r$lengths[n_runs] - 1))
} else {
  df
}
# id time value
#1 a 1 90
#2 b 2 50
#3 c 3 40
#4 d 4 0
#5 e 5 30
#6 f 6 30
#7 g 7 0
#8 h 8 10
#9 i 9 0
You can wrap the code above in a function, and maybe `*apply` it to groups.
#' Drop redundant trailing zeros from a data frame
#'
#' When the column `col` ends in a run of two or more zeros, the first zero of
#' that run is kept and the rows after it are removed. Otherwise `DF` is
#' returned unchanged (including when it has no rows).
#'
#' @param DF A data frame.
#' @param col Name (or index) of the column to inspect.
#' @return `DF` with the rows after the first trailing zero removed.
trailingZeros <- function(DF, col = "value") {
  x <- DF[[col]]
  if (is.null(x)) {
    stop("column '", col, "' not found in DF", call. = FALSE)
  }
  r <- rle(x == 0)
  last_run <- length(r$values)
  # rle() of an empty column has no runs, so there is nothing to trim
  if (last_run == 0L) {
    return(DF)
  }
  n_trailing <- r$lengths[last_run]
  # isTRUE() treats a trailing NA as "not a zero" instead of passing NA to if()
  if (isTRUE(r$values[last_run]) && n_trailing > 1) {
    head(DF, -(n_trailing - 1))
  } else {
    DF
  }
}
# Apply the helper to the sample data, naming the inspected column explicitly.
trailingZeros(df, col = "value")
Note that this also works with a larger number of trailing zeros.
# Second sample: same as the first, with one more zero at the end.
id2 <- letters[1:11]
time2 <- as.numeric(1:11)
value2 <- c(90, 50, 40, 0, 30, 30, 0, 10, 0, 0, 0)
df2 <- data.frame(id = id2, time = time2, value = value2)
trailingZeros(df2)
Upvotes: 1
Reputation: 11981
Here is a solution within the tidyverse which also works with a larger number of trailing zeros:
df <- tibble(
  id = letters[1:11],
  time = 1:11,
  value = c(90, 50, 40, 0, 30, 30, 0, 10, 0, 0, 0)
)

df %>%
  # Keep every row up to the last non-zero value, plus the single zero that
  # directly follows it. max(..., 0L) gives 0 when every value is zero, so an
  # all-zero column keeps just its first row instead of producing an NA index.
  filter(row_number() <= max(which(value != 0), 0L) + 1L)
Upvotes: 0