user
user

Reputation: 592

Remove rows before consecutive zero values in time series Using R

I have a time series as a data frame as follows:

date          country      value       id
1/1/2020       A            .2         Cv
1/2/2020       A             0         Cv
1/3/2020       A             0         Cv 
.....         ...           ....       ...
2/10/2020      A              2        ...
2/11/2020      A              0        Cv
3/11/2020.     A              0        Cv
4/11/2020      A              0        Cv
5/11/2020      A              3        Cv
6/11/2020      A              4        Cv
7/11/2020      A              6        Cv
8/11/2020      A              7        Cv

I want to remove all values before the last sequence of zero that we have in the data frame: I tried following code:

test <- ddply(df,.(id),function(x){
  temp_country_data <- ddply(x, .(country),function(y){
    temp_data <- data.frame(y) %>% arrange(date) %>% group_by("id","country") 
dat<-temp_data
    ToRemove <- apply(dat, 2, function(colmn) {
      row.zeros <- which(colmn == 0) # rows with zeros
      if(length(row.zeros) > 0) { # if we found any
        # which of them is the last double
        last.doubles <- max(which(diff(row.zeros) == 1))
        leftof.last.doubles <- "if"(length(last.doubles) > 0, # if double exists1:(row.zeros[last.doubles]-1), # all rows before
                                    NULL) # else nothing
        # remove rows with single zeros and all rows before double consecutive 
        unique(c(row.zeros, leftof.last.doubles)) }
      temp_data<-dat[-unlist(ToRemove),]
    temp_data = temp_data[,c("date", "id", "country", "value")]
    temp_data
  }, .parallel = T)
  temp_country_data
}, .progress = 'text')

However it removes only zero values which i do not want. I want that output be like as follows: Also i want to interval 2 days after the final sequence of zeros:

7/11/2020      A              6        Cv
8/11/2020      A              7        Cv

......

Also I tried this but still I am not getting the result:

test3 <- ddply(df,.(id),function(x){
  temp_country_data <- ddply(x, .(country),function(y){
    temp_data <- data.frame(y) %>% arrange(date) %>% group_by("id","country") 
    temp_data<- temp_data%>% mutate(flag_0 = ifelse(value == 0,1,0),flag_0_cum = cumsum(flag_0)) %>% 
      filter(flag_0_cum == max(flag_0_cum)) %>% 
      filter(round(value,3) != 0) %>% 
      select(-flag_0 , -flag_0_cum) %>%
      slice(3:n())
    temp_data = temp_data[,c("date", "short_id", "country", "raw_de")]
    library(lubridate)
    
    temp_data <- temp_data %>% 
      group_by(country, id) %>%                          
      mutate(DATE = ymd(date), 
             day_flag = if_else(DATE == (lag(DATE) + days(1)), 1, 0))

temp_data<- temp_data %>% filter(!is.na(day_flag))
temp_data<- temp_data %>% 
  mutate(flag_0 = ifelse(day_flag == 0,1,0),
         flag_0_cum = cumsum(flag_0)) %>% 
  filter(flag_0_cum == max(flag_0_cum)) %>% 
  filter(day_flag != 0) %>% 
  select(-flag_0 , -flag_0_cum) %>%
  slice(3:n())

   temp_data
  }, .parallel = T)
  temp_country_data
}, .progress = 'text')

I have added another column into my dataframe to flag the consecutive rows as 1 and not consecutive as zero .

could you please let me know where is the problem.

Upvotes: 0

Views: 271

Answers (2)

pieterbons
pieterbons

Reputation: 1724

One way would be to label the occurrence of zeros with a cumulative sum (each time a new zero occurs the label is increased by one) and keep only the last group which includes the last zero and all subsequent non-zero values. Then we can remove the zero itself and the first 2 rows:

library(dplyr) 

df <- data.frame(date = seq.Date(from = as.Date("2020-11-01"),to = as.Date("2020-11-08"), by = "day"),
                 country = "A",
                 value = c(2,0,0,0,3,4,6,7),
                 id = "Cv")

df %>% 
  mutate(flag_0 = ifelse(round(value, 4) == 0, 1, 0),
         flag_0_cum = cumsum(flag_0)) %>% 
  filter(flag_0_cum == max(flag_0_cum)) %>% 
  filter(round(value, 4) != 0) %>% 
  select(-flag_0 , -flag_0_cum) %>%
  slice(3:n())

Upvotes: 1

jay.sf
jay.sf

Reputation: 73512

You could use diff to identify zeroes and cumsum to create a new index ix. Then exclude zero rows and apply head with value 2 in a by to get every first two rows after a zero. Finally rbind.

DF1 <- as.data.frame(cbind(DF, ix=cumsum(c(-1, diff(DF$value == 0)) %in% 1)))
DF1 <- DF1[DF1$value != 0, ]
res <- do.call(rbind, by(DF1, DF1$ix, head, 2))
head(res)
#           date country value id ix
# 0   2020-01-01       A   3.0 Cv  0
# 1.3 2020-01-03       A   3.0 Cv  1
# 1.4 2020-01-04       A   2.0 Cv  1
# 2.6 2020-01-06       A   3.0 Cv  2
# 2.7 2020-01-07       A   0.2 Cv  2
# 3   2020-01-10       A   4.0 Cv  3

Data:

DF <- structure(list(date = structure(c(18262, 18263, 18264, 18265, 
18266, 18267, 18268, 18269, 18270, 18271, 18272, 18273, 18274, 
18275, 18276, 18277, 18278, 18279, 18280, 18281, 18282, 18283, 
18284, 18285, 18286, 18287, 18288, 18289, 18290, 18291, 18292, 
18293, 18294, 18295, 18296, 18297, 18298, 18299, 18300, 18301, 
18302, 18303, 18304, 18305, 18306, 18307, 18308, 18309, 18310, 
18311, 18312, 18313, 18314, 18315, 18316, 18317, 18318, 18319, 
18320, 18321, 18322, 18323, 18324, 18325, 18326, 18327, 18328, 
18329, 18330, 18331, 18332, 18333, 18334, 18335, 18336, 18337, 
18338, 18339, 18340, 18341, 18342, 18343, 18344, 18345, 18346, 
18347, 18348, 18349, 18350, 18351, 18352, 18353, 18354, 18355, 
18356, 18357, 18358, 18359, 18360, 18361, 18362, 18363, 18364, 
18365, 18366, 18367, 18368, 18369, 18370, 18371, 18372, 18373, 
18374, 18375, 18376, 18377, 18378, 18379, 18380, 18381, 18382, 
18383, 18384, 18385, 18386, 18387, 18388, 18389, 18390, 18391, 
18392, 18393, 18394, 18395, 18396, 18397, 18398, 18399, 18400, 
18401, 18402, 18403, 18404, 18405, 18406, 18407, 18408, 18409, 
18410, 18411, 18412, 18413, 18414, 18415, 18416, 18417, 18418, 
18419, 18420, 18421, 18422, 18423, 18424, 18425, 18426, 18427, 
18428, 18429, 18430, 18431, 18432, 18433, 18434, 18435, 18436, 
18437, 18438, 18439, 18440, 18441, 18442, 18443, 18444, 18445, 
18446, 18447, 18448, 18449, 18450, 18451, 18452, 18453, 18454, 
18455, 18456, 18457, 18458, 18459, 18460, 18461, 18462, 18463, 
18464, 18465, 18466, 18467, 18468, 18469, 18470, 18471, 18472, 
18473, 18474, 18475, 18476, 18477, 18478, 18479, 18480, 18481, 
18482, 18483, 18484, 18485, 18486, 18487, 18488, 18489, 18490, 
18491, 18492, 18493, 18494, 18495, 18496, 18497, 18498, 18499, 
18500, 18501, 18502, 18503, 18504, 18505, 18506, 18507, 18508, 
18509, 18510, 18511, 18512, 18513, 18514, 18515, 18516, 18517, 
18518, 18519, 18520, 18521, 18522, 18523, 18524, 18525, 18526, 
18527, 18528, 18529, 18530, 18531, 18532, 18533, 18534, 18535, 
18536, 18537, 18538, 18539, 18540, 18541, 18542, 18543, 18544, 
18545, 18546, 18547, 18548, 18549, 18550, 18551, 18552, 18553, 
18554, 18555, 18556, 18557, 18558, 18559, 18560, 18561, 18562, 
18563, 18564, 18565, 18566, 18567, 18568, 18569, 18570, 18571, 
18572), class = "Date"), country = c("A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A"), value = c(3, 0, 3, 2, 0, 
3, 0.2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 2, 3, 0, 0.2, 0.2, 
3, 2, 0.2, 0, 2, 0.2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0.2, 0, 
0.2, 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 2, 3, 0.2, 0.2, 
4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0.2, 2, 0, 2, 0, 0, 2, 0, 
0, 2, 4, 0, 2, 3, 0.2, 0, 0.2, 0, 0, 0, 4, 2, 0, 0, 2, 0.2, 3, 
0, 0, 3, 0, 0, 3, 4, 3, 0.2, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 
2, 0.2, 4, 4, 2, 0.2, 0, 3, 0.2, 0.2, 0, 0, 0, 2, 4, 0.2, 0, 
4, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0.2, 0, 3, 0, 0.2, 0, 
0.2, 0, 0, 0.2, 0, 3, 4, 0.2, 0, 0, 0, 0, 0.2, 0, 0, 0, 4, 0, 
0, 0, 0, 0, 2, 2, 0, 0.2, 0, 0, 4, 0, 0, 0, 2, 2, 3, 0.2, 3, 
2, 0, 0, 4, 0, 0, 0.2, 0.2, 4, 0, 2, 4, 3, 0, 2, 4, 2, 2, 0, 
0, 0, 0, 0, 0.2, 0, 0, 0, 2, 0, 0, 4, 0, 0, 2, 0.2, 0, 0.2, 0, 
2, 0, 0, 0, 2, 0, 0, 3, 3, 0, 0, 0.2, 0, 0, 0.2, 0, 0, 3, 0, 
0.2, 0, 3, 0, 0, 0, 0, 0, 0, 3, 4, 3, 0, 2, 0, 0, 0, 0, 3, 4, 
3, 0, 0.2, 0, 4, 4, 0, 4, 0, 3, 0, 2, 0, 0, 0, 0, 0.2, 0, 3, 
3, 3, 4, 0, 0, 0, 4, 3, 0, 4, 0, 0, 0.2, 0, 0, 0.2, 0, 3, 3, 
0, 0), id = c("Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", 
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv")), class = "data.frame", row.names = c(NA, 
-311L))

Upvotes: 0

Related Questions