Reputation: 592
I have a time series as a data frame as follows:
date country value id
1/1/2020 A .2 Cv
1/2/2020 A 0 Cv
1/3/2020 A 0 Cv
..... ... .... ...
2/10/2020 A 2 ...
2/11/2020 A 0 Cv
3/11/2020. A 0 Cv
4/11/2020 A 0 Cv
5/11/2020 A 3 Cv
6/11/2020 A 4 Cv
7/11/2020 A 6 Cv
8/11/2020 A 7 Cv
I want to remove all values before the last sequence of zero that we have in the data frame: I tried following code:
test <- ddply(df,.(id),function(x){
temp_country_data <- ddply(x, .(country),function(y){
temp_data <- data.frame(y) %>% arrange(date) %>% group_by("id","country")
dat<-temp_data
ToRemove <- apply(dat, 2, function(colmn) {
row.zeros <- which(colmn == 0) # rows with zeros
if(length(row.zeros) > 0) { # if we found any
# which of them is the last double
last.doubles <- max(which(diff(row.zeros) == 1))
leftof.last.doubles <- "if"(length(last.doubles) > 0, # if double exists1:(row.zeros[last.doubles]-1), # all rows before
NULL) # else nothing
# remove rows with single zeros and all rows before double consecutive
unique(c(row.zeros, leftof.last.doubles)) }
temp_data<-dat[-unlist(ToRemove),]
temp_data = temp_data[,c("date", "id", "country", "value")]
temp_data
}, .parallel = T)
temp_country_data
}, .progress = 'text')
However it removes only zero values which i do not want. I want that output be like as follows: Also i want to interval 2 days after the final sequence of zeros:
7/11/2020 A 6 Cv
8/11/2020 A 7 Cv
......
Also I tried this but still I am not getting the result:
test3 <- ddply(df,.(id),function(x){
temp_country_data <- ddply(x, .(country),function(y){
temp_data <- data.frame(y) %>% arrange(date) %>% group_by("id","country")
temp_data<- temp_data%>% mutate(flag_0 = ifelse(value == 0,1,0),flag_0_cum = cumsum(flag_0)) %>%
filter(flag_0_cum == max(flag_0_cum)) %>%
filter(round(value,3) != 0) %>%
select(-flag_0 , -flag_0_cum) %>%
slice(3:n())
temp_data = temp_data[,c("date", "short_id", "country", "raw_de")]
library(lubridate)
temp_data <- temp_data %>%
group_by(country, id) %>%
mutate(DATE = ymd(date),
day_flag = if_else(DATE == (lag(DATE) + days(1)), 1, 0))
temp_data<- temp_data %>% filter(!is.na(day_flag))
temp_data<- temp_data %>%
mutate(flag_0 = ifelse(day_flag == 0,1,0),
flag_0_cum = cumsum(flag_0)) %>%
filter(flag_0_cum == max(flag_0_cum)) %>%
filter(day_flag != 0) %>%
select(-flag_0 , -flag_0_cum) %>%
slice(3:n())
temp_data
}, .parallel = T)
temp_country_data
}, .progress = 'text')
I have added another column into my dataframe to flag the consecutive rows as 1 and not consecutive as zero .
could you please let me know where is the problem.
Upvotes: 0
Views: 271
Reputation: 1724
One way would be to label the occurrence of zeros with a cumulative sum (each time a new zero occurs the label is increased by one) and keep only the last group which includes the last zero and all subsequent non-zero values. Then we can remove the zero itself and the first 2 rows:
library(dplyr)
df <- data.frame(date = seq.Date(from = as.Date("2020-11-01"),to = as.Date("2020-11-08"), by = "day"),
country = "A",
value = c(2,0,0,0,3,4,6,7),
id = "Cv")
df %>%
mutate(flag_0 = ifelse(round(value, 4) == 0, 1, 0),
flag_0_cum = cumsum(flag_0)) %>%
filter(flag_0_cum == max(flag_0_cum)) %>%
filter(round(value, 4) != 0) %>%
select(-flag_0 , -flag_0_cum) %>%
slice(3:n())
Upvotes: 1
Reputation: 73512
You could use diff
to identify zeroes and cumsum
to create a new index ix
. Then exclude zero rows and apply head
with value 2
in a by
to get every first two rows after a zero. Finally rbind
.
DF1 <- as.data.frame(cbind(DF, ix=cumsum(c(-1, diff(DF$value == 0)) %in% 1)))
DF1 <- DF1[DF1$value != 0, ]
res <- do.call(rbind, by(DF1, DF1$ix, head, 2))
head(res)
# date country value id ix
# 0 2020-01-01 A 3.0 Cv 0
# 1.3 2020-01-03 A 3.0 Cv 1
# 1.4 2020-01-04 A 2.0 Cv 1
# 2.6 2020-01-06 A 3.0 Cv 2
# 2.7 2020-01-07 A 0.2 Cv 2
# 3 2020-01-10 A 4.0 Cv 3
Data:
DF <- structure(list(date = structure(c(18262, 18263, 18264, 18265,
18266, 18267, 18268, 18269, 18270, 18271, 18272, 18273, 18274,
18275, 18276, 18277, 18278, 18279, 18280, 18281, 18282, 18283,
18284, 18285, 18286, 18287, 18288, 18289, 18290, 18291, 18292,
18293, 18294, 18295, 18296, 18297, 18298, 18299, 18300, 18301,
18302, 18303, 18304, 18305, 18306, 18307, 18308, 18309, 18310,
18311, 18312, 18313, 18314, 18315, 18316, 18317, 18318, 18319,
18320, 18321, 18322, 18323, 18324, 18325, 18326, 18327, 18328,
18329, 18330, 18331, 18332, 18333, 18334, 18335, 18336, 18337,
18338, 18339, 18340, 18341, 18342, 18343, 18344, 18345, 18346,
18347, 18348, 18349, 18350, 18351, 18352, 18353, 18354, 18355,
18356, 18357, 18358, 18359, 18360, 18361, 18362, 18363, 18364,
18365, 18366, 18367, 18368, 18369, 18370, 18371, 18372, 18373,
18374, 18375, 18376, 18377, 18378, 18379, 18380, 18381, 18382,
18383, 18384, 18385, 18386, 18387, 18388, 18389, 18390, 18391,
18392, 18393, 18394, 18395, 18396, 18397, 18398, 18399, 18400,
18401, 18402, 18403, 18404, 18405, 18406, 18407, 18408, 18409,
18410, 18411, 18412, 18413, 18414, 18415, 18416, 18417, 18418,
18419, 18420, 18421, 18422, 18423, 18424, 18425, 18426, 18427,
18428, 18429, 18430, 18431, 18432, 18433, 18434, 18435, 18436,
18437, 18438, 18439, 18440, 18441, 18442, 18443, 18444, 18445,
18446, 18447, 18448, 18449, 18450, 18451, 18452, 18453, 18454,
18455, 18456, 18457, 18458, 18459, 18460, 18461, 18462, 18463,
18464, 18465, 18466, 18467, 18468, 18469, 18470, 18471, 18472,
18473, 18474, 18475, 18476, 18477, 18478, 18479, 18480, 18481,
18482, 18483, 18484, 18485, 18486, 18487, 18488, 18489, 18490,
18491, 18492, 18493, 18494, 18495, 18496, 18497, 18498, 18499,
18500, 18501, 18502, 18503, 18504, 18505, 18506, 18507, 18508,
18509, 18510, 18511, 18512, 18513, 18514, 18515, 18516, 18517,
18518, 18519, 18520, 18521, 18522, 18523, 18524, 18525, 18526,
18527, 18528, 18529, 18530, 18531, 18532, 18533, 18534, 18535,
18536, 18537, 18538, 18539, 18540, 18541, 18542, 18543, 18544,
18545, 18546, 18547, 18548, 18549, 18550, 18551, 18552, 18553,
18554, 18555, 18556, 18557, 18558, 18559, 18560, 18561, 18562,
18563, 18564, 18565, 18566, 18567, 18568, 18569, 18570, 18571,
18572), class = "Date"), country = c("A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A"), value = c(3, 0, 3, 2, 0,
3, 0.2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 2, 3, 0, 0.2, 0.2,
3, 2, 0.2, 0, 2, 0.2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0.2, 0,
0.2, 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 2, 3, 0.2, 0.2,
4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0.2, 2, 0, 2, 0, 0, 2, 0,
0, 2, 4, 0, 2, 3, 0.2, 0, 0.2, 0, 0, 0, 4, 2, 0, 0, 2, 0.2, 3,
0, 0, 3, 0, 0, 3, 4, 3, 0.2, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0,
2, 0.2, 4, 4, 2, 0.2, 0, 3, 0.2, 0.2, 0, 0, 0, 2, 4, 0.2, 0,
4, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0.2, 0, 3, 0, 0.2, 0,
0.2, 0, 0, 0.2, 0, 3, 4, 0.2, 0, 0, 0, 0, 0.2, 0, 0, 0, 4, 0,
0, 0, 0, 0, 2, 2, 0, 0.2, 0, 0, 4, 0, 0, 0, 2, 2, 3, 0.2, 3,
2, 0, 0, 4, 0, 0, 0.2, 0.2, 4, 0, 2, 4, 3, 0, 2, 4, 2, 2, 0,
0, 0, 0, 0, 0.2, 0, 0, 0, 2, 0, 0, 4, 0, 0, 2, 0.2, 0, 0.2, 0,
2, 0, 0, 0, 2, 0, 0, 3, 3, 0, 0, 0.2, 0, 0, 0.2, 0, 0, 3, 0,
0.2, 0, 3, 0, 0, 0, 0, 0, 0, 3, 4, 3, 0, 2, 0, 0, 0, 0, 3, 4,
3, 0, 0.2, 0, 4, 4, 0, 4, 0, 3, 0, 2, 0, 0, 0, 0, 0.2, 0, 3,
3, 3, 4, 0, 0, 0, 4, 3, 0, 4, 0, 0, 0.2, 0, 0, 0.2, 0, 3, 3,
0, 0), id = c("Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv", "Cv",
"Cv", "Cv", "Cv", "Cv", "Cv", "Cv")), class = "data.frame", row.names = c(NA,
-311L))
Upvotes: 0