Reputation: 249
I have a df below:
d<-structure(list(ReviewType= c("Review","Review","Review","Correction","Correction","Review","Review","Review","Review","Review","Correction","Correction","Deficiency","Correction","Correction", "Correction", "Deficiency", "Deficiency", "Correction"), Submissiondate= c("2020-08-29 04:32:00","2020-08-28 04:31:00","2020-08-26 04:31:00","2020-08-25 04:31:00","2020-08-24 04:31:00","2020-08-23 04:31:00","2020-08-22 04:31:00","2020-08-21 04:31:00","2020-08-20 04:31:00","2020-08-19 04:31:00", "2020-09-27 04:31:00","2020-09-27 03:52:59","2020-09-28 17:30:00","2020-09-29 14:01:00", "2020-09-05 03:00:00","2020-09-05 03:51:00", "2020-09-03 23:59:49", "2020-09-02 00:03:54","2020-09-01 00:04:48" ), CaseNo= c("124","123","125","121","121","125","123","123","123","123","123","123","123","125","123","123","123","124","123")), class = "data.frame", row.names = c(NA, -19L))
d<-d%>%arrange(CaseNo,Submissiondate)
I have a code like the one shown below:
d <- d %>% mutate(Submissiondate = as.Date(Submissiondate),
weekday = wday(Submissiondate),
week.end = Submissiondate + 7 - weekday)
#1 End state for each case and week
EndStates <- d %>% group_by(CaseNo, week.end) %>% summarize(WeekEndState = last(ReviewType)) %>% ungroup()
#2 Get unique values of Case/Week.end/ReviewType
chk <- d %>%
select(CaseNo, week.end, ReviewType) %>%
distinct()
#3 Add the EndStates and count if the week had a correction AND the week ended as a Correction
chk <- left_join(chk, EndStates, by = c("CaseNo", "week.end"))
chk %>%
group_by(week.end, .drop = FALSE) %>%
filter(WeekEndState == "Correction", ReviewType == "Correction") %>%
summarize(n = n()) %>%
ungroup()
This code is trying to see for each case number, what was its last status each week and only filtering for correction
The output for the code is shown below:
week.end n
<date> <int>
1 2020-08-22 0
2 2020-08-29 1
3 2020-09-05 1
4 2020-10-03 1
However, i want to tweak this in a way that as long as the status for a case stayed at correction, it should be counted in the stats for all the subsequent weeks so in my df, i want the below output:
week.end n
<date> <int>
1 2020-08-22 0
2 2020-08-29 1
3 2020-09-05 2
4 2020-10-03 2
It's 2 because case 121 stayed as correction and there was no subsequent update. Is this possible?
Edit: A new df
d<-structure(list(ReviewType= c("Review","Review","Review","Correction","Correction","Review","Review","Review","Review","Review","Correction","Correction","Deficiency","Correction","Correction",
"Correction", "Deficiency", "Deficiency", "Correction","Correction","Deficiency","Correction"),
Submissiondate= c("2020-08-29 04:32:00","2020-08-28 04:31:00","2020-08-26 04:31:00","2020-08-25 04:31:00","2020-08-24 04:31:00","2020-08-23 04:31:00","2020-08-22 04:31:00","2020-08-21 04:31:00","2020-08-20 04:31:00","2020-08-19 04:31:00",
"2020-09-27 04:31:00","2020-09-27 03:52:59","2020-09-28 17:30:00","2020-09-29 14:01:00",
"2020-09-05 03:00:00","2020-09-05 03:51:00", "2020-09-03 23:59:49",
"2020-09-02 00:03:54","2020-09-01 00:04:48","2020-10-01 04:31:00","2020-10-11 04:31:00","2020-10-21 04:31:00"),
CaseNo= c("124","123","125","121","121","125","123","123","123","123","123","123","123","125","123","123","123","124","123","127","127","127")), class = "data.frame", row.names = c(NA, -22L))
d<-d%>%arrange(CaseNo,Submissiondate)
Upvotes: 0
Views: 104
Reputation: 68
The following code do what you need:
# create a data frame with first week.end Date that a correction appears for
# a CaseNo, and the day this correction is removed
correction_df <- as.data.frame(matrix(ncol=3, nrow=length(unique(chk$CaseNo))))
names(correction_df) <- c("CaseNo", "start", "finish")
for (i in 1:length(unique(chk$CaseNo))){
n = unique(chk$CaseNo)[i]
tmp = chk[chk$CaseNo==n,]
correction_start = min(tmp$week.end[tmp$WeekEndState == "Correction"])
correction_finish = min(tmp$week.end[tmp$WeekEndState != "Correction" &
tmp$week.end > correction_start])
correction_df[i, ] <- c(n, correction_start,
correction_finish)
}
# count the number of correction cases "active" in each date in the data frame
asw = as.data.frame(matrix(ncol=2, nrow=length(unique(chk$week.end))))
names(asw) <- c("week.end", "n")
asw$week.end <- sort(unique(chk$week.end))
for (i in 1:length(unique(chk$week.end))){
aDate = sort(unique(chk$week.end))[i]
# n is the number of new correction cases in the week.end, minus the
# number of cases that changed their status from correction to something else
n = sum(as.Date(as.numeric(correction_df$start),
origin="1970-01-01")==aDate, na.rm=T) -
sum(as.Date(as.numeric(correction_df$finish),
origin="1970-01-01")==aDate, na.rm=T)
# finally, sum the accumulated cases with n
asw$n[i] <- ifelse(i == 1, n, n+asw$n[i-1])
}
asw
Edit: another solution that considers the problem pointed out in the comment
# a data frame with dummies indicating if for a given week.end, the case
# refered in the col name ended as "Correction"
cor_df <- as.data.frame(matrix(ncol=length(unique(chk$CaseNo))+1,
nrow=length(unique(chk$week.end))))
names(cor_df) <- c("week.end", unique(chk$CaseNo))
cor_df$week.end <- unique(chk$week.end)
for(i in 1:nrow(cor_df)){
for(j in 2:ncol(cor_df)){
this_CaseNo_idx = chk$CaseNo == strtoi(names(cor_df)[j])
idx = this_CaseNo_idx &
chk$week.end == max(chk$week.end[chk$week.end <= cor_df$week.end[i] &
this_CaseNo_idx])
if (sum(idx) < 1){
cor_df[i, j] = 0
}
else{
# The effect of the "max" here is that for CaseNo with duplicated
# records in the same day, it will always consider as it ended as
# "correction". To avoid this, change the code so only the latest
# record appear for each week.end in the chk data.frame
cor_df[i, j] = max(ifelse(chk$WeekEndState[idx] == "Correction" &
chk$ReviewType[idx] == "Correction",
1, 0))
}
}
}
cor_df$asw <- rowSums(cor_df[,2:ncol(cor_df)])
cor_df <- cor_df[order(cor_df$week.end),]
data.frame(week.end=cor_df$week.end, cor_df$asw)
Upvotes: 1