Can I tweak this code to get what I want?

Question

I have a df below:

d<-structure(list(ReviewType= c("Review","Review","Review","Correction","Correction","Review","Review","Review","Review","Review","Correction","Correction","Deficiency","Correction","Correction", "Correction", "Deficiency", "Deficiency", "Correction"), Submissiondate= c("2020-08-29 04:32:00","2020-08-28 04:31:00","2020-08-26 04:31:00","2020-08-25 04:31:00","2020-08-24 04:31:00","2020-08-23 04:31:00","2020-08-22 04:31:00","2020-08-21 04:31:00","2020-08-20 04:31:00","2020-08-19 04:31:00", "2020-09-27 04:31:00","2020-09-27 03:52:59","2020-09-28 17:30:00","2020-09-29 14:01:00", "2020-09-05 03:00:00","2020-09-05 03:51:00", "2020-09-03 23:59:49", "2020-09-02 00:03:54","2020-09-01 00:04:48" ), CaseNo= c("124","123","125","121","121","125","123","123","123","123","123","123","123","125","123","123","123","124","123")), class = "data.frame", row.names = c(NA, -19L))

d<-d%>%arrange(CaseNo,Submissiondate)

I have a code like the one shown below:

d <- d %>% mutate(Submissiondate = as.Date(Submissiondate), 
                  weekday = wday(Submissiondate), 
                  week.end = Submissiondate + 7 - weekday) 


#1 End state for each case and week
EndStates <- d %>% group_by(CaseNo, week.end) %>% summarize(WeekEndState = last(ReviewType)) %>% ungroup()

#2 Get unique values of Case/Week.end/ReviewType
chk <- d %>%
  select(CaseNo, week.end, ReviewType) %>%
  distinct()

#3 Add the EndStates and count if the week had a correction AND the week ended as a Correction
chk <- left_join(chk, EndStates, by = c("CaseNo", "week.end"))
chk %>% 
  group_by(week.end, .drop = FALSE) %>%
  filter(WeekEndState == "Correction", ReviewType == "Correction") %>%
  summarize(n = n()) %>%
  ungroup()

This code is trying to see for each case number, what was its last status each week and only filtering for correction

The output for the code is shown below:

week.end       n
       
1 2020-08-22     0
2 2020-08-29     1
3 2020-09-05     1
4 2020-10-03     1

However, i want to tweak this in a way that as long as the status for a case stayed at correction, it should be counted in the stats for all the subsequent weeks so in my df, i want the below output:

week.end       n
       
1 2020-08-22     0
2 2020-08-29     1
3 2020-09-05     2
4 2020-10-03     2

It's 2 because case 121 stayed as correction and there was no subsequent update. Is this possible?

Edit: A new df


d<-structure(list(ReviewType= c("Review","Review","Review","Correction","Correction","Review","Review","Review","Review","Review","Correction","Correction","Deficiency","Correction","Correction", 
                                "Correction", "Deficiency", "Deficiency", "Correction","Correction","Deficiency","Correction"),
                  Submissiondate= c("2020-08-29 04:32:00","2020-08-28 04:31:00","2020-08-26 04:31:00","2020-08-25 04:31:00","2020-08-24 04:31:00","2020-08-23 04:31:00","2020-08-22 04:31:00","2020-08-21 04:31:00","2020-08-20 04:31:00","2020-08-19 04:31:00",
                                    "2020-09-27 04:31:00","2020-09-27 03:52:59","2020-09-28 17:30:00","2020-09-29 14:01:00",
                                    "2020-09-05 03:00:00","2020-09-05 03:51:00", "2020-09-03 23:59:49",
                                    "2020-09-02 00:03:54","2020-09-01 00:04:48","2020-10-01 04:31:00","2020-10-11 04:31:00","2020-10-21 04:31:00"),
                  CaseNo= c("124","123","125","121","121","125","123","123","123","123","123","123","123","125","123","123","123","124","123","127","127","127")), class = "data.frame", row.names = c(NA, -22L))


d<-d%>%arrange(CaseNo,Submissiondate)

Shinji FF · Accepted Answer

The following code do what you need:

# create a data frame with first week.end Date that a correction appears for 
# a CaseNo, and the day this correction is removed
correction_df <- as.data.frame(matrix(ncol=3, nrow=length(unique(chk$CaseNo))))
names(correction_df) <- c("CaseNo", "start", "finish")
for (i in 1:length(unique(chk$CaseNo))){
      n = unique(chk$CaseNo)[i]
      tmp = chk[chk$CaseNo==n,]
      correction_start = min(tmp$week.end[tmp$WeekEndState == "Correction"])
      correction_finish = min(tmp$week.end[tmp$WeekEndState != "Correction" &
                                                 tmp$week.end > correction_start])
      correction_df[i, ] <- c(n, correction_start,
                              correction_finish)
}

# count the number of correction cases "active" in each date in the data frame
asw = as.data.frame(matrix(ncol=2, nrow=length(unique(chk$week.end))))
names(asw) <- c("week.end", "n")
asw$week.end <- sort(unique(chk$week.end))
for (i in 1:length(unique(chk$week.end))){
      aDate = sort(unique(chk$week.end))[i]
      # n is the number of new correction cases in the week.end, minus the 
      # number of cases that changed their status from correction to something else
      n = sum(as.Date(as.numeric(correction_df$start),
                      origin="1970-01-01")==aDate, na.rm=T) - 
            sum(as.Date(as.numeric(correction_df$finish),
                        origin="1970-01-01")==aDate, na.rm=T)
      # finally, sum the accumulated cases with n
      asw$n[i] <- ifelse(i == 1, n, n+asw$n[i-1])
}
asw

Edit: another solution that considers the problem pointed out in the comment

# a data frame with dummies indicating if for a given week.end, the case
# refered in the col name ended as "Correction"
cor_df <- as.data.frame(matrix(ncol=length(unique(chk$CaseNo))+1, 
                               nrow=length(unique(chk$week.end))))
names(cor_df) <- c("week.end", unique(chk$CaseNo))
cor_df$week.end <- unique(chk$week.end)
for(i in 1:nrow(cor_df)){
      for(j in 2:ncol(cor_df)){
            this_CaseNo_idx = chk$CaseNo == strtoi(names(cor_df)[j])
            idx =  this_CaseNo_idx & 
                  chk$week.end == max(chk$week.end[chk$week.end <= cor_df$week.end[i] &
                                                         this_CaseNo_idx])
            if (sum(idx) < 1){
                  cor_df[i, j] = 0
            }
            else{
                  # The effect of the "max" here is that for CaseNo with duplicated
                  # records in the same day, it will always consider as it ended as
                  # "correction". To avoid this, change the code so only the latest
                  # record appear for each week.end in the chk data.frame
                  cor_df[i, j] = max(ifelse(chk$WeekEndState[idx] == "Correction" & 
                                              chk$ReviewType[idx] == "Correction", 
                                            1, 0))
            }
            
      }
}
cor_df$asw <- rowSums(cor_df[,2:ncol(cor_df)])
cor_df <- cor_df[order(cor_df$week.end),]
data.frame(week.end=cor_df$week.end, cor_df$asw)

Can I tweak this code to get what I want?

Answers (1)

Related Questions