Reputation: 347
I have a subscription data frame looks like the following. There are about 1 million unique IDs. The table lists subscription status. When user began subscription service, status field is indicated by 'Sub' and when user unsubscribed, it's indicated by 'Usub'.
dat <- data.frame(ID=c(rep("A",12),(rep("B",12))), Year="2014", Month=rep(seq(1:12),2), Status=NA)
dat$Status[4] <- "Sub"
dat$Status[8] <- "Usub"
dat$Status[17] <- "Usub"
dat$Status[21] <- "Sub"
ID Year Month Status
A 2014 1
A 2014 2
A 2014 3
A 2014 4 Sub
A 2014 5
A 2014 6
A 2014 7
A 2014 8 Usub
A 2014 9
A 2014 10
A 2014 11
A 2014 12
B 2014 1
B 2014 2
B 2014 3
B 2014 4
B 2014 5 Usub
B 2014 6
B 2014 7
B 2014 8
B 2014 9 Sub
B 2014 10
B 2014 11
B 2014 12
C 2014 1 .
. . . .
. . . .
I am looking to fill in the gap between each status updates. The desired output table would look like the following:
ID Year Month Status
A 2014 1 Usub
A 2014 2 Usub
A 2014 3 Usub
A 2014 4 Sub
A 2014 5 Sub
A 2014 6 Sub
A 2014 7 Sub
A 2014 8 Usub
A 2014 9 Usub
A 2014 10 Usub
A 2014 11 Usub
A 2014 12 Usub
B 2014 1 Sub
B 2014 2 Sub
B 2014 3 Sub
B 2014 4 Sub
B 2014 5 Usub
B 2014 6 Usub
B 2014 7 Usub
B 2014 8 Usub
B 2014 9 Sub
B 2014 10 Sub
B 2014 11 Sub
B 2014 12 Sub
C 2014 1 .
. . . .
. . . .
Each ID has at least one status value. If an ID's first status record is "Usub", then all prior months' status is "Sub". (Like ID B on 2014/05) On the contrary, if the first status record start with "Sub", all prior months' status is "Usub"
Upvotes: 3
Views: 1075
Reputation: 887991
Another option would be to convert the blank ""
to NA
and make use of na.locf
from zoo
package to replace the NA with the non-NA previous element. As this is a group by operation, we can also do this with ave
from base R
.
library(zoo)
df$Status <- with(df, ave(replace(Status, !nzchar(Status), NA), ID,
FUN = function(x){ x1 <- na.locf(x, na.rm=FALSE)
replace(x1, is.na(x1), setdiff(unique(na.omit(x1)), x1[!is.na(x1)][1]))}))
df$Status
#[1] "Usub" "Usub" "Usub" "Sub" "Sub" "Sub" "Sub" "Usub" "Usub" "Usub" "Usub" "Usub" "Sub" "Sub" "Sub" "Sub" "Usub" "Usub" "Usub"
#[20] "Usub" "Sub" "Sub" "Sub" "Sub"
Upvotes: 2
Reputation: 642
uniquevector<-unique(dat$ID)
for(i in uniquevector){
zzz <- which(dat$ID==i & dat$Status == "Sub")
zzz2 <- which(dat$ID==i & dat$Status == "Usub")
zzz3 <- which(dat$ID==i & dat$Month == 12)
zzz4 <- which(dat$ID==i & dat$Month == 1)
if(zzz2 > zzz){
index<-zzz:(zzz2-1)
dat$Status[index] <- "Sub"
}
if(zzz2 < zzz){
index<-zzz2:(zzz-1)
dat$Status[index] <- "Usub"
}
if(zzz3 > zzz2 & zzz < zzz2){
index<-zzz2:zzz3
dat$Status[index] <- "Usub"
}
if(zzz2 < zzz & zzz3 > zzz){
index<-zzz:zzz3
dat$Status[index] <- "Sub"
if((zzz4 < zzz) & zzz < zzz2){
index<-zzz4:(zzz-1)
dat$Status[index] <- "Usub"
}
if((zzz4 < zzz2) & zzz2 < zzz){
index<-zzz4:(zzz2-1)
dat$Status[index] <- "Sub"
}
}}
Upvotes: 0
Reputation: 215137
You can generate an alternating sequence which is equivalent to your desired Status column by subtracting Status == "Usub"
and Status = "Sub"
in which way all positions that should be filled with Sub
will have a lower value than those that should be filled with Usub
and then use the fact that factor can be labelled in an ordered way to transform the integer sequence to a factor:
library(dplyr)
df %>% group_by(ID) %>% mutate(Status = factor(cumsum((Status == "Usub") - (Status == "Sub")),
labels = c("Sub", "Usub")))
# ID Year Month Status
# 1 A 2014 1 Usub
# 2 A 2014 2 Usub
# 3 A 2014 3 Usub
# 4 A 2014 4 Sub
# 5 A 2014 5 Sub
# 6 A 2014 6 Sub
# 7 A 2014 7 Sub
# 8 A 2014 8 Usub
# 9 A 2014 9 Usub
# 10 A 2014 10 Usub
# 11 A 2014 11 Usub
# 12 A 2014 12 Usub
# 13 B 2014 1 Sub
# 14 B 2014 2 Sub
# 15 B 2014 3 Sub
# 16 B 2014 4 Sub
# 17 B 2014 5 Usub
# 18 B 2014 6 Usub
# 19 B 2014 7 Usub
# 20 B 2014 8 Usub
# 21 B 2014 9 Sub
# 22 B 2014 10 Sub
# 23 B 2014 11 Sub
# 24 B 2014 12 Sub
A corresponding data.table
way would be:
library(data.table)
setDT(df)[, Status := as.character(factor(cumsum((Status == "Usub") - (Status == "Sub")), labels = c("Sub", "Usub"))), .(ID)]
where you have to convert the new factor back to character class since it doesn't allow the type change when creating a new column in place.
The data assumes you have empty string instead of NA
:
structure(list(ID = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B"), Year = c("2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014"), Month = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12", "1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12"), Status = c("", "", "", "Sub",
"", "", "", "Usub", "", "", "", "", "", "", "", "", "Usub", "",
"", "", "Sub", "", "", "")), .Names = c("ID", "Year", "Month",
"Status"), row.names = c(NA, 24L), class = "data.frame")
Upvotes: 3