Reputation: 41
I have a data.table which age column contain missing values and rdate is Date format. I want to replace missing age by finding the next non-missing age and rdate of each horsenum, then calculate the missing age by next non-missing age - ceiling year difference of non-missing rdate and this record' rdate. I assume next non-missing rdate is birthday so I use ceiling year difference. Also, I want to keep rdate.fill as Date format. How to write this in data.table code?
My idea of age.fill is calculate by this way, but I have error
library(lubridate)
data[, rdate.fill := ifelse(is.na(age), as.Date(rdate[na.lacf(age)]), NA), by=horsenum]
data[, age.fill := ifelse(is.na(age), ind4- ceiling(time_length(difftime(rdate.fill, rdate, "years"), age), by=horsenum]
input
index rdate horsenum age ind4
1: 14704 2009-03-01 K123 NA 10
2: 14767 2009-03-01 K212 NA 9
3: 39281 2011-10-09 K123 NA 10
4: 39561 2011-10-19 K212 NA 9
5: 74560 2015-04-07 K212 NA 9
6: 77972 2015-09-06 K123 10 NA
7: 79111 2015-10-10 K212 9 NA
8: 84233 2016-03-28 K212 10 NA
structure(list(index = c(14704L, 14767L, 39281L, 39561L, 74560L,
77972L, 79111L, 84233L), rdate = structure(c(14304, 14304, 15256,
15266, 16532, 16684, 16718, 16888), class = "Date"), horsenum = c("K123",
"K212", "K123", "K212", "K212", "K123", "K212", "K212"), age = c(NA,
NA, NA, NA, NA, 10, 9, 10), ind4 = c(10, 9, 10, 9, 9, NA, NA,
NA)), row.names = c(NA, -8L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x000002c5512f1ef0>)
output
index rdate horsenum age ind4 rdate.fill age.fill
1: 14704 2009-03-01 K123 NA 10 2015-09-06 3
2: 14767 2009-03-01 K212 NA 9 2015-10-10 2
3: 39281 2011-10-09 K123 NA 10 2015-09-06 6
4: 39561 2011-10-19 K212 NA 9 2015-10-10 5
5: 74560 2015-04-07 K212 NA 9 2015-10-10 8
6: 77972 2015-09-06 K123 10 NA 10
7: 79111 2015-10-10 K212 9 NA 9
8: 84233 2016-03-28 K212 10 NA 10
Upvotes: 1
Views: 256
Reputation: 27732
The approach below is slightly different:
It calculates, based on the given ages, the possible 'range' of the birthday from the horse. It then uses this window to calculate the minimum and maximum age a horse can haveon the given rdate
.
So, the more infor you have in te horse's age, the smaller the window of possible birthdays, and the bigger the chance that the minimum estimates age equals the maximum estimated age (of they are the same, you know the age of the horse for sure )..
Here we go:
library( data.table )
library( lubridate ) #for the %m+% and %m-% operators
library( intervals ) #to calculate with intervals and find overlaps
library( eeptools ) #for age_calc function; calculating the age, given a date and a birthday (respects leap yaers, etc..)
#read sample data
DT <- fread("
index rdate horsenum age ind4
14704 2009-03-01 K123 NA 10
14767 2009-03-01 K212 NA 9
39281 2011-10-09 K123 NA 10
39561 2011-10-19 K212 NA 9
74560 2015-04-07 K212 NA 9
77972 2015-09-06 K123 10 NA
79111 2015-10-10 K212 9 NA
84233 2016-03-28 K212 10 NA")
#set dates as IDate
DT[, rdate := as.POSIXct(rdate) ]
#set keys
setkey( DT, horsenum, rdate, age )
#calculate bandwidth date of birth (dob) based on age and date
DT[!is.na( age ), dob_min := as.integer( rdate %m-% lubridate::years(age + 1) %m+% lubridate::days(1) ) ]
DT[!is.na( age ), dob_max := as.integer( rdate %m-% lubridate::years(age) ) ]
#function to get get overlap of birthday-intervals
myfun <- function( y ) {
all_intervals <- intervals::Intervals( as.matrix( y ), check_valid = TRUE )
int_min <- all_intervals[1]
for (i in 1:nrow(all_intervals) ) int_min <- interval_intersection( all_intervals[1], all_intervals[i] )
as.data.table( int_min )
}
#get range of possible date of birth for each horsenum
dob_range <- DT[ !is.na(age), myfun( .SD ), by = .(horsenum), .SDcols = c("dob_min", "dob_max") ]
dob_range <- dob_range[, .(horsenum, dob_from = as.POSIXct(V1, origin = "1970-01-01"),
dob_to = as.POSIXct(V2, origin = "1970-01-01"))]
#use found ranges of birthday to estimate ages
#first join dob-ranges by horsenum
DT[ dob_range, `:=`( dob_from = i.dob_from, dob_to = i.dob_to), on = .(horsenum)]
#now calculate ages (minimum and maximum)
DT[, age_min := floor( eeptools::age_calc( as.Date(dob_to), as.Date(rdate), units= "years" ) )]
DT[, age_max := floor( eeptools::age_calc( as.Date(dob_from), as.Date(rdate), units= "years" ) )]
#remove helper columns
DT[, `:=`( dob_min = NULL, dob_max = NULL, dob_from = NULL, dob_to = NULL)]
# index rdate horsenum age ind4 age_min age_max
# 1: 14704 2009-03-01 K123 NA 10 3 4
# 2: 39281 2011-10-09 K123 NA 10 6 7
# 3: 77972 2015-09-06 K123 10 NA 10 10
# 4: 14767 2009-03-01 K212 NA 9 2 3
# 5: 39561 2011-10-19 K212 NA 9 5 6
# 6: 74560 2015-04-07 K212 NA 9 9 9
# 7: 79111 2015-10-10 K212 9 NA 9 9
# 8: 84233 2016-03-28 K212 10 NA 10 10
Upvotes: 1
Reputation: 4340
Your algorithm systematically underestimates age. For example, horse K212's estimated age on 2015-04-07 (row 5) is 8. However, we know K212's age on 2016-03-28 is 10 (row 8), so K212 must be 9 on 2015-04-07, not 8. Here I address this problem by calculating an estimated birthdate from each non-NA rdate, then calculating the earliest estimated birthdate for each horse.
library(data.table)
data=data.table(index=c(14704L,14767L,39281L,39561L,74560L,77972L,79111L,84233L),rdate=structure(c(14304,14304,15256,15266,16532,16684,16718,16888),class="Date"),horsenum=c("K123","K212","K123","K212","K212","K123","K212","K212"),age=c(NA,NA,NA,NA,NA,10,9,10))
lt = data[!is.na(age),as.POSIXlt(rdate)]
lt$year = lt$year - data[!is.na(age),age]
data[!is.na(age),bday:=as.Date(lt)]
data[,bday:=min(bday,na.rm=T),horsenum]
data[,age.fill:=floor(as.numeric(rdate-bday)/365)]
data[order(index)]
Output:
index rdate horsenum age bday age.fill
1: 14704 2009-03-01 K123 NA 2005-09-06 3
2: 14767 2009-03-01 K212 NA 2006-03-28 2
3: 39281 2011-10-09 K123 NA 2005-09-06 6
4: 39561 2011-10-19 K212 NA 2006-03-28 5
5: 74560 2015-04-07 K212 NA 2006-03-28 9
6: 77972 2015-09-06 K123 10 2005-09-06 10
7: 79111 2015-10-10 K212 9 2006-03-28 9
8: 84233 2016-03-28 K212 10 2006-03-28 10
Note: this algorithm could be improved. Consider K212 is 9 on 2015-10-10 and 10 on 2016-03-28. This means that K212's actual birthday is after 10-10 and before 3-28. Instead of assuming 3-28, we could assume it is halfway between 10-10 and 3-28, or, more specifically, if there is more than one estimated birthdate, calculate both the max and the min possible birthdate for each horse, then find the date that's halfway between max - 1 year and min.
Upvotes: 1
Reputation: 79188
data[,age.fill := nafill(age,'nocb'),by=horsenum][,
rdate.fill:=ifelse(is.na(age),rdate[which.min(age.fill==age)],rdate),by=horsenum][,
age.fill:=unclass(age.fill - round((rdate.fill-rdate)/365))
]
index rdate horsenum age ind4 age.fill rdate.fill
1: 14704 2009-03-01 K123 NA 10 3 2015-09-06
2: 14767 2009-03-01 K212 NA 9 2 2015-10-10
3: 39281 2011-10-09 K123 NA 10 6 2015-09-06
4: 39561 2011-10-19 K212 NA 9 5 2015-10-10
5: 74560 2015-04-07 K212 NA 9 8 2015-10-10
6: 77972 2015-09-06 K123 10 NA 10 2015-09-06
7: 79111 2015-10-10 K212 9 NA 9 2015-10-10
8: 84233 2016-03-28 K212 10 NA 10 2016-03-28
Upvotes: 1
Reputation: 25225
Not clear to me how age.fill
is calculated differently for rows 2 and 4 as compared to rows 1 and 3 respectively. But this should get you closer to your needs:
library(data.table) #data.table_1.12.4
DT[, rdate.fill := nafill(fifelse(is.na(age), as.Date(NA), rdate), "nocb"), horsenum][,
age.fill := fifelse(is.na(age), ind4 - ceiling(lubridate::time_length(difftime(rdate.fill, rdate), "years")), age), horsenum]
output:
index rdate horsenum age ind4 rdate.fill age.fill
1: 14704 2009-03-01 K123 NA 10 2015-09-06 3
2: 14767 2009-03-01 K212 NA 9 2015-10-10 2
3: 39281 2011-10-09 K123 NA 10 2015-09-06 6
4: 39561 2011-10-19 K212 NA 9 2015-10-10 5
5: 74560 2015-04-07 K212 NA 9 2015-10-10 8
6: 77972 2015-09-06 K123 10 NA 2015-09-06 10
7: 79111 2015-10-10 K212 9 NA 2015-10-10 9
8: 84233 2016-03-28 K212 10 NA 2016-03-28 10
Upvotes: 3