Reputation: 41
I have a data set contain horse racing data. I write a function to calculate the average recent 6 ranks for each row's horse on its racing date if the avgl6rank value is missing. The function runs very slow so I want to speed up by using data.table. But I have trouble how can I create groups for each row with rdate before each row's date and group by horsenum. Where can I insert date filtering in
jc.data[is.na(avgl6rank), avgl6rank:= round(mean(tail(rank, 6),na.rm = T)), by = horsenum]
The expected output is the same as the input data frame, just fill in missing values in avgl6rank column if we can find recent 6 or less racing records for this horse before that row's date, and calculate average ranks.
Avg6RankF <- function(df){
if (!is.na(df["avgl6rank"])) {
return(df["avgl6rank"])
} else {
tmp <- subset(jc.data, horsenum == df["horsenum"] & rdate < df["rdate"])
if (nrow(tmp) > 0) {
return(round(mean(tail(tmp$rank, 6),na.rm = TRUE)))
} else {
return(NA)
}
}
}
jc.data['avgl6rank'] <- apply(jc.data, 1, Avg6RankF)
structure(list(index = c(64L, 577L, 33704L, 34538L, 35753L, 36119L,
36641L, 38259L, 38484L, 39060L, 40507L, 41326L, 41814L, 41938L,
42537L, 43006L, 43073L, 43354L, 44056L, 44186L, 44615L, 44665L,
45385L, 46060L, 46636L, 47134L, 47526L, 48030L, 48176L, 48799L,
50485L, 51167L, 51660L, 52006L, 52703L, 53352L, 53806L, 54366L,
55055L, 56041L, 56496L, 56991L, 57718L, 58303L, 59036L, 59717L,
60211L, 61142L, 61776L, 62348L, 63042L, 63755L, 64474L, 65063L,
66355L, 66663L, 67179L, 67415L, 68015L, 68345L, 69616L, 71067L,
72162L, 74472L, 75555L, 76018L, 76754L, 77463L, 79022L, 79740L,
81273L, 81885L, 83136L, 83468L, 84202L, 84937L, 85681L, 87446L,
88375L, 89242L), rdate = structure(c(13765L, 13782L, 15025L,
15049L, 15089L, 15101L, 15115L, 15228L, 15235L, 15253L, 15298L,
15322L, 15340L, 15343L, 15360L, 15375L, 15375L, 15385L, 15406L,
15409L, 15424L, 15424L, 15451L, 15472L, 15494L, 15508L, 15522L,
15536L, 15591L, 15614L, 15669L, 15690L, 15706L, 15717L, 15738L,
15760L, 15774L, 15791L, 15815L, 15843L, 15858L, 15872L, 15893L,
15963L, 15990L, 16012L, 16026L, 16054L, 16071L, 16089L, 16109L,
16130L, 16152L, 16169L, 16211L, 16222L, 16236L, 16243L, 16257L,
16334L, 16372L, 16418L, 16453L, 16526L, 16561L, 16575L, 16596L,
16617L, 16715L, 16740L, 16792L, 16813L, 16852L, 16862L, 16883L,
16904L, 16928L, 16988L, 17072L, 17100L), class = c("IDate", "Date"
)), rid = c(5L, 1L, 2L, 4L, 3L, 2L, 5L, 1L, 1L, 1L, 3L, 2L, 3L,
1L, 5L, 1L, 6L, 1L, 1L, 4L, 1L, 5L, 5L, 4L, 2L, 5L, 6L, 6L, 6L,
5L, 6L, 5L, 7L, 5L, 4L, 6L, 6L, 3L, 4L, 6L, 6L, 4L, 4L, 6L, 6L,
5L, 6L, 5L, 7L, 5L, 6L, 3L, 4L, 4L, 5L, 2L, 6L, 5L, 9L, 6L, 6L,
9L, 9L, 3L, 3L, 3L, 4L, 5L, 3L, 6L, 2L, 2L, 4L, 3L, 6L, 9L, 9L,
7L, 3L, 3L), horsenum = c("D350", "D350", "M133", "M133", "M133",
"M133", "M133", "M133", "M133", "M133", "M350", "M133", "M350",
"M133", "M350", "M133", "M350", "M133", "M133", "M350", "M133",
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350",
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350",
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350",
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350",
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350",
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350",
"M350", "M350", "M350", "M350", "M350", "M350", "M350", "M350",
"M350", "M350", "M350"), rank = c(12, 9, 9, 11, 10, 10, 12, 14,
14, 6, 10, 9, 10, 11, 4, 6, 12, 6, 10, 9, 14, 10, 9, 5, 3, 1,
1, 10, 11, 8, 10, 9, 4, 1, 3, 7, 2, 7, 7, 4, 1, 5, 3, 13, 7,
6, 3, 2, 4, 6, 5, 3, 6, 4, 6, 6, 1, 1, 7, 7, 11, 7, 6, 3, 3,
4, 8, 14, 1, 11, 10, 8, 10, 1, 10, 11, 11, 11, 10, 11), avgl6rank = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 6, 6, 7, 8, 9, 9, 7, 8, 8, 8, 9, 9)), row.names = c(NA,
-80L), .internal.selfref = <pointer: 0x000001e914901ef0>, class = "data.frame")
Upvotes: 0
Views: 117
Reputation: 27732
You can use the latset data.table (version 1.12.4, released 2 days ago), which supports rolling means.
see #27 here: https://github.com/Rdatatable/data.table/blob/master/NEWS.md
library(data.table)
setDT(DT)
#use keys to sort DT by date by horsenum
setkey( DT, horsenum, rdate )
#use the new `frollmean()` function on a rolling window with length 6
DT[ is.na( avgl6rank ), avgl6rank_2 := frollmean( rank, 6L ), by = .( horsenum )][]
Upvotes: 5
Reputation: 388982
Using dplyr
and purrr::map2_dbl
you could try :
library(dplyr)
df %>%
arrange(rdate) %>%
group_by(horsenum) %>%
mutate(avgl = purrr::map2_dbl(row_number(), avgl6rank,
~ if(is.na(.y)) mean(rank[max(1,.x - 6):.x], na.rm = TRUE) else .y))
# index rdate rid horsenum jname tname pricemoney rank avgl6rank avgl
# <int> <date> <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
# 1 64 2007-09-09 5 D350 S Dye C Fownes NA 12 NA 12
# 2 577 2007-09-26 1 D350 W M Lai C Fownes NA 9 NA 10.5
# 3 40507 2011-11-20 3 M350 D Beadman J Moore NA 10 NA 10
# 4 41814 2012-01-01 3 M350 D Beadman J Moore NA 10 NA 10
# 5 42537 2012-01-21 5 M350 D Beadman J Moore NA 4 NA 8
# 6 43073 2012-02-05 6 M350 T Clark J Moore NA 12 NA 9
# 7 44186 2012-03-10 4 M350 N Callan J Moore NA 9 NA 9
# 8 44665 2012-03-25 5 M350 T Clark J Moore NA 10 NA 9.17
# 9 45385 2012-04-21 5 M350 J Lloyd J Moore NA 9 NA 9.14
#10 46060 2012-05-12 4 M350 Y T Cheng J Moore NA 5 NA 8.43
# … with 56 more rows
Upvotes: 2