Reputation: 111
I am trying to calculate the average number of goals home and away team has scored, but "todays" game.
The data can be found here : http://www.football-data.co.uk/mmz4281/1415/E0.csv
My code
pl <- pl[,2:6]
pl$Date <- as.Date(pl$Date, "%d/%m/%y")
pl$HomeTeam <- as.character(pl$HomeTeam)
pl$AwayTeam <- as.character(pl$AwayTeam)
pl.func <- function(tf){
tf$avg.ht <- rep(NA,nrow(tf))
tf$avg.at <- rep(NA,nrow(tf))
for(i in 1:nrow(tf)){
tf$avg.ht[i] <- (sum(tf$FTHG[tf$HomeTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i]]) + sum(tf$FTAG[tf$AwayTeam == tf$HomeTeam[i] & tf$Date <tf$Date[i]])) / sum(tf$HomeTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i] | tf$AwayTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i])
tf$avg.at[i] <- (sum(tf$FTHG[tf$HomeTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i]]) + sum(tf$FTAG[tf$AwayTeam == tf$AwayTeam[i] & tf$Date <tf$Date[i]])) / sum(tf$HomeTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i] | tf$AwayTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i])
}
return(tf)
}
pl <- pl.func(pl)
I need to "match" on team, and a earlier date. The above code works, but is slow as I want to calculate several hundreds of calculation. Can anyone hint or show how I can do this with some kind of apply function? I could not succed as I dont know to to replace the [i] argument from the loop on a correct way.
Upvotes: 1
Views: 415
Reputation: 57210
Here are some possible improvements (and a final benchmark) :
1) this is a modified version of your function with just some improvements in the loop :
pl.func2 <- function(DF){
DF$avg.ht <- rep(NA,nrow(DF))
DF$avg.at <- rep(NA,nrow(DF))
for(i in 1:nrow(DF)){
currDate <- DF$Date[i]
currHT <- DF$HomeTeam[i]
currAT <- DF$AwayTeam[i]
prevHT.eq.HT <- which(DF$HomeTeam == currHT & DF$Date < currDate)
prevHT.eq.AT <- which(DF$HomeTeam == currAT & DF$Date < currDate)
prevAT.eq.HT <- which(DF$AwayTeam == currHT & DF$Date < currDate)
prevAT.eq.AT <- which(DF$AwayTeam == currAT & DF$Date < currDate)
DF$avg.ht[i] <- (sum(DF$FTHG[prevHT.eq.HT]) + sum(tf$FTAG[prevAT.eq.HT])) / (length(prevHT.eq.HT) + length(prevAT.eq.HT))
DF$avg.at[i] <- (sum(DF$FTHG[prevHT.eq.AT]) + sum(tf$FTAG[prevAT.eq.AT])) / (length(prevHT.eq.AT) + length(prevAT.eq.AT))
}
return(DF)
}
2) this is another modified version of your function which uses cumulated infos to avoid subsetting and sum all the previous days (N.B. this requires the data.frame to be ordered by Date):
pl.func3 <- function(DF){
DF$avg.ht <- rep(NA,nrow(DF))
DF$avg.at <- rep(NA,nrow(DF))
teams <- unique(c(DF$HomeTeam,DF$AwayTeam))
cumul.info <- t(sapply(teams,FUN=function(team) c(cumulFTG=0,cumulMatches=0)))
# store column indexes to reuse them
cumulFTG <- 1
cumulMatches <- 2
for(i in 1:nrow(DF)){
currHT <- DF$HomeTeam[i]
currAT <- DF$AwayTeam[i]
DF$avg.ht[i] <- cumul.info[currHT,cumulFTG] / cumul.info[currHT,cumulMatches]
DF$avg.at[i] <- cumul.info[currAT,cumulFTG] / cumul.info[currAT,cumulMatches]
cumul.info[currHT,cumulFTG] = cumul.info[currHT,cumulFTG] + DF$FTHG[i]
cumul.info[currHT,cumulMatches] = cumul.info[currHT,cumulMatches] + 1
cumul.info[currAT,cumulFTG] = cumul.info[currAT,cumulFTG] + DF$FTAG[i]
cumul.info[currAT,cumulMatches] = cumul.info[currAT,cumulMatches] + 1
}
return(DF)
}
Check and benchmark :
# this is necessary for pl.func3
pl <- pl[order(pl$Date),]
# are the results identical ? -> TRUE
identical(pl.func(pl),pl.func2(pl)) && identical(pl.func(pl),pl.func3(pl))
# benchmark
library(microbenchmark)
microbenchmark(pl.func(pl),pl.func2(pl),pl.func3(pl))
Unit: milliseconds
expr min lq mean median uq max neval cld
pl.func(pl) 184.36644 186.10643 188.38130 187.16322 188.80065 255.2101 100 c
pl.func2(pl) 84.95047 85.80966 89.27945 87.41589 88.33845 159.6284 100 b
pl.func3(pl) 30.72683 31.05515 32.02944 31.41211 33.22858 35.8644 100 a
Upvotes: 1
Reputation: 107567
What you actually need are running conditional averages. Recently, I answered a similiar question where the OP needed running averages every 15 minutes by group where you need running averages for every past game played by team.
So consider the following sapply()
approach which using sample data and running your code alongside, returns equivalent output. Possibly performance may be more enhanced for your needs:
pl$runavgHT <- sapply(1:nrow(pl),
function(i) {
(sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("HomeTeam")] == pl$HomeTeam[i]))
* pl[1:i,]$FTHG) +
sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("AwayTeam")] == pl$HomeTeam[i]))
* pl[1:i,]$FTAG)) /
sum(((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("HomeTeam")] == pl$HomeTeam[i]))
|((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("AwayTeam")] == pl$HomeTeam[i])))
}
)
pl$runavgAT <- sapply(1:nrow(pl),
function(i) {
(sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("HomeTeam")] == pl$AwayTeam[i]))
* pl[1:i,]$FTHG) +
sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("AwayTeam")] == pl$AwayTeam[i]))
* pl[1:i,]$FTAG)) /
sum(((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("HomeTeam")] == pl$AwayTeam[i]))
|((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("AwayTeam")] == pl$AwayTeam[i])))
}
)
Upvotes: 1