Reputation: 10431
Thanks in advance for the help with this. I'm not sure if I'm using apply
wrong, or simply breaking other rules that are slowing down my code. Any help is appreciated.
Overview: I have basketball data where each row is a moment in a basketball game and includes the 10 players on the court, their teams, the game, as well as how many minutes into the game (1 - 40) that row is at. Using this data, I am computing, for each player, the percentage of their team's games that they were on the court for each of the 1 - 40 minutes.
For example, if Joe's team played 20 games, and if in 13 of those games Joe was spotted in the data in the 5th minute of the game, then we would say that joe was spotted on court in the 5th minute of 65% of his team's games. I'm computing this for each player, for each season, for each of the 1-40 minutes, in my not-so-small data, and am running into performance issues. Here is the function I currently have for doing this:
library(dplyr)
# Raw Data Is Play-By-Play Data - Each Row contains stats for a pl (combination of 5 basketball players)
sheets_url <- 'https://docs.google.com/spreadsheets/d/1xmzaF6tpzVpjOmgfwHwFM_JE8LUszofjj25A5P0P21o/export?format=csv&id=1xmzaF6tpzVpjOmgfwHwFM_JE8LUszofjj25A5P0P21o&gid=630752085'
on.ct.data <- httr::content(httr::GET(url = sheets_url))
computeOnCourtByMinutePcts <- function(on.ct.data) {
# Create Dataframe With Number Of Games Played By Team Each Season
num.home.team.games <- on.ct.data %>%
dplyr::group_by(homeTeamId, season) %>%
dplyr::summarise(count = length(unique(gameId)))
num.away.team.games <- on.ct.data %>%
dplyr::group_by(awayTeamId, season) %>%
dplyr::summarise(count = length(unique(gameId)))
num.team.games <- num.home.team.games %>%
dplyr::full_join(num.away.team.games, by = c('homeTeamId'='awayTeamId', 'season'='season')) %>%
dplyr::mutate(gamesPlayed = rowSums(cbind(count.x, count.y), na.rm = TRUE)) %>%
dplyr::rename(teamId = homeTeamId) %>%
dplyr::mutate(season = as.character(season)) %>%
dplyr::select(teamId, season, gamesPlayed)
# Create Dataframe With Players By Season - Seems kind of bulky as well
all.player.season.apperances <- rbind(
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId1, season) %>% dplyr::rename(playerId = onCtHomeId1, teamId = homeTeamId),
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId2, season) %>% dplyr::rename(playerId = onCtHomeId2, teamId = homeTeamId),
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId3, season) %>% dplyr::rename(playerId = onCtHomeId3, teamId = homeTeamId),
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId4, season) %>% dplyr::rename(playerId = onCtHomeId4, teamId = homeTeamId),
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId5, season) %>% dplyr::rename(playerId = onCtHomeId5, teamId = homeTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId1, season) %>% dplyr::rename(playerId = onCtAwayId1, teamId = awayTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId2, season) %>% dplyr::rename(playerId = onCtAwayId2, teamId = awayTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId3, season) %>% dplyr::rename(playerId = onCtAwayId3, teamId = awayTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId4, season) %>% dplyr::rename(playerId = onCtAwayId4, teamId = awayTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId5, season) %>% dplyr::rename(playerId = onCtAwayId5, teamId = awayTeamId)) %>%
dplyr::distinct(teamId, playerId, season) %>%
dplyr::filter(!is.na(playerId))
# For Each Player-Season, Compute Number Of Games On Court at each minute in game - this is the bad Apply
playing.time.breakdowns <- apply(X = all.player.season.apperances, MARGIN = 1, FUN = function(thisRow) {
# Set Player / Season Variables
thisPlayerId = thisRow[2]
thisSeason = thisRow[3]
# Filter for each unique minute of each game with this player on court
on.court.df = on.ct.data %>%
dplyr::filter(onCtHomeId1 == thisPlayerId | onCtHomeId2 == thisPlayerId | onCtHomeId3 == thisPlayerId | onCtHomeId4 == thisPlayerId | onCtHomeId5 == thisPlayerId |
onCtAwayId1 == thisPlayerId | onCtAwayId2 == thisPlayerId | onCtAwayId3 == thisPlayerId | onCtAwayId4 == thisPlayerId | onCtAwayId5 == thisPlayerId) %>%
dplyr::filter(season == thisSeason) %>%
dplyr::filter(!duplicated(paste0(gameId, minNumIntoGame)))
# Turn This Into a table of minutes on court by game
thisTable <- table(on.court.df$minNumIntoGame)
this.player.distrubution.df <- data.frame(
playerId = thisRow[2],
teamId = thisRow[1],
season = thisRow[3],
minNumIntoGame = as.integer(names(thisTable)),
numGamesAtMinNum = unname(thisTable) %>% as.vector(),
stringsAsFactors = FALSE
)
# 40 minutes in basketball game, so previous dataframe needs 40 rows
if(length(which(!(1:40 %in% this.player.distrubution.df$minNumIntoGame))) > 0) {
zero.mins.played.df <- data.frame(
playerId = thisRow[2],
teamId = thisRow[1],
season = thisRow[3],
minNumIntoGame = which(!(1:40 %in% this.player.distrubution.df$minNumIntoGame)),
numGamesAtMinNum = 0,
stringsAsFactors = FALSE
)
this.player.distrubution.df <- plyr::rbind.fill(this.player.distrubution.df, zero.mins.played.df) %>% dplyr::arrange(minNumIntoGame)
}
# and return
return(this.player.distrubution.df)
})
# Combine the output into one dataframe
playing.time.breakdowns <- playing.time.breakdowns %>% do.call("rbind", .)
# Join on Team-Games played
playing.time.breakdowns <- playing.time.breakdowns %>%
dplyr::left_join(num.team.games, by = c("teamId"="teamId", "season"="season")) %>%
dplyr::rename(teamGamesPlayed = gamesPlayed)
# Compute pct of games played
playing.time.breakdowns <- playing.time.breakdowns %>%
dplyr::mutate(pctMinNumPlayed = round(numGamesAtMinNum / teamGamesPlayed, 3))
# Handle OT (minNumIntoGame > 40) needs a lower gamesPlayed denominator...
# And Return
return(playing.time.breakdowns);
}
on.ct.by.min <- computeOnCourtByMinutePcts(on.ct.data)
In summary, the code does the following:
onCt
columns) for each minute of each game, (b) convert this into a table that shows number of games the player was on court at each of the 1-40 minutes.Note that it may be easier to follow the apply
function by manually running it for one row of all.player.season.appearances
. Set thisRow to any row in the dataframe, and run the code line by line for a bit of clarity.
To highlight the slow-code issues, I have uploaded a large chunk of play-by-play / on-court data to google sheets, made it public, and included the link to load the data in the code above. Google Sheets has ~1/2 of my current data, however my total data size is expected to increase by a factor of 10x in the near future, and the code currently takes ~8 minutes to run on my computer. This is a script that needs to be run daily and fairly quickly, and I cannot afford for this one function to take 80 minutes.
It feels like my apply()
call is not well done, as if it's no faster than an ordinary for loop. I'm not certain that apply is needed at all, and in fact, I don't think it is. But I have been struggling over the last 24 hours thinking about how to improve this function, with no luck. There must be a better approach here!
Edit: I have a minor bug in the reproducible example, which I am working on currently. Edit2: fixed issue that was creating NAs in the num.team.games
dataframe. I just ran the code and it appears to be working correctly. There are ~600 rows of output where the teamId is NA, which is nothing to worry about.
Edit3: It looks like each iteration of the apply takes 0.06 seconds, and there are 5312 rows in the dataframe, which adds up to the ~8 minute run time. Should I be trying to reduce that 0.06 to <0.01, or ditch this whole approach? This is a main Q that I'm not sure about...
Upvotes: 0
Views: 57
Reputation: 66705
I think this can be approached more simply by converting the data to long form and counting player-minute-team-season combinations. (This takes about 5 seconds to run on this old computer from 2008, and is most of the calculation.)
library(tidyverse)
on.ct.data %>%
gather(spot, name, onCtHomeId1:onCtAwayId5) %>%
mutate(team = if_else(spot %>% str_detect("Away"),
awayTeamId, homeTeamId)) %>%
select(-spot) %>% # For this part, I only care about person and minute of game.
distinct() %>% # Drop dupes and instances where they were repositioned within one minute.
drop_na() %>%
select(-c(gameId:awayTeamId)) %>%
count(minNumIntoGame, name, team, season)
# A tibble: 140,581 x 5
minNumIntoGame name team season n
<dbl> <chr> <chr> <dbl> <int>
1 1 AahmaneSantos387c JAC 1819 1
2 1 AamirSimmseef9 CLEM 1819 13
3 1 AarenEdmead9cd6 NCAT 1718 1
4 1 AarenEdmead9cd6 NCAT 1819 1
5 1 AaronBrennanbee2 IUPU 1718 1
6 1 AaronCalixtea11d OKLA 1819 11
7 1 AaronCarver9cfa ODU 1819 2
8 1 AaronClarke3d67 SHU 1819 1
9 1 AaronFalzon213b NW 1718 1
10 1 AaronHolidayfce6 UCLA 1718 11
Now that we have that, we can check what our game universe looks like for each team. In how many games each season did each team play a given minute?
on.ct.data.team.minutes <- on.ct.data.minute.counts %>%
count(season, team, minNumIntoGame, gameId) %>%
count(season, team, minNumIntoGame)
ggplot(on.ct.data.team.minutes %>% slice(1:1000),
aes(minNumIntoGame, team, fill = n)) +
geom_tile() + facet_wrap(~season) +
labs(title = "# times each team played each minute (excerpt)")
...and we can do the same for each player and compare to their team, to see what share of each minute they played for their team.
# How many games each season did each player play a given minute for each team?
on.ct.data.player.minutes <- on.ct.data.minute.counts %>%
count(season, team, name, minNumIntoGame) %>%
rename(player_n = n) %>%
left_join(on.ct.data.team.minutes) %>%
rename(team_n = n) %>%
mutate(player_time = player_n / team_n)
ggplot(on.ct.data.player.minutes %>% filter(name %>% str_detect("Can")),
aes(minNumIntoGame, player_time, color = name)) +
geom_line() + facet_wrap(~season) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1))
Upvotes: 1