Reputation: 93
I have two dataframes
. The first (games
) shows, for each of several games, the year and which player accomplished certain unspecified goals (player1
, player2
, player3
). The second (rankings
) show the ranking of each player in a given year.
My goal is to add a column to the games
dataframe, indicating the average ranking of all players who accomplished those goals in each game.
A reproducible example:
set.seed(0)
players <- c("Abe", "Bob", "Chris", "John", "Jane", "Linda", "Mason", "Zoe", "NA")
years <- c(2000:2005)
season <- sample(years, 20, replace = TRUE)
player1 <- sample(players, 20, replace = TRUE)
player2 <- sample(players, 20, replace = TRUE)
player3 <- sample(players, 20, replace = TRUE)
games <- data.frame(season, player1, player2, player3, stringsAsFactors = FALSE)
rankings <- data.frame(replicate(6,sample(1:5,8,rep=TRUE)))
colnames(rankings) <- years
ranked_players <- players[-9]
rankings <- cbind(ranked_players, rankings)
The games
is the first dataframe
, showing the year of the game (season
), who was player1
, who was player2
and who was player3
. There isn't always a player for all categories for all games.
The rankings
is the second dataframe
, showing the ranking from 1 to 5 of each player in a given year.
I want to calculate what is the ranking of the player who played as player1
, player2
, and player3
respectively for each game in games, and average those rankings.
To calculate the ranking, I tried this function:
calc_ranking <- function(x, y) {
z <- select(filter(rankings, ranked_players==x), c(y))
z <- as.integer(z[1,1])
z
}
It apparently works. Now I have to apply it for each player who played a game in games and for every year.
I tried this loop:
new_col <- mapply(calc_ranking, games$player1, games$season)
but it doesn't work. It gives me an error
Error in inds_combine(.vars, ind_list) : Position must be between 0 and n
However, even if it worked, with this solution I should repeat the loop 3 times to create 3 columns, one for each role as player1
, player2
, and player3
, and then create the column I really want (the average of the 3 columns). I suspect there is a more efficient way to do it without repeating the loop (assuming I can fix it)? It would be very useful, because in my real dataset I have 13 "roles" for which I have to calculate the ranking.
Hope this second question is better than my first. Apologize for any mistake, I'm only 1 week into learning R (which is my first experience with coding in general).
Thanks a lot!
Upvotes: 3
Views: 111
Reputation: 11255
My understanding is that each row in games
is a separate game id. So for:
season player1 player2 player3
2001 Joe Bill Jane
player season ranking
Joe 2001 1
Bill 2001 3
Jane 2001 5
The intended answer is 3 for that one game. To address this, melting the data and then merging the two data.frames
on the season and player name would be the simplest route. Edit: added output similar to @heds with the dcast()
lines.
library(data.table)
setDT(games)
games[, game_id := seq_len(.N), keyby = season]
molten_games <- melt(games, id.vars = c('season', 'game_id'), variable.name = 'player_number', value.name = 'player')
setDT(rankings)
molten_rankings <- melt(rankings, id.vars = 'ranked_players', variable.name = 'season', value.name = 'ranking', variable.factor = F)[, season:= as.integer(season)]
merged_dt <- molten_rankings[molten_games
, on = .(season
, ranked_players = player )
, nomatch = 0L
]
merged_dt[, mean(ranking, na.rm = T), by = .(season, game_id)]
season game_id V1
1: 2000 1 2.666667
2: 2001 2 2.000000
3: 2001 3 1.333333
4: 2001 4 4.000000
5: 2002 1 3.333333
...
#or if you want all the players and rankings
dcast(merged_dt, season + game_id ~ player_number, value.var = c('ranked_players', 'ranking')
)[, means := rowMeans(.SD), .SDcols = c('ranking_player1', 'ranking_player2', 'ranking_player3')][]
season game_id ranked_players_player1 ranked_players_player2 ranked_players_player3 ranking_player1 ranking_player2 ranking_player3 means
1: 2000 1 John Mason Zoe 2 2 4 2.666667
2: 2001 1 <NA> Zoe <NA> NA 4 NA NA
3: 2001 2 Chris Jane Linda 1 1 4 2.000000
4: 2001 3 Jane Jane John 1 1 2 1.333333
5: 2001 4 Linda Zoe Zoe 4 4 4 4.000000
...
Since it appears you use dplyr
, this is a similar approach although I couldn't figure out the way to make it wide at the very end:
library(dplyr)
library(tidyr)
long_rankings <- rankings%>%
gather(key = 'season', value = 'ranking', - ranked_players)%>%
mutate(season = as.integer(season))
long_games <- games%>%
arrange(season)%>%
group_by(season)%>%
mutate(game_id = row_number())%>%
ungroup()%>%
gather(key = 'player_number', value = 'player', -season, - game_id)
inner_join(long_rankings
,long_games
, by = c('season' = 'season'
, 'ranked_players' = 'player'))%>%
group_by(season, game_id)%>%
summarize(game_rank_ave = mean(ranking, na.rm = T))
season game_id game_rank_ave
<int> <int> <dbl>
1 2000 1 2.67
2 2001 1 4
3 2001 2 2
4 2001 3 1.33
5 2001 4 4
6 2002 1 3.33
Regarding generating your data, please be careful about cbind()
!! It coerces objects into a matrix and matrices can only have one class such as character or numeric. I addressed the data.frame generation to address the issue.
Data used:
set.seed(0)
players <- c("Abe", "Bob", "Chris", "John", "Jane", "Linda", "Mason", "Zoe", "NA")
years <- c(2000:2005)
season <- sample(years, 20, replace = TRUE)
player1 <- sample(players, 20, replace = TRUE)
player2 <- sample(players, 20, replace = TRUE)
player3 <- sample(players, 20, replace = TRUE)
games <- data.frame(season, player1, player2, player3, stringsAsFactors = FALSE)
rankings <- data.frame(replicate(6,sample(1:5,8,rep=TRUE)))
colnames(rankings) <- years
ranked_players <- players[-9]
#rankings <- cbind(ranked_players, rankings) ##don't cbind unless you're making a matrix!
rankings$ranked_players <- players[-9]
Upvotes: 1
Reputation: 3438
I interpreted your desired output a bit differently than @Cole, being that you wanted the column of means of the player ranks per game. My strategy was to extract the players into their own dataframes (while having to change the column names to not be numeric). Then, for each of player1
, player2
and player3
, look for the player in that "position", and look up their ranking from their individual dataframe. I'm sure there are better ways of doing this but it worked in this case (although I'm not sure it's exactly what you're after.
Output:
> head(games,10)
season player1 player2 player3 player1_rank player2_rank player3_rank means
1 year2005 Mason John John 3 3 3 3.000000
2 year2001 <NA> Zoe <NA> NA 4 NA NA
3 year2002 Bob Linda Chris 3 4 3 3.333333
4 year2003 Linda Zoe Jane 5 5 3 4.333333
5 year2005 Bob Jane Chris 5 1 3 3.000000
6 year2001 Chris Jane Linda 1 1 4 2.000000
7 year2005 John Zoe Chris 3 3 3 3.000000
8 year2005 Abe Abe Jane 4 4 1 3.000000
9 year2003 John Jane Mason 1 3 3 2.333333
10 year2003 Zoe Mason Abe 5 3 5 4.333333
Code:
for (player in players){
temp_player <- filter(rankings, ranked_players == player)
colnames(temp_player) <- c("player","year2000","year2001","year2002","year2003","year2004","year2005")
assign(paste(player), temp_player)
}
games$season <- paste0("year",games$season)
games[games=="NA"] <- NA
i <- 1
for (rows in games$player1){
if (!is.na(games$player1[i])) {
season <- games$season[i]
games$player1_rank[i] <- get(games$player1[i])[,season]
}
else
{
games$player1_rank[i] <- NA
}
i <- i + 1
}
i <- 1
for (rows in games$player2){
if (!is.na(games$player2[i])) {
season <- games$season[i]
games$player2_rank[i] <- get(games$player2[i])[,season]
}
else
{
games$player2_rank[i] <- NA
}
i <- i + 1
}
i <- 1
for (rows in games$player3){
if (!is.na(games$player3[i])) {
season <- games$season[i]
games$player3_rank[i] <- get(games$player3[i])[,season]
}
else
{
games$player3_rank[i] <- NA
}
i <- i + 1
}
games$means <- rowMeans(games[,5:7]
Data:
set.seed(0)
players <- c("Abe", "Bob", "Chris", "John", "Jane", "Linda", "Mason", "Zoe", "NA")
years <- c(2000:2005)
season <- sample(years, 20, replace = TRUE)
player1 <- sample(players, 20, replace = TRUE)
player2 <- sample(players, 20, replace = TRUE)
player3 <- sample(players, 20, replace = TRUE)
games <- data.frame(season, player1, player2, player3, stringsAsFactors = FALSE)
rankings <- data.frame(replicate(6,sample(1:5,8,rep=TRUE)))
colnames(rankings) <- years
ranked_players <- players[-9]
rankings <- cbind(ranked_players, rankings)
Upvotes: 1