Reputation:
So I have data with three columns. You will notice Cincinnati has 10 rows, while Clemson has 8 rows.
Data
structure(list(player_id = c(473L, 653L, 816L, 885L, 906L, 969L,
998L, 102L, 106L, 107L, 23L, 33L, 44L, 67L, 74L, 80L, 87L, 91L
), rating = c(0.8756, 0.8646, 0.8572, 0.8547, 0.8539, 0.8519,
0.8506, 0.8498, 0.8477, 0.8477, 0.9867, 0.9822, 0.9764, 0.9673,
0.9654, 0.9644, 0.9614, 0.9566), school = c("Cincinnati", "Cincinnati",
"Cincinnati", "Cincinnati", "Cincinnati", "Cincinnati", "Cincinnati",
"Cincinnati", "Cincinnati", "Cincinnati", "Clemson", "Clemson",
"Clemson", "Clemson", "Clemson", "Clemson", "Clemson", "Clemson"
)), class = "data.frame", row.names = c(NA, -18L))
player_id rating school
473 0.8756 Cincinnati
653 0.8646 Cincinnati
816 0.8572 Cincinnati
885 0.8547 Cincinnati
906 0.8539 Cincinnati
969 0.8519 Cincinnati
998 0.8506 Cincinnati
102 0.8498 Cincinnati
106 0.8477 Cincinnati
107 0.8477 Cincinnati
23 0.9867 Clemson
33 0.9822 Clemson
44 0.9764 Clemson
67 0.9673 Clemson
74 0.9654 Clemson
80 0.9644 Clemson
87 0.9614 Clemson
91 0.9566 Clemson
The goal is, I'm trying to get any school who doesn't have 10 rows to 10 rows by creating a "Placeholder" row of sorts that is based off the current average of the rating field. So for Clemson, that would include two "Placeholder" rows with a rating of 0.9701. So the final output would look something like this:
player_id rating school
473 0.8756 Cincinnati
653 0.8646 Cincinnati
816 0.8572 Cincinnati
885 0.8547 Cincinnati
906 0.8539 Cincinnati
969 0.8519 Cincinnati
998 0.8506 Cincinnati
102 0.8498 Cincinnati
106 0.8477 Cincinnati
107 0.8477 Cincinnati
23 0.9867 Clemson
33 0.9822 Clemson
44 0.9764 Clemson
67 0.9673 Clemson
74 0.9654 Clemson
80 0.9644 Clemson
87 0.9614 Clemson
91 0.9566 Clemson
0 0.9701 Clemson
0 0.9701 Clemson
What would be the most effective way of achieving this?
Upvotes: 3
Views: 56
Reputation: 887098
We could expand the dataset with complete
on the basis of a row_number
column after grouping by 'school', then replace
the NA
element with the mean
of the 'rating'
library(dplyr)
library(tidyr)
n <- 10
df1 %>%
group_by(school) %>%
mutate(rn = row_number()) %>%
ungroup %>%
complete(school, rn = unique(rn), fill = list(player_id = 0)) %>%
# // if all groups should be having fixed number of rows
# complete(school, rn = seq_len(n), fill = list(player_id = 0))
group_by(school) %>%
mutate(rating = replace_na(rating, mean(rating, na.rm = TRUE)))
# A tibble: 20 x 4
# Groups: school [2]
# school rn player_id rating
# <chr> <int> <dbl> <dbl>
# 1 Cincinnati 1 473 0.876
# 2 Cincinnati 2 653 0.865
#3 Cincinnati 3 816 0.857
# 4 Cincinnati 4 885 0.855
# 5 Cincinnati 5 906 0.854
# 6 Cincinnati 6 969 0.852
# 7 Cincinnati 7 998 0.851
# 8 Cincinnati 8 102 0.850
# 9 Cincinnati 9 106 0.848
#10 Cincinnati 10 107 0.848
#11 Clemson 1 23 0.987
#12 Clemson 2 33 0.982
#13 Clemson 3 44 0.976
#14 Clemson 4 67 0.967
#15 Clemson 5 74 0.965
#16 Clemson 6 80 0.964
#17 Clemson 7 87 0.961
#18 Clemson 8 91 0.957
#19 Clemson 9 0 0.970
#20 Clemson 10 0 0.970
With the OP's new dataset
n <- 20
out <- df1 %>%
group_by(school) %>%
filter(n() < n) %>%
mutate(rn = row_number()) %>%
ungroup %>%
complete(school, rn = seq_len(n), fill = list(rank= 0)) %>%
group_by(school) %>%
mutate(rating = replace_na(rating, mean(rating, na.rm = TRUE))) %>%
bind_rows(df1 %>%
group_by(school) %>%
filter(n() >=n))
range(table(out$school))
#[1] 20 57
Or using data.table
library(data.table)
library(zoo)
setDT(df1)[, .SD[seq_len(max(c(n, .N)))], school
][, rating := fifelse(is.na(rating), mean(rating, na.rm = TRUE), rating),
school][, lapply(.SD, na.locf0)]
df1 <- structure(list(player_id = c(473L, 653L, 816L, 885L, 906L, 969L,
998L, 102L, 106L, 107L, 23L, 33L, 44L, 67L, 74L, 80L, 87L, 91L
), rating = c(0.8756, 0.8646, 0.8572, 0.8547, 0.8539, 0.8519,
0.8506, 0.8498, 0.8477, 0.8477, 0.9867, 0.9822, 0.9764, 0.9673,
0.9654, 0.9644, 0.9614, 0.9566), school = c("Cincinnati", "Cincinnati",
"Cincinnati", "Cincinnati", "Cincinnati", "Cincinnati", "Cincinnati",
"Cincinnati", "Cincinnati", "Cincinnati", "Clemson", "Clemson",
"Clemson", "Clemson", "Clemson", "Clemson", "Clemson", "Clemson"
)), class = "data.frame", row.names = c(NA, -18L))
Upvotes: 3
Reputation: 76402
Here is a base R way, with the split/apply/combine
strategy, see Hadley's JSS article.
res <- lapply(split(df1, df1$school), function(X){
if(nrow(X) < 10){
tmp <- data.frame(player_id = rep(0, 10 - nrow(X)),
rating = mean(X[['rating']]),
school = X[1, 'school'])
rbind(X, tmp)
} else X
})
res <- do.call(rbind, res)
row.names(res) <- NULL
res
# player_id rating school
#1 473 0.87560 Cincinnati
#2 653 0.86460 Cincinnati
#3 816 0.85720 Cincinnati
#4 885 0.85470 Cincinnati
#5 906 0.85390 Cincinnati
#6 969 0.85190 Cincinnati
#7 998 0.85060 Cincinnati
#8 102 0.84980 Cincinnati
#9 106 0.84770 Cincinnati
#10 107 0.84770 Cincinnati
#11 23 0.98670 Clemson
#12 33 0.98220 Clemson
#13 44 0.97640 Clemson
#14 67 0.96730 Clemson
#15 74 0.96540 Clemson
#16 80 0.96440 Clemson
#17 87 0.96140 Clemson
#18 91 0.95660 Clemson
#19 0 0.97005 Clemson
#20 0 0.97005 Clemson
Upvotes: 1