Reputation: 10481
I have a difficult dataframe problem where I am trying to create new columns / column names / column values out of an existing dataframe that isn't formatted like I want it to be. The data has playerIDs and playerTypes for 4 different players at a time, and looks like this:
dput(my.player.data)
structure(list(p_id = c(8470828L, 8478460L, 8470966L, 8475314L,
8476472L, 8476917L, 8475791L, 8470105L, 8476905L, 8474152L, 8470642L,
8479325L, 8475218L, 8471296L, 8476874L, 8477943L, 8477934L, 8473432L
), pType = c("Blocker", "Shooter", "Blocker", "Shooter", "Blocker",
"Hitter", "Blocker", "Shooter", "PlayerID", "PlayerID", "Shooter",
"Hitter", "PlayerID", "Blocker", "Shooter", "Scorer", "Scorer",
"Scorer"), p_id1 = c(8475172L, 8470645L, 8474162L, NA, 8480172L,
8477989L, 8476879L, NA, NA, NA, NA, 8474683L, NA, 8476851L, 8469514L,
8477407L, 8478402L, 8474091L), pType1 = c("Shooter", "Goalie",
"Shooter", NA, "Shooter", "Hittee", "Shooter", NA, NA, NA, NA,
"Hittee", NA, "Shooter", "Goalie", "Assist", "Assist", "Assist"
), p_id2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 8475246L, 8471729L, 8477018L), pType2 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Assist",
"Assist", "Assist"), p_id3 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 8475622L, 8471239L, 8469608L), pType3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Goalie",
"Goalie", "Goalie")), .Names = c("p_id", "pType", "p_id1", "pType1",
"p_id2", "pType2", "p_id3", "pType3"), row.names = c(1L, 5001L,
10001L, 15001L, 20001L, 25001L, 30001L, 35001L, 40001L, 45001L,
50001L, 55001L, 60001L, 65001L, 70001L, 47329L, 46786L, 45551L
), class = "data.frame")
# ignore that the row numbers are 1, 5000, 10000, etc.
head(my.player.data)
p_id pType p_id1 pType1 p_id2 pType2 p_id3 pType3
1 8470828 Blocker 8475172 Shooter NA <NA> NA <NA>
5001 8478460 Shooter 8470645 Goalie NA <NA> NA <NA>
10001 8470966 Blocker 8474162 Shooter NA <NA> NA <NA>
15001 8475314 Shooter NA <NA> NA <NA> NA <NA>
20001 8476472 Blocker 8480172 Shooter NA <NA> NA <NA>
25001 8476917 Hitter 8477989 Hittee NA <NA> NA <NA>
There are only a fixed number of pTypes in my data across the 4 pType columns, (Blocker, Shooter, Goalie, etc.) and I would like to create a column for each one of these, with the value in the column equal to the respective playerID.
For example, I'd like something that looks like this:
head(better.player.data)
Blocker Shooter Hittee Hitter Assist1 Assist2 Scorer Goalie
1 8470828 8475172 NA NA NA NA NA NA
5001 NA 8478460 NA NA NA NA NA 8470645
10001 8470966 8474162 NA NA NA NA NA NA
15001 NA 8475314 NA NA NA NA NA NA
20001 8476472 8480172 NA NA NA NA NA NA
25001 NA NA 8477989 8476917 NA NA NA NA
The main edge-case here is that Assist1 and Assist2 are both labeled as Assist in the my.player.data dataframe (see the last 3 rows, not shown in the head()). I'd like for p_id1 to be Assist1 and p_id2 to be Assist2 (pType1 and pType2 should be the only 2 columns in the original data where the value is Assist (shouldnt be in pType or pType3)
Any help with this, as always, is greatly appreciated! Thanks!
Upvotes: 1
Views: 245
Reputation: 6479
Here's a base R version:
gty <- function(.) unlist(.[grep("pType", names(.))])
# a function to get the vector of "types" from a row of the data frame
# edit: I noticed the "Assist" problem ("Assist1" and "Assist2" in the target table but "Assist" in the source table)
# ... so this needs to be corrected
gty <- function(.) {
res <- unlist(.[grep("pType", names(.))])
res[res %in% "Assist"] <- paste0("Assist", 1:sum(res%in%"Assist"))
res
}
# or in a hyper-functional style
gty <- function(.){ (function(..) {"[<-" (.., ..%in%"Assist", paste0("Assist", 1:sum(..%in%"Assist")))})(unlist(.[grep("pType", names(.))]))}
gid <- function(.) as.numeric(unlist(.[grep("_id", names(.))]))
# a function to get the vector of "ids" from a row of the data frame
# as.numeric - needed if this will be used from within `apply`as I intend to
types <- c("Blocker", "Shooter", "Hittee", "Hitter", "Assist1", "Assist2",
"Scorer", "Goalie")
# a list of types
fun <- function(.) setNames(gid(.)[match(types, gty(.))], types)
# a function which gets the ids and types from a row and rearranges them
t(apply(df, 1, fun))
And the result:
Blocker Shooter Hittee Hitter Assist1 Assist2 Scorer Goalie
1 8470828 8475172 NA NA NA NA NA NA
5001 NA 8478460 NA NA NA NA NA 8470645
10001 8470966 8474162 NA NA NA NA NA NA
15001 NA 8475314 NA NA NA NA NA NA
20001 8476472 8480172 NA NA NA NA NA NA
25001 NA NA 8477989 8476917 NA NA NA NA
30001 8475791 8476879 NA NA NA NA NA NA
35001 NA 8470105 NA NA NA NA NA NA
40001 NA NA NA NA NA NA NA NA
45001 NA NA NA NA NA NA NA NA
50001 NA 8470642 NA NA NA NA NA NA
55001 NA NA 8474683 8479325 NA NA NA NA
60001 NA NA NA NA NA NA NA NA
65001 8471296 8476851 NA NA NA NA NA NA
70001 NA 8476874 NA NA NA NA NA 8469514
47329 NA NA NA NA 8477407 8475246 8477943 8475622
46786 NA NA NA NA 8478402 8471729 8477934 8471239
45551 NA NA NA NA 8474091 8477018 8473432 8469608
Upvotes: 1
Reputation: 79338
library(data.table)
dcast(na.omit(
melt(setDT(df)[,id:=1:nrow(df)],"id",list(grep("p_id",names(df)),grep("pType",names(df))))
),
id~value2,value.var = "value1")
id Assist1 Assist2 Blocker Goalie Hittee Hitter PlayerID Scorer Shooter
1: 1 NA NA 8470828 NA NA NA NA NA 8475172
2: 2 NA NA NA 8470645 NA NA NA NA 8478460
3: 3 NA NA 8470966 NA NA NA NA NA 8474162
4: 4 NA NA NA NA NA NA NA NA 8475314
5: 5 NA NA 8476472 NA NA NA NA NA 8480172
6: 6 NA NA NA NA 8477989 8476917 NA NA NA
7: 7 NA NA 8475791 NA NA NA NA NA 8476879
8: 8 NA NA NA NA NA NA NA NA 8470105
9: 9 NA NA NA NA NA NA 8476905 NA NA
10: 10 NA NA NA NA NA NA 8474152 NA NA
11: 11 NA NA NA NA NA NA NA NA 8470642
12: 12 NA NA NA NA 8474683 8479325 NA NA NA
13: 13 NA NA NA NA NA NA 8475218 NA NA
14: 14 NA NA 8471296 NA NA NA NA NA 8476851
15: 15 NA NA NA 8469514 NA NA NA NA 8476874
16: 16 8477407 8475246 NA 8475622 NA NA NA 8477943 NA
17: 17 8478402 8471729 NA 8471239 NA NA NA 8477934 NA
18: 18 8474091 8477018 NA 8469608 NA NA NA 8473432 NA
if you want a base R approach: you can do:
as.data.frame.matrix(
xtabs(p_id~id+pType,
reshape(transform(df,pType1 =sub("(Assist)","\\11",pType1),pType2 = sub("(Assist)","\\12",pType2), id=1:nrow(df)),matrix(1:(ncol(a)-1),2),dir="long")),
row.names(df))
Assist1 Assist2 Blocker Goalie Hittee Hitter PlayerID Scorer Shooter
1 0 0 8470828 0 0 0 0 0 8475172
5001 0 0 0 8470645 0 0 0 0 8478460
10001 0 0 8470966 0 0 0 0 0 8474162
15001 0 0 0 0 0 0 0 0 8475314
20001 0 0 8476472 0 0 0 0 0 8480172
25001 0 0 0 0 8477989 8476917 0 0 0
30001 0 0 8475791 0 0 0 0 0 8476879
35001 0 0 0 0 0 0 0 0 8470105
40001 0 0 0 0 0 0 8476905 0 0
45001 0 0 0 0 0 0 8474152 0 0
50001 0 0 0 0 0 0 0 0 8470642
55001 0 0 0 0 8474683 8479325 0 0 0
60001 0 0 0 0 0 0 8475218 0 0
65001 0 0 8471296 0 0 0 0 0 8476851
70001 0 0 0 8469514 0 0 0 0 8476874
47329 8477407 8475246 0 8475622 0 0 0 8477943 0
46786 8478402 8471729 0 8471239 0 0 0 8477934 0
45551 8474091 8477018 0 8469608 0 0 0 8473432 0
Upvotes: 2
Reputation: 20095
One solution can be achieved using tidyverse
. The idea is to gather
convert in long format with rowname
having matching pType
and p_id
columns. First create a group based on number associated with p_id
and pType
columns. Perform modification to convert Assist
to Assist1
and Assist2
. Finally, call spread to convert data in desired format.
library(tidyverse)
my.player.data %>% rownames_to_column %>%
mutate(rowname = as.numeric(rowname)) %>%
gather(Key, Value, -rowname) %>%
filter(!is.na(Value)) %>%
mutate(Group = as.integer(gsub("(p_id|pType)","0",Key))) %>%
mutate(Value = ifelse(Value == "Assist", paste0(Value, Group), Value)) %>%
mutate(Key = gsub("\\d","",Key)) %>% #Remove number from p_id and pType columns
spread(Key, Value) %>%
select(-Group) %>%
spread(pType, p_id) %>%
remove_rownames() %>%
column_to_rownames()
# Assist1 Assist2 Blocker Goalie Hittee Hitter PlayerID Scorer Shooter
# 1 <NA> <NA> 8470828 <NA> <NA> <NA> <NA> <NA> 8475172
# 5001 <NA> <NA> <NA> 8470645 <NA> <NA> <NA> <NA> 8478460
# 10001 <NA> <NA> 8470966 <NA> <NA> <NA> <NA> <NA> 8474162
# 15001 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 8475314
# 20001 <NA> <NA> 8476472 <NA> <NA> <NA> <NA> <NA> 8480172
# 25001 <NA> <NA> <NA> <NA> 8477989 8476917 <NA> <NA> <NA>
# 30001 <NA> <NA> 8475791 <NA> <NA> <NA> <NA> <NA> 8476879
# 35001 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 8470105
# 40001 <NA> <NA> <NA> <NA> <NA> <NA> 8476905 <NA> <NA>
# 45001 <NA> <NA> <NA> <NA> <NA> <NA> 8474152 <NA> <NA>
# 45551 8474091 8477018 <NA> 8469608 <NA> <NA> <NA> 8473432 <NA>
# 46786 8478402 8471729 <NA> 8471239 <NA> <NA> <NA> 8477934 <NA>
# 47329 8477407 8475246 <NA> 8475622 <NA> <NA> <NA> 8477943 <NA>
# 50001 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 8470642
# 55001 <NA> <NA> <NA> <NA> 8474683 8479325 <NA> <NA> <NA>
# 60001 <NA> <NA> <NA> <NA> <NA> <NA> 8475218 <NA> <NA>
# 65001 <NA> <NA> 8471296 <NA> <NA> <NA> <NA> <NA> 8476851
# 70001 <NA> <NA> <NA> 8469514 <NA> <NA> <NA> <NA> 8476874
Upvotes: 2
Reputation: 47350
We can usetidyverse::spread
with reduce
, after some preprocessing as the same row can contain several values for Assist
:
library(tidyverse)
df %>%
rownames_to_column %>%
mutate(pType1 = gsub("Assist","Assist1",pType1),
pType2 = gsub("Assist","Assist2",pType2)) %>%
reduce(.init= .,.x=1:4,~spread(.,3,2))
# rowname Blocker Hitter PlayerID Scorer Assist1 Hittee Shooter Assist2 Goalie <NA>
# 1 1 8470828 NA NA NA NA NA 8475172 NA NA NA
# 2 10001 8470966 NA NA NA NA NA 8474162 NA NA NA
# 3 15001 NA NA NA NA NA NA NA NA NA NA
# 4 20001 8476472 NA NA NA NA NA 8480172 NA NA NA
# 5 25001 NA 8476917 NA NA NA 8477989 NA NA NA NA
# 6 30001 8475791 NA NA NA NA NA 8476879 NA NA NA
# 7 35001 NA NA NA NA NA NA NA NA NA NA
# 8 40001 NA NA 8476905 NA NA NA NA NA NA NA
# 9 45001 NA NA 8474152 NA NA NA NA NA NA NA
# 10 45551 NA NA NA 8473432 8474091 NA NA 8477018 8469608 NA
# 11 46786 NA NA NA 8477934 8478402 NA NA 8471729 8471239 NA
# 12 47329 NA NA NA 8477943 8477407 NA NA 8475246 8475622 NA
# 13 50001 NA NA NA NA NA NA NA NA NA NA
# 14 5001 NA NA NA NA NA NA NA NA NA NA
# 15 55001 NA 8479325 NA NA NA 8474683 NA NA NA NA
# 16 60001 NA NA 8475218 NA NA NA NA NA NA NA
# 17 65001 8471296 NA NA NA NA NA 8476851 NA NA NA
# 18 70001 NA NA NA NA NA NA NA NA NA NA
Upvotes: 1
Reputation: 10481
Not the creative answer I was looking for, but this technically works for me:
my.player.data %>%
dplyr::mutate(Shooter = ifelse(pType == "Shooter", p_id, ifelse(pType1 == "Shooter", p_id1, ifelse(pType2 == "Shooter", p_id2, ifelse(pType3 == "Shooter", p_id3, NA))))) %>%
dplyr::mutate(Goalie = ifelse(pType == "Goalie", p_id, ifelse(pType1 == "Goalie", p_id1, ifelse(pType2 == "Goalie", p_id2, ifelse(pType3 == "Goalie", p_id3, NA))))) %>%
dplyr::mutate(Blocker = ifelse(pType == "Blocker", p_id, ifelse(pType1 == "Blocker", p_id1, ifelse(pType2 == "Blocker", p_id2, ifelse(pType3 == "Blocker", p_id3, NA)))))
dplyr::mutate(Assister1 = ifelse(pType1 == "Assist", p_id1, NA)) %>%
dplyr::mutate(Assister1 = ifelse(pType2 == "Assist", p_id2, NA))
Upvotes: 0