Reputation: 61
I'd like to select only Game2 column from all the cities for all the Names and add a new column to indicate which cities these are from like below:
Name | Number | Game2 | City |
---|---|---|---|
Anna | 2453 | 11 | Toronto |
Anna | 2453 | 10 | Montreal |
Anna | 2453 | 10 | Vancouver |
Louisa | 6532 | 23 | Toronto |
Louisa | 6532 | 8 | Montreal |
Louisa | 6532 | 9 | Vancouver |
... | ... | ... | ... |
This is what the df looks like in csv
And below is how R reads it with the
structure(list(NA. = c("Name", "Anna", "Louisa", "Cheryl", "Amanda",
"Kathleen", "Aisha", "Solange", "Mary", "Jameela"), NA..1 = c("Number",
"2453", "6532", "3467", "9836", "1784", "7776", "9867", "3726",
"2289"), Toronto = c("Game1", "12", "6", "7", "10", "12", "9",
"11", "9", "10"), NA..2 = c("Game2", "11", "23", "10", "9", "7",
"13", "8", "10", "9"), NA..3 = c("Game3", "5", "11", "10", "8",
"9", "10", "10", "8", "9"), Montreal = c("Game1", "7", "13",
"14", "7", "9", "13", "9", "7", "11"), NA..4 = c("Game2", "10",
"8", "10", "9", "10", "11", "7", "8", "10"), NA..5 = c("Game3",
"6", "12", "11", "9", "9", "10", "12", "8", "12"), Vancouver = c("Game1",
"11", "10", "9", "7", "8", "8", "9", "6", "10"), NA..6 = c("Game2",
"10", "9", "9", "9", "8", "10", "10", "9", "10"), NA..7 = c("Game3",
"9", "10", "11", "11", "10", "12", "11", "8", "12")), class = "data.frame", row.names = c(NA,
-10L))
Upvotes: 0
Views: 45
Reputation: 2288
you can "subset" your data frame and assign the names, etc. "by hand". The subsetting notation ``df[rows, cols]` allows you to specify which part of the rows/cols you want to use. Obviously, you have nested column names - which is not ideal.
library(dplyr) # using the tidyverse for data crunching
df_Toronto <- df[2:nrow(df),1:5] # take 2nd row till end and cols 1 through 5
my_names <- c("Name", "Number", "Game1","Game2","Game3")
names(df_Toronto) <- my_names
df_Toronto <- df_Toronto %>%
select(Name, Number, Game2) %>% # pick your desired columns
mutate(City = "Toronto") # add a new city column
## do the same for Montreal and Vancouver
## combine all dataframes
df <- bind_rows(df_Toronto, df_Montreal, df_Vancouver)
If your data set is significantly larger, you can turn this into a function.
Upvotes: 0
Reputation: 161110
First, save the cities and remove them from the data.
cities <- colnames(dat)
colnames(dat) <- unlist(dat[1,])
dat <- dat[-1,]
cities <- gsub("NA\\.*[0-9]?", "", cities)
cities
# [1] "" "" "Toronto" "" "" "Montreal" "" "" "Vancouver" ""
# [11] ""
cities <- ave(cities, cumsum(nzchar(cities)), FUN = function(z) z[1])
cities
# [1] "" "" "Toronto" "Toronto" "Toronto" "Montreal" "Montreal" "Montreal" "Vancouver" "Vancouver"
# [11] "Vancouver"
colnames(dat) <- paste0(cities, ifelse(nzchar(cities), "_", ""), colnames(dat))
At this point, you should know that all of your number-like columns are still strings. If you want to fix this, you can do dat[] <- lapply(dat, type.convert, as.is = TRUE)
.
Now, we need to reshape/pivot to long to be able to safely break-apart city from the game.
datlong <- within(reshape2::melt(dat, c("Name", "Number")), {
City = gsub("_.*", "", variable)
game = gsub(".*_", "", variable)
variable = NULL
})
head(datlong)
# Name Number value game City
# 1 Anna 2453 12 Game1 Toronto
# 2 Louisa 6532 6 Game1 Toronto
# 3 Cheryl 3467 7 Game1 Toronto
# 4 Amanda 9836 10 Game1 Toronto
# 5 Kathleen 1784 12 Game1 Toronto
# 6 Aisha 7776 9 Game1 Toronto
and then reshape again ti break each game into its own column (I'm inferring that's you're preference).
reshape2::dcast(datlong, Name + Number + City ~ game)
# Name Number City Game1 Game2 Game3
# 1 Aisha 7776 Montreal 13 11 10
# 2 Aisha 7776 Toronto 9 13 10
# 3 Aisha 7776 Vancouver 8 10 12
# 4 Amanda 9836 Montreal 7 9 9
# 5 Amanda 9836 Toronto 10 9 8
# 6 Amanda 9836 Vancouver 7 9 11
# 7 Anna 2453 Montreal 7 10 6
# 8 Anna 2453 Toronto 12 11 5
# 9 Anna 2453 Vancouver 11 10 9
# 10 Cheryl 3467 Montreal 14 10 11
# 11 Cheryl 3467 Toronto 7 10 10
# 12 Cheryl 3467 Vancouver 9 9 11
# 13 Jameela 2289 Montreal 11 10 12
# 14 Jameela 2289 Toronto 10 9 9
# 15 Jameela 2289 Vancouver 10 10 12
# 16 Kathleen 1784 Montreal 9 10 9
# 17 Kathleen 1784 Toronto 12 7 9
# 18 Kathleen 1784 Vancouver 8 8 10
# 19 Louisa 6532 Montreal 13 8 12
# 20 Louisa 6532 Toronto 6 23 11
# 21 Louisa 6532 Vancouver 10 9 10
# 22 Mary 3726 Montreal 7 8 8
# 23 Mary 3726 Toronto 9 10 8
# 24 Mary 3726 Vancouver 6 9 8
# 25 Solange 9867 Montreal 9 7 12
# 26 Solange 9867 Toronto 11 8 10
# 27 Solange 9867 Vancouver 9 10 11
Starting after colnames(dat) <- paste0(.)
from above (since that is not as easily dplyr-ized);
head(dat)
# Name Number Toronto_Game1 Toronto_Game2 Toronto_Game3 Montreal_Game1 Montreal_Game2 Montreal_Game3 Vancouver_Game1 Vancouver_Game2 Vancouver_Game3
# 2 Anna 2453 12 11 5 7 10 6 11 10 9
# 3 Louisa 6532 6 23 11 13 8 12 10 9 10
# 4 Cheryl 3467 7 10 10 14 10 11 9 9 11
# 5 Amanda 9836 10 9 8 7 9 9 7 9 11
# 6 Kathleen 1784 12 7 9 9 10 9 8 8 10
# 7 Aisha 7776 9 13 10 13 11 10 8 10 12
Then continuing:
library(dplyr)
library(tidyr) # pivot_*, separate
dat %>%
pivot_longer(-c(Name, Number)) %>%
separate(name, c("City", "game"), sep = "_") %>%
pivot_wider(c(Name, Number, City), names_from = game, values_from = value)
# # A tibble: 27 x 6
# Name Number City Game1 Game2 Game3
# <chr> <chr> <chr> <chr> <chr> <chr>
# 1 Anna 2453 Toronto 12 11 5
# 2 Anna 2453 Montreal 7 10 6
# 3 Anna 2453 Vancouver 11 10 9
# 4 Louisa 6532 Toronto 6 23 11
# 5 Louisa 6532 Montreal 13 8 12
# 6 Louisa 6532 Vancouver 10 9 10
# 7 Cheryl 3467 Toronto 7 10 10
# 8 Cheryl 3467 Montreal 14 10 11
# 9 Cheryl 3467 Vancouver 9 9 11
# 10 Amanda 9836 Toronto 10 9 8
# # ... with 17 more rows
Upvotes: 1