Reputation: 351
I have a data frame of conditions:
df <- data.frame(Gender=c("Male", "Female", "Other"), AgeCat=c("young", "adult", "old"))
df
Gender AgeCat
1 Male young
2 Female adult
3 Other old
And I have a "master" data frame from which I need to extract data based on the conditions contained in the previous data frame:
master <- data.frame(Country=c("US", "US", "ENG", "SPN", "MEX", "ARG"), Gender=c("Male", "Male", "Male", "Female", "Female", "Female"), AgeCat=c("young", "young", "old", "adult", "adult", "adult"), Height=c("134", "125", "169", "189", "176", "191"))
master
Country Gender AgeCat Height
1 US Male young 134
2 US Male young 125
3 ENG Male old 169
4 SPN Female adult 189
5 MEX Female adult 176
6 ARG Female adult 191
Using one of the function from the apply
family, I am trying to create a list of subset of master
based on the row of df
.
In order to obtain a list like myList
below:
Male_young <- subset(master, Gender=="Male" & AgeCat=="young")
Male_young
Country Gender AgeCat Height
1 US Male young 134
2 US Male young 125
Female_adult <- subset(master, Gender=="Female" & AgeCat=="adult")
Female_adult
Country Gender AgeCat Height
4 SPN Female adult 189
5 MEX Female adult 176
6 ARG Female adult 191
Other_adult <- subset(master, Gender=="Other" & AgeCat=="old")
Other_adult
[1] Country Gender AgeCat Height
<0 rows> (or 0-length row.names)
myList <- list(Male_young=Male_young, Female_adult=Female_adult, Other_old=Other_old)
myList
$Male_young
Country Gender AgeCat Height
1 US Male young 134
2 US Male young 125
$Female_adult
Country Gender AgeCat Height
4 SPN Female adult 189
5 MEX Female adult 176
6 ARG Female adult 191
$Other_old
[1] Country Gender AgeCat Height
<0 rows> (or 0-length row.names)
EDIT I realised I wanted to show an example too simple maybe. What I am actually trying to obtain is a data frame (or a list that could be converted as a data frame or matrix) as follows:
Conditions Individuals Mean_Height
1 Male_young 2 129.5
2 Female_adult 3 185.3333
3 Other_old 0 .
EDIT 2 I keep persisting with this method but I am not sure it is very efficient:
df[c("Conditions", "Individuals", "Mean_Height")] <- t(
mapply(
function(X, Y, Z){
sub_df <- subset(X, Gender==Y & AgeCat==Z)
c(paste(Y, Z, sep="_"), nrow(sub_df), colMeans(sub_df$Height))
},
master,
df$Gender,
df$AgeCat
)
)
Upvotes: 1
Views: 67
Reputation: 652
I am responding for your first question and I'll think about your edit then. I would do that in a tidyverse way, using purrr and tidyr
df <- data.frame(Gender=c("Male", "Female", "Other"),
AgeCat=c("young", "adult", "old"),
stringsAsFactors = F)
master <- data.frame(Country=c("US", "US", "ENG", "SPN", "MEX", "ARG"),
Gender=c("Male", "Male", "Male", "Female", "Female", "Female"),
AgeCat=c("young", "young", "old", "adult", "adult", "adult"),
Height=c("134", "125", "169", "189", "176", "191"),
stringsAsFactors = F)
library(tidyverse)
pre <- master %>%
as_tibble(.) %>%
bind_rows(., df) %>%
complete(AgeCat, nesting(Country, Gender), fill = list(Height = NA)) %>%
group_by(Gender, AgeCat) %>%
nest(.)
NEST <- set_names(x = pre$data, nm = sprintf("%s_%s", pre$Gender, pre$AgeCat))
map(names(NEST), function(x){
z <- NEST[[x]]
z %>%
mutate(Gender = str_split(x, "_", simplify = T)[1],
AgeCat = str_split(x, "_", simplify = T)[2]) %>%
filter(!is.na(Country)) %>%
select(Country, Gender, AgeCat, Height)
}) %>%
set_names(., nm = sprintf("%s_%s", pre$Gender, pre$AgeCat))
#> $Female_adult
#> # A tibble: 3 x 4
#> Country Gender AgeCat Height
#> <chr> <chr> <chr> <chr>
#> 1 ARG Female adult 191
#> 2 MEX Female adult 176
#> 3 SPN Female adult 189
#>
#> $Male_adult
#> # A tibble: 2 x 4
#> Country Gender AgeCat Height
#> <chr> <chr> <chr> <chr>
#> 1 ENG Male adult <NA>
#> 2 US Male adult <NA>
#>
#> $Other_adult
#> # A tibble: 0 x 4
#> # … with 4 variables: Country <chr>, Gender <chr>, AgeCat <chr>, Height <chr>
#>
#> $Female_old
#> # A tibble: 3 x 4
#> Country Gender AgeCat Height
#> <chr> <chr> <chr> <chr>
#> 1 ARG Female old <NA>
#> 2 MEX Female old <NA>
#> 3 SPN Female old <NA>
#>
#> $Male_old
#> # A tibble: 2 x 4
#> Country Gender AgeCat Height
#> <chr> <chr> <chr> <chr>
#> 1 ENG Male old 169
#> 2 US Male old <NA>
#>
#> $Other_old
#> # A tibble: 0 x 4
#> # … with 4 variables: Country <chr>, Gender <chr>, AgeCat <chr>, Height <chr>
#>
#> $Female_young
#> # A tibble: 3 x 4
#> Country Gender AgeCat Height
#> <chr> <chr> <chr> <chr>
#> 1 ARG Female young <NA>
#> 2 MEX Female young <NA>
#> 3 SPN Female young <NA>
#>
#> $Male_young
#> # A tibble: 3 x 4
#> Country Gender AgeCat Height
#> <chr> <chr> <chr> <chr>
#> 1 ENG Male young <NA>
#> 2 US Male young 134
#> 3 US Male young 125
#>
#> $Other_young
#> # A tibble: 0 x 4
#> # … with 4 variables: Country <chr>, Gender <chr>, AgeCat <chr>, Height <chr>
Created on 2020-02-20 by the reprex package (v0.3.0)
EDIT: new answer comprising all the question
pre <- master %>%
as_tibble(.) %>%
mutate(Height = as.numeric(Height)) %>%
bind_rows(., df) %>%
complete(AgeCat, nesting(Country, Gender), fill = list(Height = NA)) %>%
group_by(Gender, AgeCat) %>%
nest(.)
NEST <- set_names(x = pre$data, nm = sprintf("%s_%s", pre$Gender, pre$AgeCat))
DF <- map(names(NEST), function(x){
z <- NEST[[x]]
z %>%
mutate(Gender = str_split(x, "_", simplify = T)[1],
AgeCat = str_split(x, "_", simplify = T)[2]) %>%
filter(!is.na(Country)) %>%
select(Country, Gender, AgeCat, Height)
}) %>%
set_names(., nm = sprintf("%s_%s", pre$Gender, pre$AgeCat)) %>%
bind_rows(., .id = "Conditions")
Count <- DF %>%
count(Conditions, name = "Individuals")
DF %>%
group_by(Conditions) %>%
filter(!is.na(Height)) %>%
summarise(Mean_Height = mean(Height, na.rm = T)) %>%
left_join(., Count) %>%
select(Conditions, Individuals, Mean_Height)
#> Joining, by = "Conditions"
#> # A tibble: 3 x 3
#> Conditions Individuals Mean_Height
#> <chr> <int> <dbl>
#> 1 Female_adult 3 185.
#> 2 Male_old 2 169
#> 3 Male_young 3 130.
Created on 2020-02-20 by the reprex package (v0.3.0)
EDIT2: I made slight changes to the resulting df in order to show the number of individuals in this case. As mean of NA doesn't exist it returns NaN
pre <- master %>%
as_tibble(.) %>%
mutate(Height = as.numeric(Height)) %>%
bind_rows(., df) %>%
complete(AgeCat, nesting(Country, Gender), fill = list(Height = NA)) %>%
group_by(Gender, AgeCat) %>%
nest(.)
NEST <- set_names(x = pre$data, nm = sprintf("%s_%s", pre$Gender, pre$AgeCat))
DF <- map(names(NEST), function(x){
z <- NEST[[x]]
z %>%
mutate(Gender = str_split(x, "_", simplify = T)[1],
AgeCat = str_split(x, "_", simplify = T)[2]) %>%
filter(!is.na(Country)) %>%
select(Country, Gender, AgeCat, Height)
}) %>%
set_names(., nm = sprintf("%s_%s", pre$Gender, pre$AgeCat)) %>%
bind_rows(., .id = "Conditions")
DF %>%
group_by(Conditions) %>%
mutate(N = case_when(is.na(Height) ~ 0,
TRUE ~ 1)) %>%
summarise(Individuals = sum(N), Mean_Height = mean(Height, na.rm = T))
#> # A tibble: 6 x 3
#> Conditions Individuals Mean_Height
#> <chr> <dbl> <dbl>
#> 1 Female_adult 3 185.
#> 2 Female_old 0 NaN
#> 3 Female_young 0 NaN
#> 4 Male_adult 0 NaN
#> 5 Male_old 1 169
#> 6 Male_young 2 130.
Created on 2020-02-20 by the reprex package (v0.3.0)
Upvotes: 0
Reputation: 8117
Could it be, that you actually want this?
master <- data.frame(Country=c("US", "US", "ENG", "SPN", "MEX", "ARG"), Gender=c("Male", "Male", "Male", "Female", "Female", "Female"), AgeCat=c("young", "young", "old", "adult", "adult", "adult"), Height=c("134", "125", "169", "189", "176", "191"), stringsAsFactors = FALSE)
df <- data.frame(Gender=c("Male", "Female", "Other"), AgeCat=c("young", "adult", "old"), stringsAsFactors = FALSE)
master %>%
group_by(Gender, AgeCat) %>%
summarise(mean(as.numeric(Height))) %>%
full_join(df)
which results in
Gender AgeCat `mean(as.numeric(Height))`
<chr> <chr> <dbl>
1 Female adult 185.
2 Male old 169
3 Male young 130.
4 Other old NA
Upvotes: 1
Reputation: 8117
library(tidyverse)
df <- df %>%
mutate_if(is.factor, as.character)
master <- master %>%
mutate_if(is.factor, as.character)
joinList <- split(df, seq(nrow(df)))
lapply(joinList, function(df, master) master %>% semi_join(df), master = master)
This results in a list of dataframes:
$`1`
Country Gender AgeCat Height
1 US Male young 134
2 US Male young 125
$`2`
Country Gender AgeCat Height
1 SPN Female adult 189
2 MEX Female adult 176
3 ARG Female adult 191
$`3`
[1] Country Gender AgeCat Height
<0 rows> (or 0-length row.names)
Upvotes: 0