Reputation: 1708
I want to know if there is a better way of doing what I am doing.
I have a tibble (sample here):
library(tidyverse)
library(Hmisc) # for the weighted values
df2 <- structure(list(Q31_A_1 = c(9L, 3L, 2L, 2L, 2L, 3L, 5L, 3L, 1L,
3L, 4L, 4L, 1L, 3L, 9L, 2L, 4L, 2L, 3L, 2L, 9L, 2L, 4L, 3L, 3L,
3L, 9L, 2L, 3L, NA), Q31_A_2 = c(9L, 4L, 2L, 2L, 2L, 3L, 4L,
3L, 1L, 3L, 5L, 4L, 1L, 3L, 9L, 2L, 3L, 2L, 3L, 9L, 9L, 2L, 4L,
3L, 3L, 3L, 4L, 2L, 3L, NA), Q31_A_3 = c(9L, 4L, 2L, 2L, 2L,
3L, NA, 3L, 1L, 3L, NA, 4L, 1L, 2L, 9L, 2L, 3L, 2L, 2L, 2L, 9L,
2L, 4L, 3L, 3L, 2L, 3L, 2L, 2L, 2L), Q31_A_4 = c(9L, 3L, 2L,
2L, NA, 3L, 4L, 3L, 3L, 3L, 5L, 4L, 3L, 3L, 4L, NA, 4L, 2L, 3L,
9L, 9L, 2L, 4L, 3L, 4L, 4L, 9L, 2L, 3L, 2L), Q31_B_1 = c(9L,
2L, 2L, 2L, 1L, 2L, 9L, 3L, NA, 3L, 4L, 4L, 2L, 9L, 9L, NA, 9L,
2L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 4L), Q31_B_2 = c(9L,
9L, 9L, 2L, 1L, 2L, 9L, 3L, 1L, 3L, 4L, 9L, 2L, 9L, 9L, 2L, 9L,
2L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 4L), Q31_B_3 = c(9L,
9L, 9L, 2L, 1L, 2L, 9L, 3L, NA, 3L, 4L, 9L, 1L, 9L, 9L, NA, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 3L), ages = c("50-64 years",
"35-49 years", "35-49 years", "50-64 years", "65+ years", "65+ years",
"65+ years", "65+ years", "65+ years", "65+ years", "65+ years",
"35-49 years", "65+ years", "50-64 years", "65+ years", "65+ years",
"50-64 years", "35-49 years", "65+ years", "65+ years", "65+ years",
"65+ years", "65+ years", "50-64 years", "50-64 years", "50-64 years",
"50-64 years", "65+ years", "50-64 years", "35-49 years"), wt = c(0.64708755364565,
0.921064359620811, 1.3907697993331, 0.974726729781105, 0.576703486333466,
0.489053964840285, 0.489053964840285, 0.576703486333466, 0.576703486333466,
0.489053964840285, 0.489053964840285, 0.921064359620811, 0.489053964840285,
0.974726729781105, 0.489053964840285, 0.489053964840285, 0.64708755364565,
0.921064359620811, 0.489053964840285, 0.489053964840285, 0.576703486333466,
0.489053964840285, 0.576703486333466, 0.974726729781105, 0.64708755364565,
0.974726729781105, 0.974726729781105, 0.489053964840285, 0.974726729781105,
0.921064359620811)), row.names = c(NA, -30L), class = c("tbl_df",
"tbl", "data.frame"))
Which is this:
# A tibble: 30 x 9
Q31_A_1 Q31_A_2 Q31_A_3 Q31_A_4 Q31_B_1 Q31_B_2 Q31_B_3 ages wt
<int> <int> <int> <int> <int> <int> <int> <chr> <dbl>
1 9 9 9 9 9 9 9 50-64 years 0.647
2 3 4 4 3 2 9 9 35-49 years 0.921
3 2 2 2 2 2 9 9 35-49 years 1.39
4 2 2 2 2 2 2 2 50-64 years 0.975
5 2 2 2 NA 1 1 1 65+ years 0.577
6 3 3 3 3 2 2 2 65+ years 0.489
7 5 4 NA 4 9 9 9 65+ years 0.489
8 3 3 3 3 3 3 3 65+ years 0.577
9 1 1 1 3 NA 1 NA 65+ years 0.577
10 3 3 3 3 3 3 3 65+ years 0.489
# ... with 20 more rows
And I want to apply a function over the columns Q31_A_1 to Q31_B_3 (in the full dataset, there are a lot more columns and rows). This is data from a survey. I want to join the values to an index value:
index5 <- tibble(
int = 1:5,
factor = c(100, 75, 50, 25, 0))
This is done in the function:
group_scores2 <- function(field) {
field <- enquo(field)
df <- df2 %>% select(!!field, ages, wt) %>%
filter(UQ(field) <=5) %>%
mutate(int = as.integer(!!field))
df
df <- left_join(df,index5, by = "int",
copy=FALSE)
df
ov <- df %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Overall") %>%
mutate(group = "Overall (2018)")
ag <- df %>%
group_by(ages) %>%
summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Age Group") %>%
rename(group = ages)
combined <- bind_rows(ov, ag)
}
Which for example when I run this:
group_scores2(Q31_A_1)
This is the output.
# A tibble: 4 x 6
mean var sd count cat group
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 56.2 514. 22.7 17.5 Overall Overall (2018)
2 58.4 548. 23.4 4.15 Age Group 35-49 years
3 51.3 194. 13.9 6.17 Age Group 50-64 years
4 59.1 894. 29.9 7.20 Age Group 65+ years
I have tried to use purrr
and the apply
family of functions, but can't seem to get it right. For example:
df3 <- df2 %>% gather(ind, value, Q31_A_1:Q31_B_3)
df3 %>% map(group_scores2)
Which returns an error.
I wouldn't know where to start with apply
.
I would like to know if there is a more effective way of doing this.
Upvotes: 0
Views: 4053
Reputation: 530
When you pass a data frame to map with df3 %>% map(group_scores2)
, the map
function tries to call group_scores2
with every column of df3
- I imagine that's not what you want to do with the gather
ed data frame?
Using purr::map
Modifying your function, since it should take a data.frame
, for instance. I wouldn't go this route for the task at hand since the function is relying on global variables (index5
) in a sort of undocumented way (and naming a variable factor
seems like it could cause trouble). But it can work; map_dfr
expects the function arg to return data frames which can be rbind
ed together, as per your intent.
group_scores3 <- function(ds) {
df = ds %>% filter(value <=5) %>%
rename(int = value) %>%
left_join(index5, by = "int",copy=FALSE)
ov <- df %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt),
sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Overall") %>%
mutate(group = "Overall (2018)")
ag <- df %>%
group_by(ages) %>%
summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt),
sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Age Group") %>%
rename(group = ages)
bind_rows(ov, ag)
}
# df3 as before
df3 <- df2 %>% gather(ind, value, Q31_A_1:Q31_B_3)
# summarize each question and concatenate the results:
df3 %>% split(.$ind) %>%
map_dfr(.f = group_scores3,.id = "ind")
just dplyr verbs
More straightforward, perhaps, is good old fashioned split-apply-combine. I'm not sure whether there's a 'tidier' way to summarize by both age/question and just question; if you were doing many such summaries, perhaps the summarise
portion would be the part to factor out into a function and call via a purrr::map_***
.
# like df3, but take care of filter/merge once instead of repeating every time
df4 = df2 %>% gather(ind,value,Q31_A_1:Q31_B_3) %>%
filter(value <= 5) %>%
rename(int = value) %>%
inner_join(index5,by="int")
# scores per age group:
output1 = df4 %>%
group_by(ind,ages) %>%
summarise(mean = wtd.mean(factor, wt),
var = wtd.var(factor, wt),
sd = sqrt(var),
count = sum(wt)) %>%
mutate(category = "Age Group")
# overall scores:
output2 = df4 %>%
group_by(ind) %>%
summarise(mean = wtd.mean(factor, wt),
var = wtd.var(factor, wt),
sd = sqrt(var),
count = sum(wt)) %>%
mutate(category = "Overall")
bind_rows(output1,output2) %>%
mutate(ages = ifelse(is.na(ages),"Overall (2018)",ages)) %>%
arrange(ind,desc(category))
Both of these give me the same answer; the only change required to summarize a different subset of questions is in the gather
call.
Upvotes: 1
Reputation: 39184
Here is a workaround. First, I re-wrote your function as group_scores3
, which achieves the same thing but replace some part with base R syntex. I also added a column in the final output to show which column was the input column from df2
.
group_scores3 <- function(field) {
# The following four lines do the same things as the first chunk in your function
df <- df2[, c(field, "ages", "wt")]
df <- df[df[[field]] <= 5 & !is.na(df[[field]]), ]
df$int = as.integer(df[[field]])
df <- left_join(df, index5, by = "int", copy=FALSE)
ov <- df %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Overall") %>%
mutate(group = "Overall (2018)")
ag <- df %>%
group_by(ages) %>%
summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Age Group") %>%
rename(group = ages)
combined <- bind_rows(ov, ag)
# Add a column to show which question
combined$Q <- field
return(combined)
}
And then I created a vector showing all the target names.
# Create a vector with target column names
cols <- str_subset(names(df2), "^Q")
Finally, we can use map_dfr
to loop through the columns. The output is a data frame with all individual outputs combined. Notice that the warning message is not from map_dfr
, but from some of the individual columns when using group_scores3
.
# Perform the analysis
map_dfr(cols, ~group_scores3(.))
# A tibble: 28 x 7 mean var sd count cat group Q <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> 1 56.2 514. 22.7 17.5 Overall Overall (2018) Q31_A_1 2 58.4 548. 23.4 4.15 Age Group 35-49 years Q31_A_1 3 51.3 194. 13.9 6.17 Age Group 50-64 years Q31_A_1 4 59.1 894. 29.9 7.20 Age Group 65+ years Q31_A_1 5 53.6 553. 23.5 18.0 Overall Overall (2018) Q31_A_2 6 52.8 813. 28.5 4.15 Age Group 35-49 years Q31_A_2 7 50 198. 14.1 7.14 Age Group 50-64 years Q31_A_2 8 57.9 947. 30.8 6.71 Age Group 65+ years Q31_A_2 9 63.4 414. 20.4 18.4 Overall Overall (2018) Q31_A_3 10 56.9 720. 26.8 5.08 Age Group 35-49 years Q31_A_3 # ... with 18 more rows Warning messages: 1: In wtd.var(factor, wt) : only one effective observation; variance estimate undefined 2: In wtd.var(factor, wt) : only one effective observation; variance estimate undefined 3: In wtd.var(factor, wt) : only one effective observation; variance estimate undefined 4: In wtd.var(factor, wt) : only one effective observation; variance estimate undefined
Upvotes: 1