Reputation: 368
I have the following df bhs1
:
structure(list(bhs1_1 = c(NA, 1, NA, 2, 1, 2, 2, 2, 1, 2, 1,
2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2), bhs1_2 = c(NA,
2, NA, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2,
1, 1, 2, 2, 2), bhs1_3 = c(NA, 1, NA, 2, 2, 2, 2, 2, 2, 2, 1,
2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2), bhs1_4 = c(NA,
2, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1,
1, 1, 2, 1, 1), bhs1_5 = c(NA, 1, NA, 1, 2, 2, 2, 2, 2, 2, 1,
2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1), bhs1_6 = c(NA,
1, NA, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, NA, 2, 1,
2, NA, 1, 1, 2), bhs1_7 = c(NA, 1, NA, 1, 2, 1, 1, 1, 1, 1, 2,
1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1), bhs1_8 = c(NA,
2, NA, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1,
2, 1, 2, 2, 2), bhs1_9 = c(NA, 1, NA, 2, 1, 1, 1, 1, 2, 1, 2,
1, 1, 1, NA, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2), bhs1_10 = c(NA,
2, NA, 1, 2, 2, 2, 2, 1, 2, 1, 1, NA, 2, 1, 1, 1, 2, 1, 2, 2,
2, 2, 1, 1, 2), bhs1_11 = c(NA, 2, NA, 2, 2, 1, 1, 1, 2, 1, 1,
1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 2, 2, 1), bhs1_12 = c(NA,
2, NA, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1,
1, 2, 2, 1, 1), bhs1_13 = c(NA, 1, NA, 1, 2, 2, 2, 2, 1, 1, 1,
2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2), bhs1_14 = c(NA,
2, NA, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
1, 1, 2, 2, 1), bhs1_15 = c(NA, 1, NA, 2, 2, 2, 2, 2, 2, 1, 2,
2, 2, 2, 1, 1, 2, 2, 2, NA, 2, 2, 2, 1, 2, 2), bhs1_16 = c(NA,
2, NA, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1,
1, 2, 2, 2, 2), bhs1_17 = c(NA, 2, NA, 2, 2, 1, 1, 1, 2, 1, 1,
1, 1, 1, 2, 2, 1, NA, 2, 2, 1, 1, 1, 2, 2, 2), bhs1_18 = c(NA,
1, NA, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
1, 1, 2, 1, 1), bhs1_19 = c(NA, 1, NA, 2, 1, 2, 2, 2, 1, 2, 2,
2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2), bhs1_20 = c(NA,
2, NA, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1, 1, 2, 2, 2)), row.names = c(NA, -26L), class = c("tbl_df",
"tbl", "data.frame"))
bhs1
was passed through as_tibble()
.
I am interested in calculating a new variable bhs1$total
, that is the prorated mean across variables bhs1_1:bhs1_20
. The reason for prorating means is so that observations with missing values can be compared to observations without missing values. At the risk of stating the obvious:
NAs
and, then divide by 18 to have a mean comparable to,I am aware of (although do not yet know how to execute) multiple imputation, but I do not want to use this for this exercise.
I have tried the following code:
# A tibble: 908 x 21
bhs1_1 bhs1_2 bhs1_3 bhs1_4 bhs1_5 bhs1_6 bhs1_7 bhs1_8 bhs1_9
* <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 NA NA NA NA NA NA NA NA NA
2 1 2 1 2 1 1 1 2 1
3 NA NA NA NA NA NA NA NA NA
4 2 2 2 1 1 2 1 2 2
5 1 1 2 1 2 1 2 2 1
6 2 1 2 1 2 2 1 2 1
7 2 2 2 1 2 2 1 2 1
8 2 1 2 1 2 2 1 2 1
9 1 2 2 1 2 2 1 1 2
10 2 2 2 1 2 1 1 2 1
# ... with 898 more rows, and 12 more variables: bhs1_10 <dbl>,
# bhs1_11 <dbl>, bhs1_12 <dbl>, bhs1_13 <dbl>, bhs1_14 <dbl>,
# bhs1_15 <dbl>, bhs1_16 <dbl>, bhs1_17 <dbl>, bhs1_18 <dbl>,
# bhs1_19 <dbl>, bhs1_20 <dbl>, meanTest <dbl>
Which works as expected. However, when I enter the column names instead of the column numbers, it fails:
> bhs1$meanTest <- rowMeans(bhs1[,"bhs1_1":"bhs1_20"], na.rm=TRUE)
Error in "bhs1_1":"bhs1_20" : NA/NaN argument
5. check_names_df(j, x)
4. `[.tbl_df`(bhs1, , "bhs1_1":"bhs1_20")
3. bhs1[, "bhs1_1":"bhs1_20"]
2. is.data.frame(x)
1. rowMeans(bhs1[, "bhs1_1":"bhs1_20"], na.rm = TRUE)
I think it is much more straight forward to use variable / column names instead of column numbers. Is there an elegant way to write code to meet this use case? If so, can someone please point me in the right direction and/or provide me with a sample?
Thank you in advance for your consideration and help.
Upvotes: 1
Views: 2155
Reputation: 887223
We need to get a vector of names
nm1 <- paste0("bhs1_", 1:20)
bhs1$meanTest <- rowMeans(bhs1[nm1], na.rm = TRUE)
If the names are not having a pattern and we are interested to subset between the start and stop names, then use match
to get the column index, get a sequence (:
)
nm1 <- Reduce(`:`, match(c("bhs1_1", "bhs1_20"), names(bhs1)))
and use that in rowMeans
Or if we are using the tidyverse
, we can specify the range (:
)
bhs1 <- bhs1 %>%
select(bhs1_1:bhs1_20) %>% #can replace with 'nm1'
rowMeans(., na.rm = TRUE) %>%
bind_cols(bhs1, meanTest = .)
If we need to do the rowMeans
on multiple sets of columns, use the map2
from purrr
by either passing it as a list of quosures for the 'start' and 'end' column names in map2
and then do the select
by evaluating (!!
) the quosures to get the rowMeans
afterwards
library(purrr)
map2_df(quos(bhs1_1, bhs1_4), quos(bhs1_5, bhs1_8), ~
bhs1 %>%
select((!! .x) : (!! .y)) %>%
rowMeans(., na.rm = TRUE)) %>%
bind_cols(bhs1, .)
Or we can pass as a vector of strings and then do the conversion to symbol (with sym
from rlang
) and evaluate (!!
)
map2(c("bhs1_1", "bhs1_4"), c("bhs1_5", "bhs1_8"), ~
bhs1 %>%
select(!!(rlang::sym(.x)): !!(rlang::sym(.y)))) %>%
rowMeans(., na.rm = TRUE)) %>%
bind_cols(bhs1, .)
Upvotes: 3