How do you calculate rowMeans by column name, not column number

Question

I have the following df bhs1:

structure(list(bhs1_1 = c(NA, 1, NA, 2, 1, 2, 2, 2, 1, 2, 1, 
2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2), bhs1_2 = c(NA, 
2, NA, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 
1, 1, 2, 2, 2), bhs1_3 = c(NA, 1, NA, 2, 2, 2, 2, 2, 2, 2, 1, 
2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2), bhs1_4 = c(NA, 
2, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 
1, 1, 2, 1, 1), bhs1_5 = c(NA, 1, NA, 1, 2, 2, 2, 2, 2, 2, 1, 
2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1), bhs1_6 = c(NA, 
1, NA, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, NA, 2, 1, 
2, NA, 1, 1, 2), bhs1_7 = c(NA, 1, NA, 1, 2, 1, 1, 1, 1, 1, 2, 
1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1), bhs1_8 = c(NA, 
2, NA, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 
2, 1, 2, 2, 2), bhs1_9 = c(NA, 1, NA, 2, 1, 1, 1, 1, 2, 1, 2, 
1, 1, 1, NA, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2), bhs1_10 = c(NA, 
2, NA, 1, 2, 2, 2, 2, 1, 2, 1, 1, NA, 2, 1, 1, 1, 2, 1, 2, 2, 
2, 2, 1, 1, 2), bhs1_11 = c(NA, 2, NA, 2, 2, 1, 1, 1, 2, 1, 1, 
1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 2, 2, 1), bhs1_12 = c(NA, 
2, NA, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 
1, 2, 2, 1, 1), bhs1_13 = c(NA, 1, NA, 1, 2, 2, 2, 2, 1, 1, 1, 
2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2), bhs1_14 = c(NA, 
2, NA, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 
1, 1, 2, 2, 1), bhs1_15 = c(NA, 1, NA, 2, 2, 2, 2, 2, 2, 1, 2, 
2, 2, 2, 1, 1, 2, 2, 2, NA, 2, 2, 2, 1, 2, 2), bhs1_16 = c(NA, 
2, NA, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 
1, 2, 2, 2, 2), bhs1_17 = c(NA, 2, NA, 2, 2, 1, 1, 1, 2, 1, 1, 
1, 1, 1, 2, 2, 1, NA, 2, 2, 1, 1, 1, 2, 2, 2), bhs1_18 = c(NA, 
1, NA, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 
1, 1, 2, 1, 1), bhs1_19 = c(NA, 1, NA, 2, 1, 2, 2, 2, 1, 2, 2, 
2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2), bhs1_20 = c(NA, 
2, NA, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 
1, 1, 2, 2, 2)), row.names = c(NA, -26L), class = c("tbl_df", 
"tbl", "data.frame"))

bhs1 was passed through as_tibble().

I am interested in calculating a new variable bhs1$total, that is the prorated mean across variables bhs1_1:bhs1_20. The reason for prorating means is so that observations with missing values can be compared to observations without missing values. At the risk of stating the obvious:

observation 1 may only have data on 18 variables, so I need to sum the recorded variables, ignore the NAs and, then divide by 18 to have a mean comparable to,
observation 2 who has data on 20 variables, who would have the summed recorded variables divided by 20.

I am aware of (although do not yet know how to execute) multiple imputation, but I do not want to use this for this exercise.

I have tried the following code:

# A tibble: 908 x 21
   bhs1_1 bhs1_2 bhs1_3 bhs1_4 bhs1_5 bhs1_6 bhs1_7 bhs1_8 bhs1_9
 *                  
 1     NA     NA     NA     NA     NA     NA     NA     NA     NA
 2      1      2      1      2      1      1      1      2      1
 3     NA     NA     NA     NA     NA     NA     NA     NA     NA
 4      2      2      2      1      1      2      1      2      2
 5      1      1      2      1      2      1      2      2      1
 6      2      1      2      1      2      2      1      2      1
 7      2      2      2      1      2      2      1      2      1
 8      2      1      2      1      2      2      1      2      1
 9      1      2      2      1      2      2      1      1      2
10      2      2      2      1      2      1      1      2      1
# ... with 898 more rows, and 12 more variables: bhs1_10 ,
#   bhs1_11 , bhs1_12 , bhs1_13 , bhs1_14 ,
#   bhs1_15 , bhs1_16 , bhs1_17 , bhs1_18 ,
#   bhs1_19 , bhs1_20 , meanTest

Which works as expected. However, when I enter the column names instead of the column numbers, it fails:

> bhs1$meanTest <- rowMeans(bhs1[,"bhs1_1":"bhs1_20"], na.rm=TRUE)

Error in "bhs1_1":"bhs1_20" : NA/NaN argument
   5. check_names_df(j, x)
   4. `[.tbl_df`(bhs1, , "bhs1_1":"bhs1_20")
   3. bhs1[, "bhs1_1":"bhs1_20"]
   2. is.data.frame(x)
   1. rowMeans(bhs1[, "bhs1_1":"bhs1_20"], na.rm = TRUE)

I think it is much more straight forward to use variable / column names instead of column numbers. Is there an elegant way to write code to meet this use case? If so, can someone please point me in the right direction and/or provide me with a sample?

Thank you in advance for your consideration and help.

akrun · Accepted Answer

We need to get a vector of names

nm1 <- paste0("bhs1_", 1:20)
bhs1$meanTest <- rowMeans(bhs1[nm1], na.rm = TRUE)

If the names are not having a pattern and we are interested to subset between the start and stop names, then use match to get the column index, get a sequence (:)

nm1 <- Reduce(`:`, match(c("bhs1_1", "bhs1_20"), names(bhs1)))

and use that in rowMeans

Or if we are using the tidyverse, we can specify the range (:)

bhs1 <- bhs1 %>%
          select(bhs1_1:bhs1_20) %>% #can replace with 'nm1'
          rowMeans(., na.rm = TRUE) %>%
          bind_cols(bhs1, meanTest = .)

If we need to do the rowMeans on multiple sets of columns, use the map2 from purrr by either passing it as a list of quosures for the 'start' and 'end' column names in map2 and then do the select by evaluating (!!) the quosures to get the rowMeans afterwards

library(purrr)
map2_df(quos(bhs1_1, bhs1_4), quos(bhs1_5, bhs1_8), ~
         bhs1 %>%
           select((!! .x) : (!! .y)) %>% 
           rowMeans(., na.rm = TRUE)) %>%
   bind_cols(bhs1, .)

Or we can pass as a vector of strings and then do the conversion to symbol (with sym from rlang) and evaluate (!!)

map2(c("bhs1_1", "bhs1_4"), c("bhs1_5", "bhs1_8"), ~ 
           bhs1 %>%
              select(!!(rlang::sym(.x)): !!(rlang::sym(.y)))) %>%
              rowMeans(., na.rm = TRUE)) %>%
    bind_cols(bhs1, .)

How do you calculate rowMeans by column name, not column number

Answers (1)

Related Questions