Reputation: 1040
I have data such as this:
data_in <- read_table2("ID Q26_1_1 Q26_1_2 Q26_1_3 Q26_1_4 Q26_2_1 Q26_2_2 Q26_2_3 Q26_2_4 Q26_3_1 Q26_3_2 Q26_3_3 Q26_3_4 Q26_4_1 Q26_4_2 Q26_4_3 Q26_4_4 Q26_5_1 Q26_5_2 Q26_5_3 Q26_5_4 Q14_1_1 Q14_1_2 Q14_1_3 Q14_1_4 Q14_1_5 Q14_1_6 Q14_2_1 Q14_2_2 Q14_2_3 Q14_2_4 Q14_2_5 Q14_2_6 Q14_3_1 Q14_3_2 Q14_3_3 Q14_3_4 Q14_3_5 Q14_3_6 Q14_4_1 Q14_4_2 Q14_4_3 Q14_4_4 Q14_4_5 Q14_4_6 Q14_5_1 Q14_5_2 Q14_5_3 Q14_5_4 Q14_5_5 Q14_5_6
1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1 NA NA NA NA NA 1 1 1 NA NA NA NA NA NA NA NA NA 1 NA NA NA NA NA 1
2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1
3 1 NA 1 NA 1 NA 1 NA 1 NA NA NA NA NA NA 1 1 1 1 NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA 1 1 NA NA NA NA NA NA NA NA 1 NA NA 1 NA NA NA
4 NA NA NA 1 NA NA NA 1 NA NA NA 1 NA NA NA 1 NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1
5 1 NA NA NA NA 1 1 NA 1 NA NA NA 1 NA NA NA 1 1 1 NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1 NA NA NA NA NA 1 NA NA NA
")
I would like to manipulate the data as so: if the sum of series of questions is equal to 0, then assign missing to that entire series, if it is not equal to 0, then assign "0". I have already written the code to manipulate this data.
The problem is that I have many series of questions, and I would like to write this into a function, so that I can apply it to each question series, so I don't need to keep copy pasting like this:
data_out <-
data_in %>%
### Q26 ####
# Convert the Q26 series missing to zero
mutate(across(matches("Q26"), ~replace_na(., 0))) %>%
mutate_if(is.character, as.numeric) %>%
# Q26_1
mutate(sum_Q26_1=rowSums(select(.,matches(("^Q26_1_[1-4]$"))), na.rm = T)) %>%
mutate(across(matches("^Q26_1_[1-4]$"),.fns=~case_when(sum_Q26_1==0~"NA",
TRUE~as.character(.)))) %>%
# Q26_2
mutate(sum_Q26_2=rowSums(select(.,matches(("^Q26_2_[1-4]$"))), na.rm = T)) %>%
mutate(across(matches("^Q26_2_[1-4]$"),.fns=~case_when(sum_Q26_2==0~"NA",
TRUE~as.character(.)))) %>%
# Q26_3
mutate(sum_Q26_3=rowSums(select(.,matches(("^Q26_3_[1-4]$"))), na.rm = T)) %>%
mutate(across(matches("^Q26_3_[1-4]$"),.fns=~case_when(sum_Q26_3==0~"NA",
TRUE~as.character(.)))) %>%
# Q26_4
mutate(sum_Q26_4=rowSums(select(.,matches(("^Q26_4_[1-4]$"))), na.rm = T)) %>%
mutate(across(matches("^Q26_4_[1-4]$"),.fns=~case_when(sum_Q26_4==0~"NA",
TRUE~as.character(.)))) %>%
# Q26_5
mutate(sum_Q26_5=rowSums(select(.,matches(("^Q26_5_[1-4]$"))), na.rm = T)) %>%
mutate(across(matches("^Q26_5_[1-4]$"),.fns=~case_when(sum_Q26_5==0~"NA",
TRUE~as.character(.)))) %>%
### Q14 ####
mutate(across(matches("Q14"), ~replace_na(., 0))) %>%
mutate_if(is.character, as.numeric) %>%
# Q14_1
mutate(sum_Q14_1=rowSums(select(.,matches(("^Q14_1_[1-6]$"))), na.rm = T)) %>%
mutate(across(matches("^Q14_1_[1-6]$"),.fns=~case_when(sum_Q14_1==0~"NA",
TRUE~as.character(.)))) %>%
# Q14_2
mutate(sum_Q14_2=rowSums(select(.,matches(("^Q14_2_[1-6]$"))), na.rm = T)) %>%
mutate(across(matches("^Q14_2_[1-6]$"),.fns=~case_when(sum_Q14_2==0~"NA",
TRUE~as.character(.)))) %>%
# Q14_3
mutate(sum_Q14_3=rowSums(select(.,matches(("^Q14_3_[1-6]$"))), na.rm = T)) %>%
mutate(across(matches("^Q14_3_[1-6]$"),.fns=~case_when(sum_Q14_3==0~"NA",
TRUE~as.character(.)))) %>%
# Q14_4
mutate(sum_Q14_4=rowSums(select(.,matches(("^Q14_4_[1-6]$"))), na.rm = T)) %>%
mutate(across(matches("^Q14_4_[1-6]$"),.fns=~case_when(sum_Q14_4==0~"NA",
TRUE~as.character(.)))) %>%
# Q14_5
mutate(sum_Q14_5=rowSums(select(.,matches(("^Q14_5_[1-6]$"))), na.rm = T)) %>%
mutate(across(matches("^Q14_5_[1-6]$"),.fns=~case_when(sum_Q14_5==0~"NA",
TRUE~as.character(.))))
Notice that the regex pattern changes over the questions. For example: Q26 series is matches(("^Q26_1_[1-4]$")))
whereas, Q14 series is matches(("^Q14_1_[1-6]$")))
I don't know to wrap this into a function given the changing regex patterns. Any suggestions?
Upvotes: 0
Views: 46
Reputation: 8844
Based on my observation, I assume that
sum_...
columns are just temporary variables, and I can freely drop them if I want;mutate_if(is.character, as.numeric)
later in your pipeline, you actually want each column as.character
and only contains "0"
, "1"
or NA
;sum_Q14_4==0~"NA"
, you intend to assign it a missing value of a character type (which should be NA_character_
), not an "NA"
literal.Then, your code simplifies to:
library(dplyr)
library(tidyr)
data_in %>%
pivot_longer(-ID, names_to = c("qns", "sub1", "sub2"), names_pattern = "([^_]+)_([^_]+)_([^_]+)") %>%
group_by(ID, qns, sub1) %>%
mutate(value = if (all(is.na(value))) NA_character_ else replace_na(value, "0")) %>%
pivot_wider(names_from = c("qns", "sub1", "sub2"))
Output
# A tibble: 5 x 51
# Groups: ID [5]
ID Q26_1_1 Q26_1_2 Q26_1_3 Q26_1_4 Q26_2_1 Q26_2_2 Q26_2_3 Q26_2_4 Q26_3_1 Q26_3_2 Q26_3_3 Q26_3_4 Q26_4_1 Q26_4_2 Q26_4_3 Q26_4_4 Q26_5_1 Q26_5_2
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2 2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
3 3 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 1 1
4 4 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0
5 5 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 1 1
# ... with 32 more variables: Q26_5_3 <chr>, Q26_5_4 <chr>, Q14_1_1 <chr>, Q14_1_2 <chr>, Q14_1_3 <chr>, Q14_1_4 <chr>, Q14_1_5 <chr>, Q14_1_6 <chr>,
# Q14_2_1 <chr>, Q14_2_2 <chr>, Q14_2_3 <chr>, Q14_2_4 <chr>, Q14_2_5 <chr>, Q14_2_6 <chr>, Q14_3_1 <chr>, Q14_3_2 <chr>, Q14_3_3 <chr>,
# Q14_3_4 <chr>, Q14_3_5 <chr>, Q14_3_6 <chr>, Q14_4_1 <chr>, Q14_4_2 <chr>, Q14_4_3 <chr>, Q14_4_4 <chr>, Q14_4_5 <chr>, Q14_4_6 <chr>,
# Q14_5_1 <chr>, Q14_5_2 <chr>, Q14_5_3 <chr>, Q14_5_4 <chr>, Q14_5_5 <chr>, Q14_5_6 <chr>
Update
Since you only want to apply this logic to Q26
and Q14
, then you can try the code below, which allows you to change only a subset of the data.
library(dplyr)
library(tidyr)
only_at <- function(df, ..., join_by, .do) {
order <- names(df)
cols_to_change <- select(df, !!join_by, ...)
cols_to_keep <- select(df, !!join_by, !any_of(names(cols_to_change)))
left_join(.do(cols_to_change), cols_to_keep, join_by)[, order]
}
data_in %>%
only_at(ID, starts_with("Q26"), starts_with("Q14"),
join_by = "ID",
.do = . %>%
pivot_longer(-ID, names_to = c("qns", "sub1", "sub2"), names_pattern = "([^_]+)_([^_]+)_([^_]+)") %>%
group_by(ID, qns, sub1) %>%
mutate(value = if (all(is.na(value))) NA_character_ else replace_na(value, "0")) %>%
pivot_wider(names_from = c("qns", "sub1", "sub2"))
)
Test data look like this
> data_in
# A tibble: 5 x 53
ID random_column Q26_1_1 Q26_1_2 Q26_1_3 Q26_1_4 Q26_2_1 Q26_2_2 random_column2 Q26_2_3 Q26_2_4 Q26_3_1 Q26_3_2 Q26_3_3 Q26_3_4 Q26_4_1 Q26_4_2
<dbl> <chr> <dbl> <lgl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <lgl> <lgl> <dbl> <dbl> <lgl>
1 1 NA NA NA NA NA NA NA two NA NA NA NA NA NA NA NA
2 2 orange NA NA NA NA NA NA one NA NA NA NA NA NA NA NA
3 3 NA 1 NA 1 NA 1 NA NA 1 NA 1 NA NA NA NA NA
4 4 apple NA NA NA 1 NA NA one NA 1 NA NA NA 1 NA NA
5 5 orange 1 NA NA NA NA 1 NA 1 NA 1 NA NA NA 1 NA
# ... with 36 more variables: Q26_4_3 <lgl>, Q26_4_4 <dbl>, Q26_5_1 <dbl>, Q26_5_2 <dbl>, Q26_5_3 <dbl>, Q26_5_4 <dbl>, Q14_1_1 <lgl>, Q14_1_2 <lgl>,
# Q14_1_3 <dbl>, Q14_1_4 <lgl>, Q14_1_5 <lgl>, Q14_1_6 <dbl>, Q14_2_1 <lgl>, Q14_2_2 <lgl>, Q14_2_3 <dbl>, Q14_2_4 <lgl>, Q14_2_5 <lgl>,
# Q14_2_6 <dbl>, Q14_3_1 <dbl>, Q14_3_2 <dbl>, Q14_3_3 <dbl>, Q14_3_4 <lgl>, Q14_3_5 <lgl>, Q14_3_6 <dbl>, Q14_4_1 <lgl>, Q14_4_2 <lgl>,
# Q14_4_3 <dbl>, Q14_4_4 <lgl>, Q14_4_5 <lgl>, Q14_4_6 <dbl>, Q14_5_1 <lgl>, Q14_5_2 <lgl>, Q14_5_3 <dbl>, Q14_5_4 <lgl>, Q14_5_5 <lgl>, Q14_5_6 <dbl>
Output
# A tibble: 5 x 53
# Groups: ID [5]
ID random_column Q26_1_1 Q26_1_2 Q26_1_3 Q26_1_4 Q26_2_1 Q26_2_2 random_column2 Q26_2_3 Q26_2_4 Q26_3_1 Q26_3_2 Q26_3_3 Q26_3_4 Q26_4_1 Q26_4_2
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 NA NA NA NA NA NA NA two NA NA NA NA NA NA NA NA
2 2 orange NA NA NA NA NA NA one NA NA NA NA NA NA NA NA
3 3 NA 1 0 1 0 1 0 NA 1 0 1 0 0 0 0 0
4 4 apple 0 0 0 1 0 0 one 0 1 0 0 0 1 0 0
5 5 orange 1 0 0 0 0 1 NA 1 0 1 0 0 0 1 0
# ... with 36 more variables: Q26_4_3 <chr>, Q26_4_4 <chr>, Q26_5_1 <chr>, Q26_5_2 <chr>, Q26_5_3 <chr>, Q26_5_4 <chr>, Q14_1_1 <chr>, Q14_1_2 <chr>,
# Q14_1_3 <chr>, Q14_1_4 <chr>, Q14_1_5 <chr>, Q14_1_6 <chr>, Q14_2_1 <chr>, Q14_2_2 <chr>, Q14_2_3 <chr>, Q14_2_4 <chr>, Q14_2_5 <chr>,
# Q14_2_6 <chr>, Q14_3_1 <chr>, Q14_3_2 <chr>, Q14_3_3 <chr>, Q14_3_4 <chr>, Q14_3_5 <chr>, Q14_3_6 <chr>, Q14_4_1 <chr>, Q14_4_2 <chr>,
# Q14_4_3 <chr>, Q14_4_4 <chr>, Q14_4_5 <chr>, Q14_4_6 <chr>, Q14_5_1 <chr>, Q14_5_2 <chr>, Q14_5_3 <chr>, Q14_5_4 <chr>, Q14_5_5 <chr>, Q14_5_6 <chr>
Test data
structure(list(ID = c(1, 2, 3, 4, 5), random_column = c(NA, "orange",
NA, "apple", "orange"), Q26_1_1 = c(NA, NA, 1, NA, 1), Q26_1_2 = c(NA,
NA, NA, NA, NA), Q26_1_3 = c(NA, NA, 1, NA, NA), Q26_1_4 = c(NA,
NA, NA, 1, NA), Q26_2_1 = c(NA, NA, 1, NA, NA), Q26_2_2 = c(NA,
NA, NA, NA, 1), random_column2 = c("two", "one", NA, "one", NA
), Q26_2_3 = c(NA, NA, 1, NA, 1), Q26_2_4 = c(NA, NA, NA, 1,
NA), Q26_3_1 = c(NA, NA, 1, NA, 1), Q26_3_2 = c(NA, NA, NA, NA,
NA), Q26_3_3 = c(NA, NA, NA, NA, NA), Q26_3_4 = c(NA, NA, NA,
1, NA), Q26_4_1 = c(NA, NA, NA, NA, 1), Q26_4_2 = c(NA, NA, NA,
NA, NA), Q26_4_3 = c(NA, NA, NA, NA, NA), Q26_4_4 = c(NA, NA,
1, 1, NA), Q26_5_1 = c(NA, NA, 1, NA, 1), Q26_5_2 = c(NA, NA,
1, NA, 1), Q26_5_3 = c(NA, NA, 1, NA, 1), Q26_5_4 = c(NA, NA,
NA, 1, NA), Q14_1_1 = c(NA, NA, NA, NA, NA), Q14_1_2 = c(NA,
NA, NA, NA, NA), Q14_1_3 = c(NA, NA, 1, NA, 1), Q14_1_4 = c(NA,
NA, NA, NA, NA), Q14_1_5 = c(NA, NA, NA, NA, NA), Q14_1_6 = c(1,
1, NA, 1, NA), Q14_2_1 = c(NA, NA, NA, NA, NA), Q14_2_2 = c(NA,
NA, NA, NA, NA), Q14_2_3 = c(NA, NA, 1, NA, 1), Q14_2_4 = c(NA,
NA, NA, NA, NA), Q14_2_5 = c(NA, NA, NA, NA, NA), Q14_2_6 = c(1,
1, NA, 1, NA), Q14_3_1 = c(1, NA, NA, NA, NA), Q14_3_2 = c(1,
NA, 1, NA, NA), Q14_3_3 = c(NA, NA, 1, NA, 1), Q14_3_4 = c(NA,
NA, NA, NA, NA), Q14_3_5 = c(NA, NA, NA, NA, NA), Q14_3_6 = c(NA,
1, NA, 1, NA), Q14_4_1 = c(NA, NA, NA, NA, NA), Q14_4_2 = c(NA,
NA, NA, NA, NA), Q14_4_3 = c(NA, NA, NA, NA, 1), Q14_4_4 = c(NA,
NA, NA, NA, NA), Q14_4_5 = c(NA, NA, NA, NA, NA), Q14_4_6 = c(1,
1, 1, 1, NA), Q14_5_1 = c(NA, NA, NA, NA, NA), Q14_5_2 = c(NA,
NA, NA, NA, NA), Q14_5_3 = c(NA, NA, 1, NA, 1), Q14_5_4 = c(NA,
NA, NA, NA, NA), Q14_5_5 = c(NA, NA, NA, NA, NA), Q14_5_6 = c(1,
1, NA, 1, NA)), row.names = c(NA, -5L), class = c("tbl_df", "tbl",
"data.frame"))
Some Explanations about !!join_by
and .do = . %>% ...
Simply speaking, !!
is a special operator used in the <tidy-select>
semantics. It forces the evaluation of join_by
. In other words, it tells the select
function that join_by
is not a variable in the dataframe. For more comprehensive explanations, see this R-blog.
.do
is an argument that takes a function as its input. Nothing special with the name. You can also call it dosomthing
or whatever_you_like
. The key here is this part:
. %>%
pivot_longer(-ID, names_to = c("qns", "sub1", "sub2"), names_pattern = "([^_]+)_([^_]+)_([^_]+)") %>%
group_by(ID, qns, sub1) %>%
mutate(value = if (all(is.na(value))) NA_character_ else replace_na(value, "0")) %>%
pivot_wider(names_from = c("qns", "sub1", "sub2"))
When a pipeline starts with a .
, the whole pipeline will be converted into a single function. So the .do = . %>% ...
is just passing a function to the .do
argument. You can try typing . %>% + 1
in R console and see what it returns. For more information, see ?`%>%`
Upvotes: 1