Reputation: 143
I am working with some survey data where some of the responses have been coded inconsistently. For instance' "I don't know" may be coded as 4, 5, 97, or 777. I want to standardize all of these responses as "77" as efficiently as possible. I would like to use a crosswalk and want to avoid creating a new recode command for every variable in which this occurs, if possible, since there are several.
library(tidyverse)
#df with the inconsistent fourth category
var1 <- c("1", "2", "3", "4")
var2 <- c("1", "2", "3", "5")
var3 <- c("1", "2", "3", "97")
var4 <- c("1", "2", "3", "777")
df <- data.frame(var1, var2, var3, var4)
var <- c("var1", "var2", "var3", "var4")
oldvalue <- c("4", "6", "97", "777")
newvalue <- c("77", "77", "77", "77")
#crosswalk of old values to new values
cw <- data.frame(var, oldvalue, newvalue)
recodevars = cw$var
A few things I have tried are as follows, although I haven't had any luck with any. Please let me know if you have any suggestions.
rec_all = df %>%
transmute_at(vars(recodevars), funs(recode(., cw$oldvalue = cw$newvalue)))
for(i in recodevars){
rec_all = df %>%
transmute_at(vars(recodevars), funs(ifelse(i == cw$oldval, cw$newval, i)))
}
Upvotes: 1
Views: 798
Reputation: 5798
# Split-Apply-Combine:
do.call(
cbind,
Map(
function(x, y){
replace(
x,
x == y$oldvalue,
y$newvalue
)
},
split.default(
df,
seq_len(
ncol(
df
)
)
),
split(
cw,
seq_len(
nrow(
cw
)
)
)
)
)
Upvotes: 0
Reputation: 389235
Using purrr::pmap_dfc
-
purrr::pmap_dfc(cw, ~{
tmp <- df[..1]
tmp[tmp == ..2] <- ..3
tmp
})
# var1 var2 var3 var4
#1 1 1 1 1
#2 2 2 2 2
#3 3 3 3 3
#4 77 5 77 77
Upvotes: 0
Reputation: 102625
A base R option
setNames(list2DF(
lapply(
names(df),
function(k) {
with(
cw,
replace(df[[k]], df[[k]] == oldvalue[var == k], newvalue[var == k])
)
}
)
), names(df))
gives
var1 var2 var3 var4
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 77 5 77 77
Upvotes: 2
Reputation: 1959
If you're dealing with factors ("1") and not numeric responses (1), the forcats package may be your friend.
This will recode anything not in the notForRecode string as "77"
notForRecode <- c("1", "2", "3") # Add others as wanted
df <- df %>%
mutate(across(everything(), ~ fct_other(.x, keep = notForRecode, other_level = "77")))
Upvotes: 3
Reputation: 4427
Though not part of the tidyverse apply
is the old standard function to apply a function to any value in a data.frame, essentially making this a one liner:
var1 <- c("1", "2", "3", "4")
var2 <- c("1", "2", "3", "5")
var3 <- c("1", "2", "3", "97")
var4 <- c("1", "2", "3", "777")
df <- data.frame(var1, var2, var3, var4)
var <- c("var1", "var2", "var3", "var4")
oldvalue <- c("4", "6", "97", "777")
new.df <- apply(df[, var], 1:2, function(x) if(x %in% oldvalue) 77 else x)
print(new.df)
Upvotes: 4
Reputation: 4344
if the values are independed of the column like in your example, then you can simply use this code:
df %>%
dplyr::mutate(across(everything(), ~ ifelse(.x %in% c(4,6,97,777), 77, .x)))
Upvotes: 4
Reputation: 887831
Loop across
the 'recodevars' columns, subset the 'cw' dataset rows based on the column name (cur_column()
) while removing the first column, use that to match and replace the values of the data, and coalesce
with the original column so that the NA
elements for non-matches are replaced with corresponding original column values
library(dplyr)
library(tibble)
df %>%
mutate(across(all_of(recodevars),
~ coalesce(deframe(cw[cw$var ==cur_column(),][-1])[as.character(.)], .)))
-output
var1 var2 var3 var4
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 77 5 77 77
Upvotes: 5