Reputation: 145745
I frequently need to recode some (not all!) values in a data frame column based off of a look-up table. I'm not satisfied by the ways I know of to solve the problem. I'd like to be able to do it in a clear, stable, and efficient way. Before I write my own function, I'd want to make sure I'm not duplicating something standard that's already out there.
## Toy example
data = data.frame(
id = 1:7,
x = c("A", "A", "B", "C", "D", "AA", ".")
)
lookup = data.frame(
old = c("A", "D", "."),
new = c("a", "d", "!")
)
## desired result
# id x
# 1 1 a
# 2 2 a
# 3 3 B
# 4 4 C
# 5 5 d
# 6 6 AA
# 7 7 !
I can do it with a join, coalesce, unselect as below, but this isn't as clear as I'd like - too many steps.
## This works, but is more steps than I want
library(dplyr)
data %>%
left_join(lookup, by = c("x" = "old")) %>%
mutate(x = coalesce(new, x)) %>%
select(-new)
It can also be done with dplyr::recode
, as below, converting the lookup table to a named lookup vector. I prefer lookup
as a data frame, but I'm okay with the named vector solution. My concern here is that recode
is the Questioning lifecycle phase, so I'm worried that this method isn't stable.
lookup_v = pull(lookup, new) %>% setNames(lookup$old)
data %>%
mutate(x = recode(x, !!!lookup_v))
It could also be done with, say, stringr::str_replace
, but using regex for whole-string matching isn't efficient. I suppose there is forcats::fct_recode
is a stable version of recode
, but I don't want a factor
output (though mutate(x = as.character(fct_recode(x, !!!lookup_v)))
is perhaps my favorite option so far...).
I had hoped that the new-ish rows_update()
family of dplyr
functions would work, but it is strict about column names, and I don't think it can update the column it's joining on. (And it's Experimental, so doesn't yet meet my stability requirement.)
Summary of my requirements:
character
class input. Working more generally is a nice-to-have.tidyverse
packages (though I'd also be interested in seeing a data.table
solution)Upvotes: 26
Views: 2123
Reputation: 18612
Expanding the original dataset to 10M rows, 30 runs using microbenchmark gave the follow results on my computer:
Note that forcats::fct_recode
and dplyr::recode
solutions mentioned by the OP have also been included. Neither works with the updated data because the named vector that resolves to . = !
will throw an error, which is why results are tested on the original dataset.
library(data.table)
library(dplyr)
library(purrr)
library(stringr)
library(plyr)
library(forcats)
data = data.frame(
id = 1:5,
x = c("A", "A", "B", "C", "D")
)
lookup = data.frame(
old = c("A", "D"),
new = c("a", "d")
)
set.seed(1)
data <- data[sample(1:5, 1E7, replace = T),]
dt_lookup <- data.table::copy(lookup)
dplyr_coalesce <- function(){
lookupV <- setNames(lookup$new, lookup$old)
data %>%
mutate(x = coalesce(lookupV[ x ], x))
}
datatable_in <- function(){
lookupV <- setNames(lookup$new, lookup$old)
dt_data[ x %in% names(lookupV), x := lookupV[ x ] ]
}
datatable <- function(){
## If needed
# setkey(data,x)
# setkey(lookup,old)
dt_data[dt_lookup, x:=new, on =.(x=old)]
}
purrr_modify_if <- function(){
lookupV <- setNames(lookup$new, lookup$old)
data %>%
mutate(x = modify_if(x, x %in% lookup$old, ~ lookupV[.x]))
}
stringr_str_replace_all_update <- function(){
lookupV <- setNames(lookup$new, do.call(sprintf, list("^\\Q%s\\E$", lookup$old)))
data %>%
mutate(x = str_replace_all(x, lookupV))
}
base_named_vector <- function(){
lookupV <- c(with(lookup, setNames(new, old)), setNames(nm = setdiff(unique(data$x), lookup$old)))
lookupV[data$x]
}
base_ifelse <- function(){
lookupV <- setNames(lookup$new, lookup$old)
with(data, ifelse(x %in% lookup$old, lookup$new, x))
}
plyr_mapvalues <- function(){
data %>%
mutate(x = mapvalues(x, lookup$old, lookup$new, warn_missing = F))
}
base_match <- function(){
tochange <- match(data$x, lookup$old, nomatch = 0)
data$x[tochange > 0] <- lookup$new[tochange]
}
base_local_safe_lookup <- function(){
lv <- structure(lookup$new, names = lookup$old)
safe_lookup <- function(val) {
new_val <- lv[val]
unname(ifelse(is.na(new_val), val, new_val))
}
safe_lookup(data$x)
}
dplyr_recode <- function(){
lookupV <- setNames(lookup$new, lookup$old)
data %>%
mutate(x = recode(x, !!!lookupV))
}
base_for <- function(){
for (i in seq_len(nrow(lookup))) {
data$x[data$x == lookup$old[i]] = lookup$new[i]
}
}
datatable_for <- function(){
for (i in seq_len(nrow(lookup))) {
dt_data[x == lookup$old[i], x := lookup$new[i]]
}
}
forcats_fct_recode <- function(){
lookupV <- setNames(lookup$new, lookup$old)
data %>%
mutate(x = as.character(fct_recode(x, !!!lookupV)))
}
datatable_set <- function(){
tochange <- dt_data[, chmatch(x, lookup$old, nomatch = 0)]
set(dt_data, i = which(tochange > 0), j = "x", value = lookup$new[tochange])
}
library(microbenchmark)
bench <- microbenchmark(dplyr_coalesce(),
datatable(),
datatable_in(),
datatable_for(),
base_for(),
purrr_modify_if(),
stringr_str_replace_all_update(),
base_named_vector(),
base_ifelse(),
plyr_mapvalues(),
base_match(),
base_local_safe_lookup(),
dplyr_recode(),
forcats_fct_recode(),
datatable_set(),
times = 30L,
setup = {
dt_data <- data.table::copy(data)
setDT(dt_data)
setDT(dt_lookup)
})
bench$expr <- fct_rev(fct_reorder(bench$expr, bench$time, mean))
ggplot2::autoplot(bench)
Thanks to @Waldi and @nicola for advice implementing data.table
solutions in the benchmark.
Upvotes: 15
Reputation: 39647
Another base variant using setNames
making use that []
will take the first match.
data$x <- setNames(c(lookup$new, data$x), c(lookup$old, data$x))[data$x]
#dats$x <- c(setNames(lookup$new, lookup$old), setNames(data$x, data$x))[data$x] #Variant
#data$x <- (\(.) setNames(c(lookup$new, .), c(lookup$old, .))[.])(data$x) #Variant
data
# id x
#1 1 a
#2 2 a
#3 3 B
#4 4 C
#5 5 d
#6 6 AA
#7 7 !
In case data$x
is not character
.
data$x <- setNames(c(lookup$new, data$x), c(lookup$old, data$x))[as.character(data$x)]
The same but using match
data$x <- c(lookup$new, data$x)[match(data$x, c(lookup$old, data$x))]
or fmatch
:
library(fastmatch)
data$x <- c(lookup$new, data$x)[fmatch(data$x, c(lookup$old, data$x))]
In case data$x
is long and has many duplicates it might help to take only unique
.
data$x <- (\(.) c(lookup$new, .)[match(data$x, c(lookup$old, .))])(unique(data$x))
#Unique and not in old
(\(.) c(lookup$new, .)[match(data$x, c(lookup$old, .))])(setdiff(data$x, lookup$old))
#Maybe faster as setdiff uses match for the whole vector
(\(.) c(lookup$new, .)[match(data$x, c(lookup$old, .))])(setdiff(unique(data$x), lookup$old))
And here kit::funique
or collapse::funique
could be a used instead of unique
.
Anther way is the usage of ==
with which
and usage of rep
.
i <- lapply(lookup$old, \(s) which(s == data$x))
data$x[unlist(i)] <- rep(lookup$new, lengths(i))
When it is a character vector it could be stored as a factor
and the levels
of a factor can be renamed. (It was asked that this was not wanted but it could be very performant.)
x <- factor(data$x)
levels(x)[match(lookup$old, levels(x), nlevels(x)+1L)] <- lookup$new
x
#[1] a a B C d AA !
#Levels: ! a AA B C d
Benchmark taking data from @LMc
data = data.frame(
id = 1:7,
x = c("A", "A", "B", "C", "D", "AA", ".")
)
lookup = data.frame(
old = c("A", "D", "."),
new = c("D", "A", "!")
)
set.seed(1)
data <- data[sample(nrow(data), 1E7, replace = TRUE),]
library(data.table)
library(fastmatch)
dt_data <- data.table::copy(data)
system.time(setDT(dt_data))
# User System verstrichen
# 0.001 0.000 0.001
dtK_data <- data.table::copy(data)
dtK_lookup <- data.table::copy(lookup)
system.time({
setDT(dtK_data)
setDT(dtK_lookup)
setkey(dtK_data, x)
setkey(dtK_lookup,old) })
#Note: dt uses multiple cores
# User System verstrichen
# 0.373 0.067 0.141
f_data <- data.table::copy(data)
system.time(f_data$x <- factor(f_data$x))
# User System verstrichen
# 0.238 0.032 0.270
C0_data <- data.table::copy(data)
fun <- alist(coalesceDeframe = C0_data$x <- dplyr::coalesce(tibble::deframe(lookup)[data$x], data$x),
coalesce = C0_data$x <- dplyr::coalesce(setNames(lookup$new, lookup$old)[data$x], data$x),
"recode" = C0_data$x <- dplyr::recode(data$x, !!!setNames(lookup$new, lookup$old)),
mapvalues = C0_data$x <- plyr::mapvalues(data$x, lookup$old, lookup$new, FALSE),
"unique" = (\(.) C0_data$x <- c(lookup$new, .)[match(data$x, c(lookup$old, .))])(unique(data$x)),
"match0" = {i <- match(data$x, lookup$old, nomatch = 0)
C0_data$x[i>0] <- lookup$new[i]},
"matchNA" = {i <- match(data$x, lookup$old)
j <- which(!is.na(i))
C0_data$x[j] <- lookup$new[i[j]]},
"rep" = {i <- lapply(lookup$old, \(s) which(s == data$x))
C0_data$x[unlist(i)] <- rep(lookup$new, lengths(i))},
"uniqueFmatch" = (\(.) C0_data$x <- c(lookup$new, .)[fmatch(data$x, c(lookup$old, .))])(unique(data$x)),
datatable = Cdt_data[lookup, x:=new, on=.(x=old)],
datatableKey = CdtK_data[dtK_lookup, x:=new, on=.(x=old)],
"collapseFmatch" = (\(.) C0_data$x <- c(lookup$new, .)[fmatch(data$x, c(lookup$old, .))])(collapse::funique(data$x)),
"kitFmatch" = (\(.) C0_data$x <- c(lookup$new, .)[fmatch(data$x, c(lookup$old, .))])(kit::funique(data$x)),
"factor" = levels(Cf_data$x)[match(lookup$old, levels(Cf_data$x), nlevels(Cf_data$x)+1L)] <- lookup$new
)
bench <- microbenchmark::microbenchmark(list = fun, times = 7L,
control=list(order="block"),
setup = {Cdt_data <- data.table::copy(dt_data)
CdtK_data <- data.table::copy(dtK_data)
Cf_data <- data.table::copy(f_data)
gc()
})
bench
ggplot2::autoplot(bench, log=FALSE)
Result
Unit: milliseconds
expr min lq mean median uq max neval
coalesceDeframe 1419.85752 1431.90215 1441.67163 1437.54078 1456.58562 1457.32759 7
coalesce 1416.81792 1418.58149 1422.67433 1420.02443 1424.95336 1434.80827 7
recode 1162.67033 1163.04287 1168.39498 1163.63317 1168.59404 1189.18758 7
mapvalues 613.99308 616.50627 618.56250 617.50594 617.96866 629.48865 7
match0 369.73388 379.07043 390.64808 385.51089 403.51618 414.11857 7
matchNA 360.17730 375.59077 378.24535 376.08590 385.57151 389.12971 7
rep 351.87641 352.77764 364.34904 357.27131 375.93110 383.87809 7
unique 340.98122 341.27901 341.87930 341.63321 342.35865 343.26537 7
uniqueFmatch 253.75138 253.90585 254.33802 254.14054 254.73900 255.18455 7
datatable 237.79038 252.08057 251.22063 253.78246 254.10755 254.59535 7
datatableKey 165.42183 165.55716 166.01792 165.68011 166.34137 167.22646 7
collapseFmatch 125.93940 126.10227 126.25538 126.13199 126.42614 126.65943 7
kitFmatch 125.72638 125.94671 127.43846 126.16227 126.96446 134.35822 7
factor 39.35195 39.43222 39.45809 39.44678 39.47576 39.59197 7
In this case using a factor
and updating it's levels
is fastest. Using a fast version of unique
improves speed substantially.
Upvotes: 1
Reputation: 41210
A direct data.table
solution, without %in%
.
Depending on the length of the lookup / data tables, adding keys could improve performance substantially, but this isn't the case on this simple example.
library(data.table)
setDT(data)
setDT(lookup)
## If needed
# setkey(data,x)
# setkey(lookup,old)
data[lookup, x:=new, on=.(x=old)]
data
id x
1: 1 a
2: 2 a
3: 3 B
4: 4 C
5: 5 d
6: 6 AA
7: 7 !
Upvotes: 15
Reputation: 18541
I basically share the same problem. Although dplyr::recode
is in the "questioning" life cycle I don't expect it to become deprecated. At some point it might be superseded, but even in this case it should still be usable. Therefore I'm using a wrapper around dplyr::recode
which allows the use of named vectors and or two vectors (which could be a lookup table).
library(dplyr)
library(rlang)
recode2 <- function(x, new, old = NULL, .default = NULL, .missing = NULL) {
if (!rlang::is_named(new) && !is.null(old)) {
new <- setNames(new, old)
}
do.call(dplyr::recode,
c(.x = list(x),
.default = list(.default),
.missing = list(.missing),
as.list(new)))
}
data = data.frame(
id = 1:7,
x = c("A", "A", "B", "C", "D", "AA", ".")
)
lookup = data.frame(
old = c("A", "D", "."),
new = c("a", "d", "!")
)
# two vectors new / old
data %>%
mutate(x = recode2(x, lookup$new, lookup$old))
#> id x
#> 1 1 a
#> 2 2 a
#> 3 3 B
#> 4 4 C
#> 5 5 d
#> 6 6 AA
#> 7 7 !
# named vector
data %>%
mutate(x = recode2(x, c("A" = "a",
"D" = "d",
"." = "!")))
#> id x
#> 1 1 a
#> 2 2 a
#> 3 3 B
#> 4 4 C
#> 5 5 d
#> 6 6 AA
#> 7 7 !
Created on 2021-04-21 by the reprex package (v0.3.0)
Upvotes: 4
Reputation: 18612
You could use purrr::modify_if
to only apply the named vector to values that exist in it. Though not a specified requirement, it has the benefit of the .else
argument, which allows you to apply a different function to values not in your lookup.
I also wanted to include the use of tibble::deframe
here to create the named vector. It is slower than setNames
, though.
lookupV <- deframe(lookup)
data %>%
mutate(x = modify_if(x, x %in% lookup$old, ~ lookupV[.x]))
Alternatively, you could use stringr::str_replace_all
, which can take a named vector for the replacement
argument.
data %>%
mutate(x = str_replace_all(x, lookupV))
Update
To accommodate the change to your edited example, the named vector used in str_replace_all
needs to be modified. In this way, the entire literal string needs to be match so that "A" does not get substituted in "AA", or "." does not replace everything:
lookupV <- setNames(lookup$new, do.call(sprintf, list("^\\Q%s\\E$", lookup$old)))
data %>%
mutate(x = str_replace_all(x, lookupV))
Using dplyr::left_join
this is very similar to OP solution, but uses .keep
argument of mutate
so it has less steps. This argument is currently in the experimental lifecycle and so it is not included in the benchmark (though it is around the middle of posted solutions).
left_join(data, lookup, by = c("x" = "old")) %>%
mutate(x = coalesce(new, x) , .keep = "unused")
R
Create a substitution value for every unique value in your dataframe.
lookupV <- c(with(lookup, setNames(new, old)), setNames(nm = setdiff(unique(data$x), lookup$old)))
data$x <- lookupV[data$x]
with(data, ifelse(x %in% lookup$old, lookupV[x], x))
Upvotes: 7
Reputation: 26333
A base R
option using %in%
andmatch
- thanks to @LMc & @nicola
tochange <- match(data$x, lookup$old, nomatch = 0)
data$x[tochange > 0] <- lookup$new[tochange]
One more data.table
option using set()
and chmatch
library(data.table)
setDT(data)
tochange <- data[, chmatch(x, lookup$old, nomatch = 0)]
set(data, i = which(tochange > 0), j = "x", value = lookup$new[tochange])
Result
data
# id x
#1 1 a
#2 2 a
#3 3 B
#4 4 C
#5 5 d
#6 6 AA
#7 7 !
Upvotes: 8
Reputation: 5138
Another option that is clear is to use a for
-loop with subsetting to loop through the rows of the lookup
table. This will almost always be quicker with data.table
because of auto indexing, or if you set the key (i.e., ?data.table::setkey()
) ahead of time. Also, it will--of course--get slower as the lookup table gets longer. I would guess an update-join would be preferred if there is a long lookup table.
Base R:
for (i in seq_len(nrow(lookup))) {
data$x[data$x == lookup$old[i]] <- lookup$new[i]
}
data$x
# [1] "a" "a" "B" "C" "d" "AA" "!"
Or the same logic with data.table
:
library(data.table)
setDT(data)
for (i in seq_len(nrow(lookup))) {
data[x == lookup$old[i], x := lookup$new[i]]
}
data$x
# [1] "a" "a" "B" "C" "d" "AA" "!"
Data:
data = data.frame(
id = 1:7,
x = c("A", "A", "B", "C", "D", "AA", ".")
)
lookup = data.frame(
old = c("A", "D", "."),
new = c("a", "d", "!")
)
Upvotes: 6
Reputation: 593
dplyr+plyr solution that is in order with all ur bulletpoints (if u consider plyr in the the tidyverse):
data <- data %>%
dplyr::mutate(
x = plyr::mapvalues(x, lookup$old, lookup$new) #Can add , F to remove warnings
)
Upvotes: 4
Reputation: 3412
Another base solution, with a lookup vector:
## Toy example
data = data.frame(
id = 1:5,
x = c("A", "A", "B", "C", "D"),
stringsAsFactors = F
)
lookup = data.frame(
old = c("A", "D"),
new = c("a", "d"),
stringsAsFactors = F
)
lv <- structure(lookup$new, names = lookup$old)
safe_lookup <- function(val) {
new_val <- lv[val]
unname(ifelse(is.na(new_val), val, new_val))
}
data$x <- safe_lookup(data$x)
Upvotes: 4
Reputation: 56004
Combination of a named vector and coalesce:
# make lookup vector
lookupV <- setNames(lookup$new, lookup$old)
data %>%
mutate(x = coalesce(lookupV[ x ], x))
# id x
# 1 1 a
# 2 2 a
# 3 3 B
# 4 4 C
# 5 5 d
Or data.table:
library(data.table)
setDT(data)
data[ x %in% names(lookupV), x := lookupV[ x ] ]
This post might have a better solution for data.table - "update on merge":
Upvotes: 9