Gregor Thomas
Gregor Thomas

Reputation: 145745

Canonical tidyverse method to update some values of a vector from a look-up table

I frequently need to recode some (not all!) values in a data frame column based off of a look-up table. I'm not satisfied by the ways I know of to solve the problem. I'd like to be able to do it in a clear, stable, and efficient way. Before I write my own function, I'd want to make sure I'm not duplicating something standard that's already out there.

## Toy example
data = data.frame(
  id = 1:7,
  x = c("A", "A", "B", "C", "D", "AA", ".")
)

lookup = data.frame(
  old = c("A", "D", "."),
  new = c("a", "d", "!")
)

## desired result
#   id  x
# 1  1  a
# 2  2  a
# 3  3  B
# 4  4  C
# 5  5  d
# 6  6 AA
# 7  7  !

I can do it with a join, coalesce, unselect as below, but this isn't as clear as I'd like - too many steps.

## This works, but is more steps than I want
library(dplyr)
data %>%
  left_join(lookup, by = c("x" = "old")) %>%
  mutate(x = coalesce(new, x)) %>%
  select(-new)

It can also be done with dplyr::recode, as below, converting the lookup table to a named lookup vector. I prefer lookup as a data frame, but I'm okay with the named vector solution. My concern here is that recode is the Questioning lifecycle phase, so I'm worried that this method isn't stable.

lookup_v = pull(lookup, new) %>% setNames(lookup$old)
data %>%
  mutate(x = recode(x, !!!lookup_v))

It could also be done with, say, stringr::str_replace, but using regex for whole-string matching isn't efficient. I suppose there is forcats::fct_recode is a stable version of recode, but I don't want a factor output (though mutate(x = as.character(fct_recode(x, !!!lookup_v))) is perhaps my favorite option so far...).

I had hoped that the new-ish rows_update() family of dplyr functions would work, but it is strict about column names, and I don't think it can update the column it's joining on. (And it's Experimental, so doesn't yet meet my stability requirement.)

Summary of my requirements:

Upvotes: 26

Views: 2123

Answers (10)

LMc
LMc

Reputation: 18612

Benchmarking

Expanding the original dataset to 10M rows, 30 runs using microbenchmark gave the follow results on my computer:

Note that forcats::fct_recode and dplyr::recode solutions mentioned by the OP have also been included. Neither works with the updated data because the named vector that resolves to . = ! will throw an error, which is why results are tested on the original dataset.

library(data.table)
library(dplyr)
library(purrr)
library(stringr)
library(plyr)
library(forcats)

data = data.frame(
  id = 1:5,
  x = c("A", "A", "B", "C", "D")
)

lookup = data.frame(
  old = c("A", "D"),
  new = c("a", "d")
)

set.seed(1)
data <- data[sample(1:5, 1E7, replace = T),]

dt_lookup <- data.table::copy(lookup)

dplyr_coalesce <- function(){
  lookupV <- setNames(lookup$new, lookup$old)
  data %>% 
    mutate(x = coalesce(lookupV[ x ], x))
}

datatable_in <- function(){
  lookupV <- setNames(lookup$new, lookup$old)
  dt_data[ x %in% names(lookupV), x := lookupV[ x ] ]
}

datatable <- function(){
  ## If needed
  # setkey(data,x)
  # setkey(lookup,old)
  
  dt_data[dt_lookup, x:=new, on =.(x=old)]
}

purrr_modify_if <- function(){
  lookupV <- setNames(lookup$new, lookup$old)
  data %>% 
    mutate(x = modify_if(x, x %in% lookup$old, ~ lookupV[.x]))
}

stringr_str_replace_all_update <- function(){
  lookupV <- setNames(lookup$new, do.call(sprintf, list("^\\Q%s\\E$", lookup$old)))
  
  data %>% 
    mutate(x = str_replace_all(x, lookupV))
}

base_named_vector <- function(){
  lookupV <- c(with(lookup, setNames(new, old)), setNames(nm = setdiff(unique(data$x), lookup$old)))
  lookupV[data$x]
}

base_ifelse <- function(){
  lookupV <- setNames(lookup$new, lookup$old)
  with(data, ifelse(x %in% lookup$old, lookup$new, x))
}

plyr_mapvalues <- function(){
  data %>% 
    mutate(x = mapvalues(x, lookup$old, lookup$new, warn_missing = F))
}

base_match <- function(){
  tochange <- match(data$x, lookup$old, nomatch = 0)
  data$x[tochange > 0] <- lookup$new[tochange]
}

base_local_safe_lookup <- function(){
  lv <- structure(lookup$new, names = lookup$old)
  
  safe_lookup <- function(val) {
    new_val <- lv[val]
    unname(ifelse(is.na(new_val), val, new_val))
  }
  
  safe_lookup(data$x)
}

dplyr_recode <- function(){
  lookupV <- setNames(lookup$new, lookup$old)
  
  data %>%
    mutate(x = recode(x, !!!lookupV))
}

base_for <- function(){
  for (i in seq_len(nrow(lookup))) {
    data$x[data$x == lookup$old[i]] = lookup$new[i]
  }
}

datatable_for <- function(){
  for (i in seq_len(nrow(lookup))) {
    dt_data[x == lookup$old[i], x := lookup$new[i]]
  }
}

forcats_fct_recode <- function(){
  lookupV <- setNames(lookup$new, lookup$old)
  
  data %>% 
    mutate(x = as.character(fct_recode(x, !!!lookupV)))
  
}

datatable_set <- function(){
  tochange <- dt_data[, chmatch(x, lookup$old, nomatch = 0)]
  set(dt_data, i = which(tochange > 0), j = "x", value = lookup$new[tochange])
}

library(microbenchmark)
bench <- microbenchmark(dplyr_coalesce(),
                        datatable(),
                        datatable_in(),
                        datatable_for(),
                        base_for(),
                        purrr_modify_if(),
                        stringr_str_replace_all_update(),
                        base_named_vector(),
                        base_ifelse(),
                        plyr_mapvalues(),
                        base_match(),
                        base_local_safe_lookup(),
                        dplyr_recode(),
                        forcats_fct_recode(),
                        datatable_set(),
                        times = 30L,
                        setup = {
                          dt_data <- data.table::copy(data)
                          setDT(dt_data)
                          setDT(dt_lookup)
                          })

bench$expr <- fct_rev(fct_reorder(bench$expr, bench$time, mean))
ggplot2::autoplot(bench)

enter image description here

Thanks to @Waldi and @nicola for advice implementing data.table solutions in the benchmark.

Upvotes: 15

GKi
GKi

Reputation: 39647

Another base variant using setNames making use that [] will take the first match.

data$x <- setNames(c(lookup$new, data$x), c(lookup$old, data$x))[data$x]

#dats$x <- c(setNames(lookup$new, lookup$old), setNames(data$x, data$x))[data$x] #Variant
#data$x <- (\(.) setNames(c(lookup$new, .), c(lookup$old, .))[.])(data$x) #Variant

data
#  id  x
#1  1  a
#2  2  a
#3  3  B
#4  4  C
#5  5  d
#6  6 AA
#7  7  !

In case data$x is not character.

data$x <- setNames(c(lookup$new, data$x), c(lookup$old, data$x))[as.character(data$x)]

The same but using match

data$x <- c(lookup$new, data$x)[match(data$x, c(lookup$old, data$x))]

or fmatch:

library(fastmatch)
data$x <- c(lookup$new, data$x)[fmatch(data$x, c(lookup$old, data$x))]

In case data$x is long and has many duplicates it might help to take only unique.

data$x <- (\(.) c(lookup$new, .)[match(data$x, c(lookup$old, .))])(unique(data$x))

#Unique and not in old
(\(.) c(lookup$new, .)[match(data$x, c(lookup$old, .))])(setdiff(data$x, lookup$old))

#Maybe faster as setdiff uses match for the whole vector
(\(.) c(lookup$new, .)[match(data$x, c(lookup$old, .))])(setdiff(unique(data$x), lookup$old))

And here kit::funique or collapse::funique could be a used instead of unique.


Anther way is the usage of == with which and usage of rep.

i <- lapply(lookup$old, \(s) which(s == data$x))
data$x[unlist(i)] <- rep(lookup$new, lengths(i))

When it is a character vector it could be stored as a factor and the levels of a factor can be renamed. (It was asked that this was not wanted but it could be very performant.)

x <- factor(data$x)

levels(x)[match(lookup$old, levels(x), nlevels(x)+1L)] <- lookup$new

x
#[1] a  a  B  C  d  AA ! 
#Levels: ! a AA B C d

Benchmark taking data from @LMc

data = data.frame(
  id = 1:7,
  x = c("A", "A", "B", "C", "D", "AA", ".")
)

lookup = data.frame(
  old = c("A", "D", "."),
  new = c("D", "A", "!")
)

set.seed(1)
data <- data[sample(nrow(data), 1E7, replace = TRUE),]

library(data.table)
library(fastmatch)

dt_data <- data.table::copy(data)
system.time(setDT(dt_data))
#       User      System verstrichen 
#      0.001       0.000       0.001 

dtK_data <- data.table::copy(data)
dtK_lookup <- data.table::copy(lookup)
system.time({
setDT(dtK_data)
setDT(dtK_lookup)
setkey(dtK_data, x)
setkey(dtK_lookup,old) })
#Note: dt uses multiple cores
#       User      System verstrichen 
#      0.373       0.067       0.141 

f_data <- data.table::copy(data)
system.time(f_data$x <- factor(f_data$x))
#       User      System verstrichen 
#      0.238       0.032       0.270 

C0_data <- data.table::copy(data)

fun <- alist(coalesceDeframe = C0_data$x <- dplyr::coalesce(tibble::deframe(lookup)[data$x], data$x),
coalesce = C0_data$x <- dplyr::coalesce(setNames(lookup$new, lookup$old)[data$x], data$x),
"recode" = C0_data$x <- dplyr::recode(data$x, !!!setNames(lookup$new, lookup$old)),
mapvalues = C0_data$x <- plyr::mapvalues(data$x, lookup$old, lookup$new, FALSE),
"unique" = (\(.) C0_data$x <- c(lookup$new, .)[match(data$x, c(lookup$old, .))])(unique(data$x)),
"match0" = {i <- match(data$x, lookup$old, nomatch = 0)
  C0_data$x[i>0] <- lookup$new[i]},
"matchNA" = {i <- match(data$x, lookup$old)
  j <- which(!is.na(i))
  C0_data$x[j] <- lookup$new[i[j]]},
"rep" = {i <- lapply(lookup$old, \(s) which(s == data$x))
C0_data$x[unlist(i)] <- rep(lookup$new, lengths(i))},
"uniqueFmatch" = (\(.) C0_data$x <- c(lookup$new, .)[fmatch(data$x, c(lookup$old, .))])(unique(data$x)),
datatable = Cdt_data[lookup, x:=new, on=.(x=old)],
datatableKey = CdtK_data[dtK_lookup, x:=new, on=.(x=old)],
"collapseFmatch" = (\(.) C0_data$x <- c(lookup$new, .)[fmatch(data$x, c(lookup$old, .))])(collapse::funique(data$x)),
"kitFmatch" = (\(.) C0_data$x <- c(lookup$new, .)[fmatch(data$x, c(lookup$old, .))])(kit::funique(data$x)),
"factor" = levels(Cf_data$x)[match(lookup$old, levels(Cf_data$x), nlevels(Cf_data$x)+1L)] <- lookup$new
)

bench <- microbenchmark::microbenchmark(list = fun, times = 7L,
    control=list(order="block"),
    setup = {Cdt_data <- data.table::copy(dt_data)
      CdtK_data <- data.table::copy(dtK_data)
      Cf_data <- data.table::copy(f_data)
      gc()
    })
bench
ggplot2::autoplot(bench, log=FALSE)

Result

Unit: milliseconds
            expr        min         lq       mean     median         uq        max neval
 coalesceDeframe 1419.85752 1431.90215 1441.67163 1437.54078 1456.58562 1457.32759     7
        coalesce 1416.81792 1418.58149 1422.67433 1420.02443 1424.95336 1434.80827     7
          recode 1162.67033 1163.04287 1168.39498 1163.63317 1168.59404 1189.18758     7
       mapvalues  613.99308  616.50627  618.56250  617.50594  617.96866  629.48865     7
          match0  369.73388  379.07043  390.64808  385.51089  403.51618  414.11857     7
         matchNA  360.17730  375.59077  378.24535  376.08590  385.57151  389.12971     7
             rep  351.87641  352.77764  364.34904  357.27131  375.93110  383.87809     7
          unique  340.98122  341.27901  341.87930  341.63321  342.35865  343.26537     7
    uniqueFmatch  253.75138  253.90585  254.33802  254.14054  254.73900  255.18455     7
       datatable  237.79038  252.08057  251.22063  253.78246  254.10755  254.59535     7
    datatableKey  165.42183  165.55716  166.01792  165.68011  166.34137  167.22646     7
  collapseFmatch  125.93940  126.10227  126.25538  126.13199  126.42614  126.65943     7
       kitFmatch  125.72638  125.94671  127.43846  126.16227  126.96446  134.35822     7
          factor   39.35195   39.43222   39.45809   39.44678   39.47576   39.59197     7

Timings of different methods

In this case using a factor and updating it's levels is fastest. Using a fast version of unique improves speed substantially.

Upvotes: 1

Waldi
Waldi

Reputation: 41210

A direct data.table solution, without %in%.
Depending on the length of the lookup / data tables, adding keys could improve performance substantially, but this isn't the case on this simple example.

library(data.table)

setDT(data)
setDT(lookup)

## If needed
# setkey(data,x)
# setkey(lookup,old)

data[lookup, x:=new, on=.(x=old)]
data 

   id  x
1:  1  a
2:  2  a
3:  3  B
4:  4  C
5:  5  d
6:  6 AA
7:  7  !

Upvotes: 15

TimTeaFan
TimTeaFan

Reputation: 18541

I basically share the same problem. Although dplyr::recode is in the "questioning" life cycle I don't expect it to become deprecated. At some point it might be superseded, but even in this case it should still be usable. Therefore I'm using a wrapper around dplyr::recode which allows the use of named vectors and or two vectors (which could be a lookup table).

library(dplyr)
library(rlang)

recode2 <- function(x, new, old = NULL, .default = NULL, .missing = NULL) {
  
  if (!rlang::is_named(new) && !is.null(old)) {
    new <- setNames(new, old)
  }
  
  do.call(dplyr::recode,
          c(.x = list(x),
            .default = list(.default),
            .missing = list(.missing),
            as.list(new)))
  
}

data = data.frame(
  id = 1:7,
  x = c("A", "A", "B", "C", "D", "AA", ".")
)

lookup = data.frame(
  old = c("A", "D", "."),
  new = c("a", "d", "!")
)

# two vectors new / old
data %>% 
  mutate(x = recode2(x, lookup$new, lookup$old))
#>   id  x
#> 1  1  a
#> 2  2  a
#> 3  3  B
#> 4  4  C
#> 5  5  d
#> 6  6 AA
#> 7  7  !

# named vector
data %>% 
  mutate(x = recode2(x, c("A" = "a",
                          "D" = "d",
                          "." = "!")))
#>   id  x
#> 1  1  a
#> 2  2  a
#> 3  3  B
#> 4  4  C
#> 5  5  d
#> 6  6 AA
#> 7  7  !

Created on 2021-04-21 by the reprex package (v0.3.0)

Upvotes: 4

LMc
LMc

Reputation: 18612

modify_if

You could use purrr::modify_if to only apply the named vector to values that exist in it. Though not a specified requirement, it has the benefit of the .else argument, which allows you to apply a different function to values not in your lookup.

I also wanted to include the use of tibble::deframe here to create the named vector. It is slower than setNames, though.

lookupV <- deframe(lookup)

data %>% 
  mutate(x = modify_if(x, x %in% lookup$old, ~ lookupV[.x]))

str_replace_all

Alternatively, you could use stringr::str_replace_all, which can take a named vector for the replacement argument.

data %>% 
  mutate(x = str_replace_all(x, lookupV))

Update

To accommodate the change to your edited example, the named vector used in str_replace_all needs to be modified. In this way, the entire literal string needs to be match so that "A" does not get substituted in "AA", or "." does not replace everything:

lookupV <- setNames(lookup$new, do.call(sprintf, list("^\\Q%s\\E$", lookup$old)))

data %>% 
  mutate(x = str_replace_all(x, lookupV))

left_join

Using dplyr::left_join this is very similar to OP solution, but uses .keep argument of mutate so it has less steps. This argument is currently in the experimental lifecycle and so it is not included in the benchmark (though it is around the middle of posted solutions).

left_join(data, lookup, by = c("x" = "old")) %>% 
    mutate(x = coalesce(new, x) , .keep = "unused")

Base R

Named Vector

Create a substitution value for every unique value in your dataframe.

lookupV <- c(with(lookup, setNames(new, old)), setNames(nm = setdiff(unique(data$x), lookup$old)))

data$x <- lookupV[data$x]

ifelse

with(data, ifelse(x %in% lookup$old, lookupV[x], x))

Upvotes: 7

markus
markus

Reputation: 26333

A base R option using %in% and match - thanks to @LMc & @nicola

tochange <- match(data$x, lookup$old, nomatch = 0)
data$x[tochange > 0] <- lookup$new[tochange]

One more data.table option using set() and chmatch

library(data.table)
setDT(data)

tochange <- data[, chmatch(x, lookup$old, nomatch = 0)]
set(data, i = which(tochange > 0), j = "x", value = lookup$new[tochange])

Result

data
#  id  x
#1  1  a
#2  2  a
#3  3  B
#4  4  C
#5  5  d
#6  6 AA
#7  7  !

Upvotes: 8

Andrew
Andrew

Reputation: 5138

Another option that is clear is to use a for-loop with subsetting to loop through the rows of the lookup table. This will almost always be quicker with data.table because of auto indexing, or if you set the key (i.e., ?data.table::setkey()) ahead of time. Also, it will--of course--get slower as the lookup table gets longer. I would guess an update-join would be preferred if there is a long lookup table.

Base R:

for (i in seq_len(nrow(lookup))) {
  data$x[data$x == lookup$old[i]] <- lookup$new[i]
}

data$x
# [1] "a"  "a"  "B"  "C"  "d"  "AA" "!" 

Or the same logic with data.table:

library(data.table)
setDT(data)

for (i in seq_len(nrow(lookup))) {
  data[x == lookup$old[i], x := lookup$new[i]]
}

data$x
# [1] "a"  "a"  "B"  "C"  "d"  "AA" "!" 

Data:

data = data.frame(
  id = 1:7,
  x = c("A", "A", "B", "C", "D", "AA", ".")
)

lookup = data.frame(
  old = c("A", "D", "."),
  new = c("a", "d", "!")
)

Upvotes: 6

Baraliuh
Baraliuh

Reputation: 593

dplyr+plyr solution that is in order with all ur bulletpoints (if u consider plyr in the the tidyverse):

data <- data %>% 
  dplyr::mutate(
    x = plyr::mapvalues(x, lookup$old, lookup$new) #Can add , F to remove warnings
  )

Upvotes: 4

SmokeyShakers
SmokeyShakers

Reputation: 3412

Another base solution, with a lookup vector:

## Toy example
data = data.frame(
  id = 1:5,
  x = c("A", "A", "B", "C", "D"),
  stringsAsFactors = F
)

lookup = data.frame(
  old = c("A", "D"),
  new = c("a", "d"),
  stringsAsFactors = F
)

lv <- structure(lookup$new, names = lookup$old)

safe_lookup <- function(val) {
  new_val <- lv[val]
  unname(ifelse(is.na(new_val), val, new_val))
}

data$x <- safe_lookup(data$x)

Upvotes: 4

zx8754
zx8754

Reputation: 56004

Combination of a named vector and coalesce:

# make lookup vector
lookupV <- setNames(lookup$new, lookup$old)

data %>% 
  mutate(x = coalesce(lookupV[ x ], x))
#   id x
# 1  1 a
# 2  2 a
# 3  3 B
# 4  4 C
# 5  5 d

Or data.table:

library(data.table)

setDT(data)
data[ x %in% names(lookupV), x := lookupV[ x ] ]

This post might have a better solution for data.table - "update on merge":

Upvotes: 9

Related Questions