Reputation: 1460
I have a below dataframe with numbers in two of the columns and I should replace that with string using my other reference dataset.
Dataset 1:
lhs rhs
32,39,6 65
39,6,65 32
14,16,26 15
16,20,4 26
16,26,33 4
53 31
Dataset 2:
id name
4 yougurt
6 coffee
14 cream chese
15 meat spreads
16 butter
20 whole milk
26 condensed milk
31 curd
32 flour
39 rolls
53 sugar
65 soda
Expected output:
lhs rhs
flour, rolls, coffee soda
rolls, coffee, soda flour
cream chease, butter, condensed milk meat spreads
Upvotes: 2
Views: 142
Reputation: 2652
Another option. Here d1
is your first data frame and d2
your second.
library(tidyverse)
d1 %>% separate(lhs, sep = ',', into = c('v1', 'v2', 'v3')) %>%
mutate_all(as.numeric) %>%
left_join(d2, by = c('v1'='id')) %>%
left_join(d2, by = c('v2'='id')) %>%
left_join(d2, by = c('v3'='id')) %>%
left_join(d2, by = c('rhs'='id')) %>%
unite(lhs, name.x, name.y, name.x.x, sep = ',') %>%
mutate(lhs = str_replace_all(lhs, ',NA', '')) %>%
select(lhs, rhs = name.y.y)
OR, as pointed out by @Moody_Mudskipper in the comments
d1 %>% separate(lhs, sep = ',', into = c('v1', 'v2', 'v3')) %>%
mutate_all(as.numeric) %>%
lmap(~setNames(left_join(setNames(.x, "id"), d2)[2], names(.x))) %>%
unite(lhs, v1, v2, v3, sep = ', ') %>%
mutate(lhs = str_replace_all(lhs, ',NA', '')) %>%
select(lhs, rhs = name.y.y)
lhs rhs
1 flour, rolls, coffee soda
2 rolls, coffee, soda flour
3 cream chese, butter, condensed milk meat spreads
4 butter, whole milk, yougurt condensed milk
5 butter, condensed milk yougurt
6 sugar curd
Upvotes: 3
Reputation: 47300
Not so idiomatic but I win the code golf :) :
as.data.frame(lapply(dat1, function(x){
for (i in seq(nrow(dat2))) x <- gsub(paste0("(^|,)",dat2$id[i],"(,|$)"),
paste0("\\1",dat2$name[i],"\\2"),x)
x}))
# lhs rhs
# 1 flour,rolls,coffee soda
# 2 rolls,coffee,soda flour
# 3 cream chese,butter,condensed milk meat spreads
# 4 butter,whole milk,yougurt condensed milk
# 5 butter,condensed milk,33 yougurt
# 6 sugar curd
May fail if you have numbers in 2nd dataset.
Upvotes: 0
Reputation: 28675
This is almost the same as www, but appears to be a little faster. Apparently using strsplit
and unnest
is faster than separate_rows
require(tidyverse)
df1 %>%
mutate(lhs = sapply(lhs, strsplit, ',')) %>%
unnest %>%
mutate_at(c('lhs', 'rhs'), as.numeric) %>%
left_join(df2, by = c('lhs'= 'id')) %>%
left_join(df2, by = c('rhs'= 'id')) %>%
group_by(name.y) %>%
summarize(name.x = paste(name.x, collapse = ', ')) %>%
rename(rhs = name.y, lhs = name.x)
Then there's the data.table
solution, which is much faster.
require(data.table)
setDT(df1)
df1[, .(lhs = unlist(strsplit(lhs, ','))), rhs] %>%
.[, lapply(.SD, as.numeric)] %>%
merge(df2, by.x = 'lhs', by.y = 'id') %>%
merge(df2, by.x = 'rhs', by.y = 'id') %>%
.[, .(lhs = paste0(name.x, collapse = ',')), by = .(rhs = name.y)]
Benchmark
# Results
# Unit: relative
# expr min lq mean median uq max neval
# useDT() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 300
# UseUnnest() 5.570704 5.632532 5.274552 5.374714 5.042518 9.254190 300
# UseSeparateRows() 8.640615 8.356889 7.661669 7.939593 7.401666 7.896038 300
# Method
require(tidyverse)
require(data.table)
df1 <- fread("
lhs rhs
32,39,6 65
39,6,65 32
14,16,26 15
16,20,4 26
16,26,33 4
53 31
")
df2 <- fread("
id name
4 yougurt
6 coffee
14 cream_chese
15 meat_spreads
16 butter
20 whole_milk
26 condensed_milk
31 curd
32 flour
39 rolls
53 sugar
65 soda
")
useDT <- function(x){
df1[, lapply(sapply(lhs, strsplit, ','), unlist), rhs] %>%
setNames(c('rhs', 'lhs')) %>%
.[, `:=`(lhs = as.numeric(lhs),
rhs = as.numeric(rhs))] %>%
.[df2, on = c('lhs'= 'id')] %>%
.[df2, on = c('rhs'= 'id')] %>%
.[, .(lhs = paste0(name, collapse = ',')), by = i.name] %>%
.[lhs != 'NA', .(lhs, rhs = i.name)]
}
UseUnnest <- function(x){
df1 %>%
mutate(lhs = sapply(lhs, strsplit, ',')) %>%
unnest %>%
mutate_at(c('lhs', 'rhs'), as.numeric) %>%
left_join(df2, by = c('lhs'= 'id')) %>%
left_join(df2, by = c('rhs'= 'id')) %>%
group_by(name.y) %>%
summarize(name.x = paste(name.x, collapse = ', ')) %>%
rename(rhs = name.y, lhs = name.x)
}
UseSeparateRows <- function(x){
df1 %>%
separate_rows(lhs, convert = TRUE) %>%
left_join(df2, by = c("lhs" = "id")) %>%
left_join(df2, by = c("rhs" = "id")) %>%
drop_na(name.x) %>%
group_by(name.y) %>%
summarise(lhs = paste0(name.x, collapse = ", ")) %>%
ungroup() %>%
select(lhs, rhs = name.y)
}
microbenchmark(useDT(), UseUnnest(), UseSeparateRows(), times = 300, unit = 'relative')
Upvotes: 3
Reputation: 24079
Here is an option using just base R and mapping the numeric values to factor labels.
Split the string, map the labels to the values and then collapse the labels back into a string.
df<-structure(list(id = c(4L, 6L, 14L, 15L, 16L, 20L, 26L, 31L, 32L,
39L, 53L, 65L), name = c("yougurt", "coffee", "cream cheese",
"meat spreads", "butter", "whole milk", "condensed milk", "curd",
"flour", "rolls", "sugar", "soda")), .Names = c("id", "name"),
class = "data.frame", row.names = c(NA, -12L))
input<-structure(list(lhs = c("32,39,6", "39,6,65", "14,16,26", "16,20,4",
"16,26,33", "53"), rhs = c(65L, 32L, 15L, 26L, 4L, 31L)),
.Names = c("lhs", "rhs"), class = "data.frame", row.names = c(NA, -6L))
#new left hand side
newlhs<-sapply(as.character(input$lhs), function(x){
strs<-unlist(strsplit(x, ","))
f<-factor(strs, levels=df$id, labels=df$name)
paste(f, collapse = ", ")
})
#new right hand side
newrhs<-sapply(as.character(input$rhs), function(x){
strs<-unlist(strsplit(x, ","))
f<-factor(strs, levels=df$id, labels=df$name)
paste(f, collapse = ", ")
})
answer<-data.frame(newlhs, newrhs)
row.names(answer)<-NULL #remove rownames
Upvotes: 2
Reputation: 39154
A solution using dplyr and tidyr. dat
is the final output. The key is to use separate_rows
to expand the lhs
and then conduct left_join
twice.
library(dplyr)
library(tidyr)
dat <- dat1 %>%
separate_rows(lhs, convert = TRUE) %>%
left_join(dat2, by = c("lhs" = "id")) %>%
left_join(dat2, by = c("rhs" = "id")) %>%
drop_na(name.x) %>%
group_by(name.y) %>%
summarise(lhs = paste0(name.x, collapse = ", ")) %>%
ungroup() %>%
select(lhs, rhs = name.y)
dat
# # A tibble: 6 x 2
# lhs rhs
# <chr> <chr>
# 1 butter, whole milk, yougurt condensed milk
# 2 sugar curd
# 3 rolls, coffee, soda flour
# 4 cream chese, butter, condensed milk meat spreads
# 5 flour, rolls, coffee soda
# 6 butter, condensed milk yougurt
DATA
dat1 <- read.table(text = "lhs rhs
'32,39,6' 65
'39,6,65' 32
'14,16,26' 15
'16,20,4' 26
'16,26,33' 4
53 31 ",
stringsAsFactors = FALSE, header = TRUE)
dat2 <- read.table(text = "id name
4 yougurt
6 coffee
14 'cream chese'
15 'meat spreads'
16 butter
20 'whole milk'
26 'condensed milk'
31 curd
32 flour
39 rolls
53 sugar
65 soda",
header = TRUE, stringsAsFactors = FALSE)
Upvotes: 4