Reputation: 53
I'm working with 2 dataframes, as a sample:
DF1 is the main table with a column containing equations which usually have variables
DF1 <- read.table(text =
"Unit Year Equation
1 2020 'x+2*y'
1 2021 'x+2*y'
1 2022 'x+2*y'
2 2020 'x'
3 2020 'max(y^2, y+2*z)'
3 2021 'max(y^2, y+2*z)'
4 2020 '5'
5 2020 '(x/y)+z'",
header = TRUE, stringsAsFactors = FALSE)
DF2 is the reference or lookup table which assigns Values to the Variables given certain Years
DF2 <- read.table(text =
"Year Variable Value
2020 x 10
2021 x 15.5
2022 x 50
2020 y 1
2021 y 2
2022 y 3.5
2020 z 20
2021 z 34
2022 z 11",
header = TRUE, stringsAsFactors = FALSE)
The goal is to match the variables and the years between the 2 dataframes so that the following table could be derived after applying eval(parse(text=Equation)) or anything similar:
Unit Year Equation
1 2020 12
1 2021 19.5
1 2022 57
2 2020 10
3 2020 41
3 2021 70
4 2020 5
5 2020 30
Currently I'm using a for and if-else loop to match the Years and replace the Variables row-by-row. It works okay, but running it has become very slow since DF1 could contain thousands of rows with several variables. Are there other functions I could use to achieve the same output?
Edit - Adding in the loop mentioned to help with comparison:
library(dplyr)
library(reshape2)
DF2 = dcast(DF2, Year~Variable, value.var='Value')
#Adding in this line to avoid replacing "x" in "max":
DF1$Equation = gsub("max","placeholder",DF1$Equation)
for(i in 1:nrow(DF1)) {
for (j in 1:nrow(DF2)) {
if (DF1[i,]$Year==DF2[j,]$Year) {
#Every variable would be declared here:
DF1[i,]$Equation = gsub("x",DF2[j,]$x,DF1[i,]$Equation)
DF1[i,]$Equation = gsub("y",DF2[j,]$y,DF1[i,]$Equation)
DF1[i,]$Equation = gsub("z",DF2[j,]$z,DF1[i,]$Equation)
}
}
}
#Returning the function:
DF1$Equation = gsub("placeholder","max",DF1$Equation)
Results_DF1 = DF1 %>% rowwise() %>%
mutate(Equation = eval(parse(text=Equation)))
Upvotes: 5
Views: 212
Reputation: 39667
A possibility in base. First convert DF2 into a named list per year using setNames
and split
. Then eval
the Equations with the Values for the specific year using mapply
.
L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
x <- mapply(function(e, y) {eval(str2lang(e), L2[[y]])},
DF1$Equation, match(DF1$Year, names(L2)))
cbind(DF1[1:2], Equation=x)
# Unit Year Equation
#1 1 2020 12.0
#2 1 2021 19.5
#3 1 2022 57.0
#4 2 2020 10.0
#5 3 2020 41.0
#6 3 2021 70.0
#7 4 2020 5.0
#8 5 2020 30.0
In case there are many same equations it could bring some improvements to parse only unique
ones.
L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
E <- unique(DF1$Equation)
ES <- lapply(E, str2lang)
x <- mapply(function(e, y) {eval(ES[[e]], L2[[y]])},
match(DF1$Equation, E), match(DF1$Year, names(L2)))
cbind(DF1[1:2], Equation=x)
And using the original equations without *
between number and variable using gsub
:
DF1a <- read.table(text =
"Unit Year Equation
1 2020 'x+2y'
1 2021 'x+2y'
1 2022 'x+2y'
2 2020 'x'
3 2020 'max(y^2, y+2z)'
3 2021 'max(y^2, y+2z)'
4 2020 '5'
5 2020 '(x/y)+z'",
header = TRUE, stringsAsFactors = FALSE)
x <- mapply(function(e, y) {eval(parse(text=e), L2[[y]])},
gsub("(\\d)([a-z])", "\\1*\\2", DF1a$Equation), as.character(DF1a$Year))
Compare timings and memory usage.
years <- 20
units <- 1000
DF1 <- expand.grid( stringsAsFactors = FALSE, Unit = 1:units, Year = 1:years,
Equation = c("x+2*y", "x", "max(y^2, y+2*z)", "5", "(x/y)+z"))
DF2 <- expand.grid(Year = 1:years, Variable = c("x", "y", "z"))
set.seed(42)
DF2$Value <- rnorm(nrow(DF2))
library(dplyr)
library(rlang)
library(tidyr)
onyambu <- function() {
left_join(DF1, DF2, 'Year') %>%
pivot_wider(c(Unit,Year,Equation),Variable, values_from = Value) %>%
rowwise() %>%
mutate(a = eval(parse(text = Equation)))
}
anoushiravan <- function() {
DF1 %>%
left_join(DF2 %>%
pivot_wider(names_from = Variable, values_from = Value),
by = "Year") %>%
rowwise() %>%
mutate(Result = eval(parse_expr(Equation)))
}
jonas <- function() {
df1YearRows <- split(1:NROW(DF1),DF1$Year)
df2YearRows <- split(1:NROW(DF2), DF2$Year)
equationValues <- setNames(lapply(names(df1YearRows), function(yearId) {
fittingVariables <- DF2[df2YearRows[[yearId]],c("Variable","Value")]
fittingVariablesAndvaluesList <- as.list(setNames(fittingVariables$Value,fittingVariables$Variable))
equationToEvaluate <- DF1$Equation[df1YearRows[[yearId]]]
sapply(equationToEvaluate, function(eqTxt) {
eval(parse(text = eqTxt), fittingVariablesAndvaluesList)
})
}),names(df1YearRows))
equationValueColumn <- Reduce(
f = function(oldColumn,newYearId) {
oldColumn[df1YearRows[[newYearId]]] <- equationValues[[newYearId]]
oldColumn
},
x = names(equationValues),
init = numeric(NROW(DF1))
)
resultDf <- data.frame(Year = DF1$Year,
Equation = DF1$Equation,
Equationvalue = equationValueColumn)
resultDf
}
GKi <- function() {
L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
x <- mapply(function(e, y) {eval(str2lang(e), L2[[y]])},
DF1$Equation, match(DF1$Year, names(L2)))
cbind(DF1[1:2], Equation=x)
}
GKi2 <- function() {
L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
E <- unique(DF1$Equation)
ES <- lapply(E, str2lang)
x <- mapply(function(e, y) {eval(ES[[e]], L2[[y]])},
match(DF1$Equation, E), match(DF1$Year, names(L2)))
cbind(DF1[1:2], Equation=x)
}
bench::mark(check = FALSE
, onyambu()
, anoushiravan()
, jonas()
, GKi()
, GKi2()
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 onyambu() 4.29s 4.29s 0.233 138.04MB 6.06 1 26
#2 anoushiravan() 7.12s 7.12s 0.140 100.23MB 3.09 1 22
#3 jonas() 3.49s 3.49s 0.287 89.98MB 3.44 1 12
#4 GKi() 863.95ms 863.95ms 1.16 6.6MB 3.47 1 3
#5 GKi2() 324.69ms 325.58ms 3.07 7.49MB 4.61 2 3
Upvotes: 2
Reputation: 21918
I just noticed you have edited your DF1
so I used it instead and there is no need for edit from my part:
library(dplyr)
library(rlang)
DF1 %>%
left_join(DF2 %>%
pivot_wider(names_from = Variable, values_from = Value),
by = "Year") %>%
rowwise() %>%
mutate(Result = eval(parse_expr(Equation)))
# A tibble: 8 x 7
# Rowwise:
Unit Year Equation x y z Result
<int> <int> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 2020 x+2*y 10 1 20 12
2 1 2021 x+2*y 15.5 2 34 19.5
3 1 2022 x+2*y 50 3.5 11 57
4 2 2020 x 10 1 20 10
5 3 2020 max(y^2, y+2*z) 10 1 20 41
6 3 2021 max(y^2, y+2*z) 15.5 2 34 70
7 4 2020 5 10 1 20 5
8 5 2020 (x/y)+z 10 1 20 30
Upvotes: 3
Reputation: 1810
To give a base
R solution, you could do the following to get your desired resulting dataframe:
df1YearRows <- split(1:NROW(DF1),DF1$Year)
df2YearRows <- split(1:NROW(DF2), DF2$Year)
equationValues <- setNames(lapply(names(df1YearRows), function(yearId) {
fittingVariables <- DF2[df2YearRows[[yearId]],c("Variable","Value")]
fittingVariablesAndvaluesList <- as.list(setNames(fittingVariables$Value,fittingVariables$Variable))
equationToEvaluate <- DF1$Equation[df1YearRows[[yearId]]]
sapply(equationToEvaluate, function(eqTxt) {
eval(parse(text = eqTxt), fittingVariablesAndvaluesList)
})
}),names(df1YearRows))
equationValueColumn <- Reduce(
f = function(oldColumn,newYearId) {
oldColumn[df1YearRows[[newYearId]]] <- equationValues[[newYearId]]
oldColumn
},
x = names(equationValues),
init = numeric(NROW(DF1))
)
resultDf <- data.frame(Year = DF1$Year,
Equation = DF1$Equation,
Equationvalue = equationValueColumn)
resultDf
resulting in
Year Equation Equationvalue
1 2020 x+2*y 12.0
2 2021 x+2*y 19.5
3 2022 x+2*y 57.0
4 2020 x 10.0
5 2020 max(y^2, y+2*z) 41.0
6 2021 max(y^2, y+2*z) 70.0
7 2020 5 5.0
8 2020 (x/y)+z 30.0
Upvotes: 2
Reputation: 79228
You could do:
left_join(DF1, DF2, 'Year') %>%
pivot_wider(c(Unit,Year,Equation),Variable, values_from = Value) %>%
rowwise() %>%
mutate(a = eval(parse(text = Equation)))
Unit Year Equation x y z a
<int> <int> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 2020 x+2*y 10 1 20 12
2 1 2021 x+2*y 15.5 2 34 19.5
3 1 2022 x+2*y 50 3.5 11 57
4 2 2020 x 10 1 20 10
5 3 2020 max(y^2, y+2*z) 10 1 20 41
6 3 2021 max(y^2, y+2*z) 15.5 2 34 70
7 4 2020 5 10 1 20 5
8 5 2020 (x/y)+z 10 1 20 30
Upvotes: 5