Reputation: 557
Am struggling with operation as my datasets are very large and i have provided an example of what i want.
I have two dataframes.
df1 - contains sampling-derived iterations for each parameter of a variable defined as the column name (10,000 rows)
df2 - contains the actual value of each of the variable defined as the column name (4,000 rows)
I want a df3 which is effectively the multiplication of each row of df2 by df1 and would therefore be 4000*10000 rows
As a short example i have provided a minimal example of df1 and df2. I have provided the output that i would be looking at shown in df3.
df1 <- structure(list(intercept = c(3.4, 3.6, 3.7), age = c(0.08, 0.05,
0.06), male = c(0.07, 0.06, 0.07)), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(id = structure(1:2, .Label = c("a", "b"), class = "factor"),
intercept = c(1L, 1L), age = c(40L, 45L), male = 1:0), class = "data.frame", row.names = c(NA,
-2L))
df3 <- structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), intercept = c(3.4, 3.6, 3.7, 3.4, 3.6,
3.7), age = c(3.2, 2, 2.4, 3.6, 2.25, 2.7), male = c(0.07, 0.06,
0.07, 0, 0, 0)), class = "data.frame", row.names = c(NA, -6L))
Can somebody point me to an efficient way to do this in R?
Upvotes: 2
Views: 211
Reputation: 101044
You can perform row-wise Kronecker product (from package MGLM
) like below
out <- data.frame(id = rep(df2$id,each=nrow(df1)),
t(MGLM::kr(t(df2[-1]),t(df1))))
such that
> out
id intercept age male
1 a 3.4 3.20 0.07
2 a 3.6 2.00 0.06
3 a 3.7 2.40 0.07
4 b 3.4 3.60 0.00
5 b 3.6 2.25 0.00
6 b 3.7 2.70 0.00
Benchmarking (so far the approach by @Sotos is the winner)
df1 <- do.call(rbind,replicate(500,structure(list(intercept = c(3.4, 3.6, 3.7), age = c(0.08, 0.05,
0.06), male = c(0.07, 0.06, 0.07)), class = "data.frame", row.names = c(NA,
-3L)),simplify = F))
df2 <- do.call(rbind,replicate(100,structure(list(id = structure(1:2, .Label = c("a", "b"), class = "factor"),
intercept = c(1L, 1L), age = c(40L, 45L), male = 1:0), class = "data.frame", row.names = c(NA,
-2L)),simplify = F))
library(MGLM)
library(purrr)
f_ThomasIsCoding <- function() {
data.frame(id = rep(df2$id,each=nrow(df1)),
t(MGLM::kr(t(df2[-1]),t(df1))))
}
f_tmfmnk_1 <- function() {
map_dfr(.x = asplit(df2[-1], 1), ~ sweep(df1, 2, FUN = `*`, .x))
}
f_tmfmnk_2 <- function() {
data.frame(do.call(rbind, lapply(asplit(df2[-1], 1), function(x) sweep(df1, 2, FUN = `*`, x))),
id = rep(df2$id, each = nrow(df1)))
}
f_RonakShah <- function() {
new1 <- df1[rep(seq(nrow(df1)), nrow(df2)), ]
new2 <- df2[rep(seq(nrow(df2)), each = nrow(df1)),]
out <- cbind(new2[1], new1 * new2[-1])
rownames(out) <- NULL
out
}
f_Sotos <- function() {
data.frame(id = rep(df2$id, each = nrow(df1)),
mapply(function(x, y)c(outer(x, y, `*`)), df1, df2[-1])
)
}
bmk <- microbenchmark(times = 20,
unit = "relative",
f_ThomasIsCoding(),
f_tmfmnk_1(),
f_tmfmnk_2(),
f_RonakShah(),
f_Sotos())
which gives
> bmk
Unit: relative
expr min lq mean median uq max neval
f_ThomasIsCoding() 1.186124 1.218201 1.197346 1.321731 1.042721 1.077854 20
f_tmfmnk_1() 7.594520 7.572723 4.539698 7.297610 2.437621 3.446436 20
f_tmfmnk_2() 9.670286 12.212220 6.583183 11.888061 3.370593 4.088534 20
f_RonakShah() 28.918724 28.861437 16.707258 27.889563 8.403161 11.668252 20
f_Sotos() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
Upvotes: 1
Reputation: 39858
One option involving purrr
could be:
map_dfr(.x = asplit(df2[-1], 1), ~ sweep(df1, 2, FUN = `*`, .x))
intercept age male
1 3.4 3.20 0.07
2 3.6 2.00 0.06
3 3.7 2.40 0.07
4 3.4 3.60 0.00
5 3.6 2.25 0.00
6 3.7 2.70 0.00
If also the id column is important:
data.frame(map_dfr(.x = asplit(df2[-1], 1), ~ sweep(df1, 2, FUN = `*`, .x)),
id = rep(df2$id, each = nrow(df1)))
intercept age male id
1 3.4 3.20 0.07 a
2 3.6 2.00 0.06 a
3 3.7 2.40 0.07 a
4 3.4 3.60 0.00 b
5 3.6 2.25 0.00 b
6 3.7 2.70 0.00 b
The same with base R
:
do.call(rbind, lapply(asplit(df2[-1], 1), function(x) sweep(df1, 2, FUN = `*`, x)))
Or:
data.frame(do.call(rbind, lapply(asplit(df2[-1], 1), function(x) sweep(df1, 2, FUN = `*`, x))),
id = rep(df2$id, each = nrow(df1)))
Upvotes: 0
Reputation: 388807
You could repeat rows in both the dataframes based on number of rows in other dataframe and multiply them directly
df1[rep(seq(nrow(df1)), nrow(df2)),] * df2[rep(seq(nrow(df2)), each = nrow(df1)),-1]
# intercept age male
#1 3.4 3.20 0.07
#2 3.6 2.00 0.06
#3 3.7 2.40 0.07
#1.1 3.4 3.60 0.00
#2.1 3.6 2.25 0.00
#3.1 3.7 2.70 0.00
To also get id
column
new1 <- df1[rep(seq(nrow(df1)), nrow(df2)), ]
new2 <- df2[rep(seq(nrow(df2)), each = nrow(df1)),]
out <- cbind(new2[1], new1 * new2[-1])
rownames(out) <- NULL
out
# id intercept age male
#1 a 3.4 3.20 0.07
#2 a 3.6 2.00 0.06
#3 a 3.7 2.40 0.07
#4 b 3.4 3.60 0.00
#5 b 3.6 2.25 0.00
#6 b 3.7 2.70 0.00
Upvotes: 0
Reputation: 51582
Another idea via base R using outer
,
data.frame(id = rep(df2$id, each = nrow(df1)),
mapply(function(x, y)c(outer(x, y, `*`)), df1, df2[-1])
)
which gives,
id intercept age male 1 a 3.4 3.20 0.07 2 a 3.6 2.00 0.06 3 a 3.7 2.40 0.07 4 b 3.4 3.60 0.00 5 b 3.6 2.25 0.00 6 b 3.7 2.70 0.00
Upvotes: 2