Reputation: 33
I have the following data frames:
# df1
id cg_v
1 a
2 b
3 a b
4 b c
5 b c d
6 d
# df2
id cg
1 a
2 b
3 a
3 b
4 b
4 c
5 b
5 c
5 d
6 d
I need to add a column to df1
that contains the mean covariance computed across each pair of elements in cg_v
. If cg_v
contains only one element, then I would like the new column to contain its variance.
I can get a covariance matrix by cov(crossprod(table(df2)))
# a b c d
a 0.9166667 0.0000000 -0.5833333 -0.6666667
b 0.0000000 2.0000000 1.0000000 0.0000000
c -0.5833333 1.0000000 0.9166667 0.3333333
d -0.6666667 0.0000000 0.3333333 0.6666667
What do I do from here?
The end result should be like this:
# df1
id cg_v cg_cov
1 a 0.9166667
2 b 2.0000000
3 a b 0.0000000
4 b c 1.0000000
5 b c d 0.4444444 # This is equal to (1.0000000 + 0.3333337 + 0.0000000)/3
6 d 0.6666667
Code to generate df1
and df2
:
df1 <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 6L),
cg_v = c("a", "b", "a b", "b c", "b c d", "d")),
.Names = c("id", "cg_v"),
class = "data.frame", row.names = c(NA, -6L))
df2 <- structure(list(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d")),
.Names = c("id", "cg"),
class = "data.frame", row.names = c(NA, -10L))
Upvotes: 0
Views: 252
Reputation: 1030
I think I found a solution for this problem using data.tables and reshape. What do you want to do with the three letters b c d? I assumed that you want to have the covariance of the first two letters:
require(reshape)
require(data.table)
dt1 <- data.table(id = c(1L, 2L, 3L, 4L, 5L, 6L),
cg_v = c("a", "b", "a b", "b c", "b c d", "d"))
dt2 <- data.table(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d"))
cov_dt <- data.table(melt(cov(crossprod(table(df2)))))
dt1 <- cbind(dt1, t(sapply(strsplit(as.character(df1$cg_v), " "), function(x)x[1:2])))
#replace the na with the first colomn
dt1[is.na(V2), V2 := V1]
# Merge them on two columns
setkey(dt1, "V1", "V2")
setkey(cov_dt, "X1", "X2")
result <- cov_dt[dt1]
> result[,.(id, cg_v, value)]
id cg_v value
1: 1 a 0.9166667
2: 3 a b 0.0000000
3: 2 b 2.0000000
4: 4 b c 1.0000000
5: 5 b c d 1.0000000
6: 6 d 0.6666667
Variant which also works if there are more than 2 letters (not the most efficient code):
require(reshape)
require(combinat)
df1 <- data.frame(id = c(1L, 2L, 3L, 4L, 5L, 6L),
cg_v = c("a", "b", "a b", "b c", "b c d", "d"))
df2 <- data.frame(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d"))
cov_dt <- cov(crossprod(table(df2)))
mat <- sapply(strsplit(as.character(df1$cg_v), " "), function(x) if(length(x) == 1){c(x,x)} else(x))
# Should be all minimal 2
sapply(mat, length) > 1
mat <- sapply(mat, function(x) matrix(combn(x,2), nrow = 2))
df1$cg_cov <- sapply(mat, function(x) mean(apply(x,2, function(x) cov_dt[x[1],x[2]])))
> df1
id cg_v cg_cov
1 1 a 0.9166667
2 2 b 2.0000000
3 3 a b 0.0000000
4 4 b c 1.0000000
5 5 b c d 0.4444444
6 6 d 0.6666667
Upvotes: 1