Reputation: 933
How can I compare two datasets and find the gene names they have in common, but only for rows where the CNA and chr values are the same in both datasets?
dt1
CNA chr Genes
gain 5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
loss 5 RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
gain 2 PDIA5,SEMA5B
dt2
CNA chr Genes
gain 5 Sall3,Nfatc1,SNORA5,SNORA5
gain 5 RNU6-866P,OR8J1,OR8K3,OR8K3
gain 2 PDIA5,DCC
expected output
df3
CNA chr Genes
gain 5 Sall3,Nfatc1
gain 2 PDIA5
I'm sure this is a trivial question, but I would appreciate any suggestions that could help.
Upvotes: 4
Views: 168
Reputation: 31171
Here is an approach:
library(data.table)
# Collapse df2 to one row per (CNA, chr). The collapsed gene column gets its
# own name (Genes2) so the join below does not depend on how the installed
# data.table version renames clashing columns (Genes.1 vs. i.Genes).
df2 <- setDT(df2)[, list(Genes2 = paste0(Genes, collapse = ',')),
                  by = list(CNA, chr)]
# Keyed join of df1 with df2; nomatch = 0L keeps only (CNA, chr) pairs that
# are present in both tables.
res <- setkey(setDT(df1), CNA, chr)[df2, nomatch = 0L]
# CNA chr Genes Genes2
#1: gain 5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1 Sall3,Nfatc1,SNORA5,SNORA5,RNU6-866P,OR8J1,OR8K3,OR8K3
#2: gain 2 PDIA5,SEMA5B PDIA5,DCC
# Per (CNA, chr) group: split both gene lists and keep the shared names.
# unlist() lets every row of a group contribute, not just the first one.
res[, paste0(intersect(unlist(strsplit(Genes, ',')),
                       unlist(strsplit(Genes2, ','))), collapse = ',')
, by = list(CNA, chr)]
# CNA chr V1
#1: gain 5 Sall3,Nfatc1
#2: gain 2 PDIA5
Data:
# First dataset: one row per (CNA, chr) with a comma-separated gene list.
df1 <- data.frame(
  CNA = c("gain", "gain", "loss"),
  chr = c(2L, 5L, 5L),
  Genes = c(
    "PDIA5,SEMA5B",
    "Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1",
    "RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5"
  ),
  stringsAsFactors = FALSE
)
# Second dataset: same columns; the (gain, 5) key occurs on two rows.
df2 <- data.frame(
  CNA = c("gain", "gain", "gain"),
  chr = c(5L, 5L, 2L),
  Genes = c(
    "Sall3,Nfatc1,SNORA5,SNORA5",
    "RNU6-866P,OR8J1,OR8K3,OR8K3",
    "PDIA5,DCC"
  ),
  stringsAsFactors = FALSE
)
Upvotes: 3
Reputation: 20811
Not very elegant, but this works:
# First table from the question, parsed from whitespace-separated text.
dt1 <- read.table(
  text = "CNA chr Genes
gain 5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
loss 5 RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
gain 2 PDIA5,SEMA5B",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Second table from the question, parsed the same way.
dt2 <- read.table(
  text = "CNA chr Genes
gain 5 Sall3,Nfatc1,SNORA5,SNORA5
gain 5 RNU6-866P,OR8J1,OR8K3,OR8K3
gain 2 PDIA5,DCC",
  header = TRUE,
  stringsAsFactors = FALSE
)
f <- function(x, y, z = 'Genes') {
  ## Intersect the comma-separated gene lists of two data frames, row by row.
  ##
  ## x, y: data frames whose first two columns are the keys (e.g. CNA, chr)
  ##       and which hold the gene list in column `z`. Rows are matched by
  ##       position (row i of x against row i of y), not by key, so both
  ##       inputs must already be aligned; rows of y past nrow(x) are ignored.
  ## z:    name (or index) of the gene-list column.
  ##
  ## Returns the first two columns of x plus a `Genes` column holding the
  ## shared genes, restricted to rows whose two key columns agree in x and y.
  n <- nrow(x)
  if (nrow(y) < n) {
    stop("'y' must have at least as many rows as 'x'", call. = FALSE)
  }
  rows <- seq_len(n)

  ## split the genes out and find common ones
  ## (as.character() lets factor columns through; strsplit() rejects factors)
  xx <- strsplit(as.character(x[, z]), ',')
  yy <- strsplit(as.character(y[, z]), ',')
  res <- lapply(rows, function(ii) intersect(xx[[ii]], yy[[ii]]))

  ## combine back into one of the data frames
  ## vapply() always yields a character vector, so the Genes column exists
  ## even for zero-row input
  genes <- vapply(res, paste, character(1), collapse = ',')
  res <- cbind(x[, 1:2], Genes = genes, stringsAsFactors = FALSE)

  ## make sure the chr and alterations are the same and only return those
  ## seq_len() avoids the 1:0 trap; isTRUE() treats an NA key as a mismatch
  idx <- vapply(rows, function(ii) isTRUE(all(x[ii, 1:2] == y[ii, 1:2])),
                logical(1))
  res[idx, ]
}
# Run the row-wise comparison on the two example tables; output shown below.
f(x = dt1, y = dt2)
# CNA chr Genes
# 1 gain 5 Sall3,Nfatc1
# 3 gain 2 PDIA5
Upvotes: 2