Kryo
Kryo

Reputation: 933

Comparing two data sets to find common names

How can I compare two datasets and find the common gene names, provided that the CNA and chr values of both datasets are the same?

dt1

    CNA     chr   Genes
    gain    5     Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
    loss    5     RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
    gain    2     PDIA5,SEMA5B

dt2

    CNA     chr   Genes
    gain    5     Sall3,Nfatc1,SNORA5,SNORA5
    gain    5     RNU6-866P,OR8J1,OR8K3,OR8K3
    gain    2     PDIA5,DCC

expected output

df3

    CNA     chr   Genes
    gain    5     Sall3,Nfatc1
    gain    2     PDIA5

I'm sure this is a trivial question, but I would appreciate any suggestions to help me along.

Upvotes: 4

Views: 168

Answers (2)

Colonel Beauvel
Colonel Beauvel

Reputation: 31171

Here is an approach:

library(data.table)

# Collapse df2 to one row per (CNA, chr), concatenating all of its gene lists.
# The aggregated column gets its own name (Genes2) so the join below does not
# depend on how data.table renames clashing columns: older releases produced
# `Genes.1`, current ones produce `i.Genes`.
df2 <- setDT(df2)[, list(Genes2 = paste0(Genes, collapse = ",")),
                  by = list(CNA, chr)]

# Keyed join of df1 with the collapsed df2. nomatch = 0L keeps only keys that
# exist in both tables, so a (CNA, chr) pair found only in df2 is dropped
# instead of surfacing later as an empty result row.
res <- setkey(setDT(df1), CNA, chr)[df2, nomatch = 0L]

#    CNA chr                          Genes                                                 Genes2
#1: gain   5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1 Sall3,Nfatc1,SNORA5,SNORA5,RNU6-866P,OR8J1,OR8K3,OR8K3
#2: gain   2                   PDIA5,SEMA5B                                              PDIA5,DCC

# Per (CNA, chr) group, split both gene strings and keep the shared names.
# unlist() (rather than [[1]]) pools every row of the group, so a key that
# occurs more than once in df1 is not silently truncated to its first row.
res[, paste0(intersect(unlist(strsplit(Genes, ",", fixed = TRUE)),
                       unlist(strsplit(Genes2, ",", fixed = TRUE))),
             collapse = ",")
    , by = list(CNA, chr)]

#    CNA chr           V1
#1: gain   5 Sall3,Nfatc1
#2: gain   2        PDIA5

Data:

# First example table: CNA type, chromosome, and a comma-separated gene list
# per row. Gene lists are kept as plain character strings.
df1 <- data.frame(
  CNA = c("gain", "gain", "loss"),
  chr = c(2L, 5L, 5L),
  Genes = c(
    "PDIA5,SEMA5B",
    "Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1",
    "RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5"
  ),
  stringsAsFactors = FALSE
)

# Second example table, same layout as the first. Note that the (gain, 5) key
# appears on two rows here.
df2 <- data.frame(
  CNA = c("gain", "gain", "gain"),
  chr = c(5L, 5L, 2L),
  Genes = c(
    "Sall3,Nfatc1,SNORA5,SNORA5",
    "RNU6-866P,OR8J1,OR8K3,OR8K3",
    "PDIA5,DCC"
  ),
  stringsAsFactors = FALSE
)

Upvotes: 3

rawr
rawr

Reputation: 20811

Not very elegant, but:

# First table from the question, parsed from whitespace-separated text.
# The first line supplies the column names; strings are not turned into factors.
dt1 <- read.table(
  text = "CNA     chr   Genes
gain    5     Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
loss    5     RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
gain    2     PDIA5,SEMA5B",
  header = TRUE,
  stringsAsFactors = FALSE
)


# Second table from the question, parsed the same way as the first.
dt2 <- read.table(
  text = "CNA     chr   Genes
gain    5     Sall3,Nfatc1,SNORA5,SNORA5
gain    5     RNU6-866P,OR8J1,OR8K3,OR8K3
gain    2     PDIA5,DCC",
  header = TRUE,
  stringsAsFactors = FALSE
)



## Row-wise comparison of two tables of comma-separated gene lists.
##
## Row i of `x` is compared with row i of `y` (rows are paired by position,
## not joined by key). For each pair the gene names present in both lists are
## kept, and only rows whose first two columns are equal in `x` and `y` are
## returned.
##
## x, y  data frames whose first two columns are the keys to compare and
##       which contain a column named by `z`; `y` needs at least as many rows
##       as `x` (extra rows of `y` are ignored)
## z     name of the column holding the comma-separated gene names
##
## Returns the first two columns of `x` plus a character column `Genes` with
## the shared names, restricted to rows with matching keys. Row names of `x`
## are preserved. Zero-row input gives a zero-row result.
f <- function(x, y, z = 'Genes') {
  n <- nrow(x)
  if (nrow(y) < n) {
    stop("'y' must have at least as many rows as 'x'", call. = FALSE)
  }

  ## split the genes out and find common ones
  xx <- strsplit(as.character(x[[z]]), ',', fixed = TRUE)
  yy <- strsplit(as.character(y[[z]]), ',', fixed = TRUE)
  ## seq_len() (not 1:n) so that zero rows means zero iterations
  common <- vapply(seq_len(n), function(ii) {
    paste(intersect(xx[[ii]], yy[[ii]]), collapse = ',')
  }, character(1))

  ## combine back into one of the data frames
  res <- cbind(x[, 1:2, drop = FALSE], Genes = common,
               stringsAsFactors = FALSE)

  ## make sure the chr and alterations are the same and only return those;
  ## only the first n rows of y take part, mirroring the positional pairing
  y_keys <- y[seq_len(n), 1:2, drop = FALSE]
  same_key <- x[[1]] == y_keys[[1]] & x[[2]] == y_keys[[2]]
  ## which() drops NA comparisons, so a missing key never yields an NA row
  res[which(same_key), , drop = FALSE]
}


# Compare the two example tables row by row. The row names in the printed
# result (1 and 3) are those of the dt1 rows that were kept.
f(dt1, dt2)

#    CNA chr        Genes
# 1 gain   5 Sall3,Nfatc1
# 3 gain   2        PDIA5

Upvotes: 2

Related Questions