Achal Neupane
Achal Neupane

Reputation: 5719

R code to work on genotype data

I have this data called mydf.

I need to match the letters (DNA letters) in column REF and ALT with the colnames(x) ("A","T","G","C") and get the corresponding numerical values pasted together as "REF,ALT".

However, there are some lines where I have "snp:+[0-9]" and "flat$" in TYPE column.

Now for "flat$" lines I want to:

  1. sum the ALT values from as many as "snp:+[0-9]" of the corresponding "start" ids, including the flat line itself, if the ALT letters are unique (please see the script enclosed in curly brackets for one flat line)
  2. paste that ALT value again as "REF,ALT" (REF value will be same for both "snp:+[0-9]" and "flat$" with same start ID)
  3. get the output as shown in result.

I have done this for one flat line,but I need help making the function for flatcase so that it will do the same for all flat lines.

How can I make a function to do this for flatcase?

Code

normalCase <- function(x, ns) {
      ref.idx <- which(ns == "REF")
      ref.allele <- x[ref.idx]
      ref.count <- x[which(ns == ref.allele)]

      alt.idx <- which(ns == "ALT")
      alt.allele <- x[alt.idx]
      alt.count <- x[which(ns == alt.allele)]

      paste(ref.count, alt.count, sep=",")
    }



    flatcase??{

     g<-x[,"start"]=="chr16:2530921"& grepl("snp:+[0-9]",x[,"TYPE"])
     myt<-x[g,]
     x[g,"ALT"]
     unique(x[g,"ALT"])
     c<-unique(x[g,"ALT"])
     flat<-myt[grepl("flat$",myt[,"TYPE"]),]
     c<-unique(x[g,"ALT"])
    alt.count<- sum(as.numeric(flat[c]))
    }

    calculateAD <- function(x, mat, ns) {
      if (grepl("flat$", x[which(ns == 'TYPE')])) {
        flatCase(x, mat, ns)
      } else {
        normalCase(x, ns)
      }
    }


    bamAD <- function(x) {
       new.x <- cbind(x, apply(x, 1, calculateAD, x, colnames(x)))
      colnames(new.x)[ncol(new.x)] <- "bam.AD"
      new.x      
    }

The function I have tried for flatCase is:

flatCase <- function(x, mat, ns) {
  id.idx <- which(ns == 'start')
  type.idx <- which(ns == 'TYPE')
  ref.idx <- which(ns == 'REF')
  alt.idx <- which(ns == 'ALT')


  id <- x[id.idx]
  #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
  #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
  m<-mat[grepl(id,mat[, id.idx]) & grepl("snp:+[0-9]",mat[, type.idx]),]
  #flat<-mat[grepl("flat$",mat[, type.idx]),]
  ref.allele <- x[ref.idx]
  ref.count<-x[which(ns == ref.allele)]


  alt.count <- sum(apply(m, 1, function(x) as.numeric(x[which(ns == x[alt.idx])])))
  paste(ref.count, alt.count, sep=",") 
}

mydf

x <- as.matrix(read.csv(text="start,A,T,G,C,REF,ALT,TYPE 
chr20:5363934,95,29,14,59,C,T,snp
chr5:8529759,24,1,28,41,G,C,snp
chr14:9620689,65,49,41,96,T,G,snp
chr18:547375,94,1,51,67,G,C,snp
chr8:5952145,27,80,25,96,T,T,snp
chr14:8694382,68,94,26,30,A,A,snp
chr16:2530921,49,15,79,72,A,T,snp:2530921
chr16:2530921,49,15,79,72,A,G,snp:2530921
chr16:2530921,49,15,79,72,A,T,snp:2530921flat
chr16:2533924,42,13,19,52,G,T,snp:2533924flat
chr16:2543344,4,13,13,42,G,T,snp:2543344flat
chr16:2543344,4,23,13,42,G,A,snp:2543344
chr14:4214117,73,49,18,77,G,A,snp
chr4:7799768,36,28,1,16,C,A,snp
chr3:9141263,27,41,93,90,A,A,snp", stringsAsFactors=FALSE))

Result:

       start           A    T    G    C    REF ALT TYPE              bam.AD  
 [1,] "chr20:5363934" "95" "29" "14" "59" "C" "T" "snp"             "59,29" 
 [2,] "chr5:8529759"  "24" " 1" "28" "41" "G" "C" "snp"             "28,41" 
 [3,] "chr14:9620689" "65" "49" "41" "96" "T" "G" "snp"             "49,41" 
 [4,] "chr18:547375"  "94" " 1" "51" "67" "G" "C" "snp"             "51,67" 
 [5,] "chr8:5952145"  "27" "80" "25" "96" "T" "T" "snp"             "80,80" 
 [6,] "chr14:8694382" "68" "94" "26" "30" "A" "A" "snp"             "68,68" 
 [7,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921"     "49,15" 
 [8,] "chr16:2530921" "49" "15" "79" "72" "A" "G" "snp:2530921"     "49,79" 
 [9,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921flat" "49,94"
[10,] "chr16:2533924" "42" "13" "19" "52" "G" "T" "snp:2533924flat" "19,13"  
[11,] "chr16:2543344" "42" "13" "13" "42" "G" "T" "snp:2543344flat" "13,55" 
[12,] "chr16:2543344" "42" "23" "13" "42" "G" "A" "snp:2543344"     "13,42" 
[13,] "chr14:4214117" "73" "49" "18" "77" "G" "A" "snp"             "18,73" 
[14,] "chr4:7799768"  "36" "28" " 1" "16" "C" "A" "snp"             "16,36" 
[15,] "chr3:9141263"  "27" "41" "93" "90" "A" "A" "snp"             "27,27" 

Upvotes: 0

Views: 355

Answers (2)

Ricky
Ricky

Reputation: 4686

Another approach:

# create dataframe
mydf <- as.data.frame(x, stringsAsFactors=FALSE)
# create temporary values based on REF and ALT
mydf$REFval <- diag(as.matrix(mydf[, mydf$REF]))
mydf$ALTval <- diag(as.matrix(mydf[, mydf$ALT]))

Here in the next step, you said to sum ALT "if the ALT letters are unique" but didn't specify which value to use if the ALT is the same but the value is different. It didn't matter in your sample data set since the values are the same, so in my code below I assumed the last ALT value to be used.

# sum up ALT values for all start ID
require(dplyr)
mydfs <- mydf %>% group_by(start, ALT) %>%
  summarize(ALTkeep=last(ALTval)) %>%  # assume keep last one if same ALT
  group_by(start) %>%
  summarize(ALTflat=sum(as.numeric(ALTkeep)))

# merge back into main dataframe
mydf <- left_join(mydf, mydfs)
# select ALT value for bam.AD depending on "flat$" in TYPE
mydf$bam.AD <- with(mydf,
  paste(REFval, ifelse(grepl("flat$", TYPE), ALTflat, ALTval), sep=","))

# optional clean up of temporary values
mydf <- mydf[, !(names(mydf) %in% c("REFval", "ALTval", "ALTflat"))]

The output as you wanted

                                   start  A  T  G  C REF ALT            TYPE bam.AD
1                          chr20:5363934 95 29 14 59   C   T             snp  59,29
2                           chr5:8529759 24  1 28 41   G   C             snp  28,41
3                          chr14:9620689 65 49 41 96   T   G             snp  49,41
4                           chr18:547375 94  1 51 67   G   C             snp  51,67
5                           chr8:5952145 27 80 25 96   T   T             snp  80,80
6                          chr14:8694382 68 94 26 30   A   A             snp  68,68
7                          chr16:2530921 49 15 79 72   A   T     snp:2530921  49,15
8                          chr16:2530921 49 15 79 72   A   G     snp:2530921  49,79
9                          chr16:2530921 49 15 79 72   A   T snp:2530921flat  49,94
10                         chr16:2533924 42 13 19 52   G   T snp:2533924flat  19,13
11                         chr16:2543344  4 13 13 42   G   T snp:2543344flat  13,55
12                         chr16:2543344 42 23 13 42   G   A     snp:2543344  13,42
13                         chr14:4214117 73 49 18 77   G   A             snp  18,73
14                          chr4:7799768 36 28  1 16   C   A             snp  16,36
15                          chr3:9141263 27 41 93 90   A   A             snp  27,27

Upvotes: 2

mathematical.coffee
mathematical.coffee

Reputation: 56915

Here's a way to do it all, vectorised.

First, note that REF is the same regardless of type. We can look it up quickly by using REF as a coordinate into the matrix, so e.g. row 1 has REF C, so if we look up coordinates (1, "C") we get the REF value for that row.

# the REFs are the same regardless of TYPE
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]

Have a look at cbind(1:nrow(x), x[, 'REF']): this is just a list of coordinates (row number, REF) and we use it to look up the REF number.

Then we do the same for ALT:

alt <- x[cbind(1:nrow(x), x[, 'ALT'])]

However we have to make sure that if the type is 'flat', we add all the other ALTs to the 'flat' row's ALT (only the unique ones, as you say).

First, work out which rows are flat:

which.flat <- grep('flat$', x[, 'TYPE'])

Next, for each flat row, look up ALT of the other rows with the same 'start' (that's the x[, 'start'] == x[i, 'start'] bit), and exclude rows that have duplicate ALT (that's the x[, 'ALT'] != x[i, 'ALT'] bit). Here i is the index of the current flat line. Add them all to the flat line's ALT. The sapply just vectorises this all for each flat line.

# add the other alts to the alt of the 'flat' line.
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
    function (i) {
        sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
             x[, 'ALT'] != x[i, 'ALT'] ]))
    })

now we just paste together:

x <- cbind(x, bam.AD=paste(ref, alt, sep=','))

The result is the same as yours except for row 10 where I believe you have made a mistake - there's only one row with "chr16:2533924" and its ALT is "T" (value 13), so bam.AD is "19,13" (you have "19,42" as if the ALT was "A", but it isn't).


If you must stick with the function form in your question (quite slow & inefficient!), it's essentially the same as what I have done (hence why you can do it without the apply call and skip the loop entirely):

flatCase <- function(x, mat, ns) { # get alt of the flat row alt <- as.numeric(x[x['ALT']])

# get the other rows with the same 'start' and different 'ALT'
xx <- mat[mat[, 'start'] == x['start'] & mat[, 'ALT'] != x['ALT'], ,drop=F]
if (nrow(xx) > 0) {
  # grab all the alts as done before
  rownames(xx) <- 1:nrow(xx)
  alt <- alt + sum(as.numeric(xx[cbind(1:nrow(xx), xx[, 'ALT'])]))
 }

ref <- x[x['REF']]
return(paste(ref, alt, sep=','))
}

However as mentioned before, if you vectorise it the entire code you have above reduces to just a few lines, and is faster:

newBamAD <- function (x) {
    # the version above
    rownames(x) <- 1:nrow(x)
    ref <- x[cbind(1:nrow(x), x[, 'REF'])]
    alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
    which.flat <- grep('flat$', x[, 'TYPE'])
    alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
        function (i) {
            sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
                 x[, 'ALT'] != x[i, 'ALT'] ]))
        })
    cbind(x, bam.AD=paste(ref, alt, sep=','))
}

library(rbenchmark)
benchmark(
  bamAD=bamAD(x),
  newBamAD=newBamAD(x)
)
#       test replications elapsed relative user.self sys.self user.child sys.child
# 1    bamAD          100   0.082    3.905     0.072    0.004          0         0
# 2 newBamAD          100   0.021    1.000     0.020    0.000          0         0

the vectorised version is almost 4x faster.

Upvotes: 2

Related Questions