El Dude
El Dude

Reputation: 5618

R function super slow

This is a pretty simple bioinformatics implementation of a self-alignment matrix with R. It loops twice over a string sequence with a sliding window operator frag1 and compares to every fra2 of the same sequence. The below code is extremely slow, not sure how to speed it up with standard R syntax. In python this would be super fast, but in R it takes 1 minute. I already reduced calculation by half by assigning i,j and j,i simultanously. Any speed up ideas?

sequence = 'MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL'

if(!exists('BLOSUM50')){
    library(Biostrings)
    data(BLOSUM50)
    #BLOSUM50['A','N']
  }

  windowSize<-24;
  matrixSize<-nchar(sequence) - windowSize;
  defaultValue = -10000000000;
  scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize);

  for(i in 1:matrixSize){
    frag1 = substr(sequence,i,i+windowSize);

    for(j in 1:matrixSize){

      frag2 = substr(sequence,j,j+windowSize);

      totalScore = 0;

      if(scoreMatrix[i,j] == defaultValue){
        for(x in 1:windowSize){
          totalScore = totalScore + BLOSUM50[substr(frag1,x,x),substr(frag2,x,x)] / windowSize;
        }

        scoreMatrix[i,j] = totalScore;
        scoreMatrix[j,i] = totalScore;
      }

    }
  }
  return(scoreMatrix);

Upvotes: 1

Views: 202

Answers (1)

Gregory Demin
Gregory Demin

Reputation: 4836

You original code on my not so new laptop (Lenovo Yoga 2 from 2014, R3.4) run in 17 seconds. After not so heavy optimization this time was reduced to 2 seconds. I just converted sequence to vector at the beginning of the calculations. After that I changed indexing by names in BLOSUM50 to indexing by numeric index. It resulted in 0.5 seconds of execution time. See code and benchmarks below.

fun = function(sequence){
     windowSize<-24
     matrixSize<-nchar(sequence) - windowSize
     defaultValue = -10000000000
     scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)

     for(i in 1:matrixSize){
         frag1 = substr(sequence,i,i+windowSize)

         for(j in 1:matrixSize){

             frag2 = substr(sequence,j,j+windowSize)

             totalScore = 0

             if(scoreMatrix[i,j] == defaultValue){
                 for(x in 1:windowSize){
                     totalScore = totalScore + BLOSUM50[substr(frag1,x,x),substr(frag2,x,x)] / windowSize
                 }

                 scoreMatrix[i,j] = totalScore
                 scoreMatrix[j,i] = totalScore
             }

         }
     }

     scoreMatrix
 }

 fun2 = function(sequence){
     windowSize<-24
     sequence = unlist(strsplit(sequence, split = ""))
     matrixSize<-length(sequence) - windowSize
     defaultValue = -10000000000
     scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)

     for(i in 1:matrixSize){
         frag1 = sequence[i:(i+windowSize)]

         for(j in 1:matrixSize){

             frag2 = sequence[j:(j+windowSize)]

             totalScore = 0

             if(scoreMatrix[i,j] == defaultValue){
                 for(x in 1:windowSize){
                     totalScore = totalScore + BLOSUM50[frag1[x],frag2[x]] / windowSize
                 }

                 scoreMatrix[i,j] = totalScore
                 scoreMatrix[j,i] = totalScore
             }

         }
     }

     scoreMatrix
 }

 fun3 = function(sequence){
     windowSize = 24
     sequence = unlist(strsplit(sequence, split = ""))
     matrixSize = length(sequence) - windowSize
     scoreMatrix = matrix(NA, nrow = matrixSize, ncol = matrixSize)
     sequence_index = match(sequence, colnames(BLOSUM50))
     for(i in seq_len(matrixSize)){
         frag1 = sequence_index[i:(i+windowSize - 1)]

         for(j in seq_len(matrixSize)){

             frag2 = sequence_index[j:(j+windowSize - 1)]

             if(is.na(scoreMatrix[i,j])){
                 totalScore = sum(BLOSUM50[(frag2 - 1)*NROW(BLOSUM50) + frag1])/windowSize
                 scoreMatrix[i,j] = totalScore
                 scoreMatrix[j,i] = totalScore
             }

         }
     }

     scoreMatrix
 }


 if(!exists('BLOSUM50')){
     library(Biostrings)
     data(BLOSUM50)
     #BLOSUM50['A','N']
 }

 sequence = 'MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL'
 library(microbenchmark)
 microbenchmark(
     original = fun(sequence),
     fun2 = fun2(sequence),
     fun3 = fun3(sequence),
     times = 5
 )

 # Unit: milliseconds
 # expr          min         lq       mean     median         uq        max neval
 # original 16395.2108 16660.3295 17533.8563 16755.9680 17594.3596 20263.4137     5
 # fun2      1992.7731  2010.4031  2027.7953  2015.9592  2034.9022  2084.9390     5
 # fun3       472.0641   481.9267   496.2656   498.3259   506.6357   522.3755     5

Upvotes: 2

Related Questions