Reputation: 5618
This is a pretty simple bioinformatics implementation of a self-alignment matrix with R. It loops twice over a string sequence with a sliding window operator frag1
and compares to every fra2
of the same sequence.
The below code is extremely slow, not sure how to speed it up with standard R syntax. In python this would be super fast, but in R it takes 1 minute. I already reduced calculation by half by assigning i,j
and j,i
simultanously.
Any speed up ideas?
sequence = 'MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL'
if(!exists('BLOSUM50')){
library(Biostrings)
data(BLOSUM50)
#BLOSUM50['A','N']
}
windowSize<-24;
matrixSize<-nchar(sequence) - windowSize;
defaultValue = -10000000000;
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize);
for(i in 1:matrixSize){
frag1 = substr(sequence,i,i+windowSize);
for(j in 1:matrixSize){
frag2 = substr(sequence,j,j+windowSize);
totalScore = 0;
if(scoreMatrix[i,j] == defaultValue){
for(x in 1:windowSize){
totalScore = totalScore + BLOSUM50[substr(frag1,x,x),substr(frag2,x,x)] / windowSize;
}
scoreMatrix[i,j] = totalScore;
scoreMatrix[j,i] = totalScore;
}
}
}
return(scoreMatrix);
Upvotes: 1
Views: 202
Reputation: 4836
You original code on my not so new laptop (Lenovo Yoga 2 from 2014, R3.4) run in 17 seconds. After not so heavy optimization this time was reduced to 2 seconds. I just converted sequence
to vector at the beginning of the calculations. After that I changed indexing by names in BLOSUM50
to indexing by numeric index. It resulted in 0.5 seconds of execution time. See code and benchmarks below.
fun = function(sequence){
windowSize<-24
matrixSize<-nchar(sequence) - windowSize
defaultValue = -10000000000
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)
for(i in 1:matrixSize){
frag1 = substr(sequence,i,i+windowSize)
for(j in 1:matrixSize){
frag2 = substr(sequence,j,j+windowSize)
totalScore = 0
if(scoreMatrix[i,j] == defaultValue){
for(x in 1:windowSize){
totalScore = totalScore + BLOSUM50[substr(frag1,x,x),substr(frag2,x,x)] / windowSize
}
scoreMatrix[i,j] = totalScore
scoreMatrix[j,i] = totalScore
}
}
}
scoreMatrix
}
fun2 = function(sequence){
windowSize<-24
sequence = unlist(strsplit(sequence, split = ""))
matrixSize<-length(sequence) - windowSize
defaultValue = -10000000000
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)
for(i in 1:matrixSize){
frag1 = sequence[i:(i+windowSize)]
for(j in 1:matrixSize){
frag2 = sequence[j:(j+windowSize)]
totalScore = 0
if(scoreMatrix[i,j] == defaultValue){
for(x in 1:windowSize){
totalScore = totalScore + BLOSUM50[frag1[x],frag2[x]] / windowSize
}
scoreMatrix[i,j] = totalScore
scoreMatrix[j,i] = totalScore
}
}
}
scoreMatrix
}
fun3 = function(sequence){
windowSize = 24
sequence = unlist(strsplit(sequence, split = ""))
matrixSize = length(sequence) - windowSize
scoreMatrix = matrix(NA, nrow = matrixSize, ncol = matrixSize)
sequence_index = match(sequence, colnames(BLOSUM50))
for(i in seq_len(matrixSize)){
frag1 = sequence_index[i:(i+windowSize - 1)]
for(j in seq_len(matrixSize)){
frag2 = sequence_index[j:(j+windowSize - 1)]
if(is.na(scoreMatrix[i,j])){
totalScore = sum(BLOSUM50[(frag2 - 1)*NROW(BLOSUM50) + frag1])/windowSize
scoreMatrix[i,j] = totalScore
scoreMatrix[j,i] = totalScore
}
}
}
scoreMatrix
}
if(!exists('BLOSUM50')){
library(Biostrings)
data(BLOSUM50)
#BLOSUM50['A','N']
}
sequence = 'MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL'
library(microbenchmark)
microbenchmark(
original = fun(sequence),
fun2 = fun2(sequence),
fun3 = fun3(sequence),
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# original 16395.2108 16660.3295 17533.8563 16755.9680 17594.3596 20263.4137 5
# fun2 1992.7731 2010.4031 2027.7953 2015.9592 2034.9022 2084.9390 5
# fun3 472.0641 481.9267 496.2656 498.3259 506.6357 522.3755 5
Upvotes: 2