Kaleb
Kaleb

Reputation: 1022

Vectorize/Speed up Code with Nested For Loops

The code below generates the desired output. However, the lack of vectorizing means it runs very slowly. How can I speed it up?

I've put the dput results from part of some indicative data.

Input dputs

  1. StandRef input

    structure(list(id = structure(c(43L, 50L, 17L, 45L, 9L, 5L, 49L, 
    33L, 48L, 39L, 71L, 64L, 44L, 47L, 58L, 24L, 15L, 37L, 14L, 11L, 
    26L, 57L, 4L, 30L, 72L, 21L, 23L, 60L, 38L, 59L, 29L, 19L, 6L, 
    46L, 36L, 3L, 63L, 55L, 51L, 35L, 10L, 7L, 16L, 73L, 42L, 52L, 
    41L, 27L, 25L, 61L, 20L, 70L, 53L, 18L, 31L, 22L, 1L, 8L, 2L, 
    40L, 65L, 67L, 28L, 56L, 13L, 32L, 54L, 66L, 68L, 34L, 12L, 69L, 
    62L), .Label = c("ID 1009445", "ID 120763", "ID 133883", "ID 136398", 
    "ID 171850", "ID 192595", "ID 197597", "ID 216406", "ID 21888", 
    "ID 230940", "ID 23777", "ID 282791", "ID 306348", "ID 309745", 
    "ID 326928", "ID 344897", "ID 34974", "ID 350157", "ID 391831", 
    "ID 402479", "ID 43010", "ID 484078", "ID 484697", "ID 537134", 
    "ID 562259", "ID 562455", "ID 567042", "ID 572866", "ID 578945", 
    "ID 595683", "ID 59759", "ID 598460", "ID 603611", "ID 603757", 
    "ID 607991", "ID 60976", "ID 622720", "ID 646989", "ID 656144", 
    "ID 668807", "ID 669435", "ID 720522", "ID 740555", "ID 745499", 
    "ID 746001", "ID 783969", "ID 78979", "ID 792426", "ID 793541", 
    "ID 797860", "ID 806559", "ID 810517", "ID 826054", "ID 837609", 
    "ID 839287", "ID 867918", "ID 869788", "ID 875380", "ID 876870", 
    "ID 882220", "ID 893116", "ID 895909", "ID 899050", "ID 900143", 
    "ID 908100", "ID 912185", "ID 916371", "ID 916620", "ID 957879", 
    "ID 966195", "ID 993247", "ID 998911", "ID 999610"), class = "factor"), 
        region = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
        2L), location = c(259090L, 559306L, 2227063L, 2369217L, 4026978L, 
        4211264L, 4679449L, 5105226L, 5106345L, 5344670L, 5473601L, 
        5476528L, 5871970L, 6461228L, 6700029L, 6708265L, 7639959L, 
        9297695L, 10254788L, 10328812L, 11102816L, 11568295L, 11720437L, 
        12843457L, 14012506L, 14156669L, 14632300L, 14641938L, 15298211L, 
        15468425L, 15534406L, 16279682L, 16699353L, 17226952L, 17320785L, 
        269017L, 453097L, 828833L, 954610L, 954842L, 1066378L, 1217332L,  
        1253530L, 1277716L, 1292857L, 1337952L, 1439657L, 1452989L, 
        1712345L, 1758035L, 2601630L, 2640359L, 2778095L, 3151129L, 
        3369931L, 3399080L, 3529525L, 3810217L, 3821120L, 3841588L, 
        3901557L, 4111633L, 4220440L, 4528632L, 4665450L, 5099307L, 
        5260242L, 5958770L, 5966356L, 6137405L, 6246065L, 6297231L, 
        6807949L)), .Names = c("id", "region", "location"), class = "data.frame", row.names = c(NA, 
    -73L))
    
  2. Two sample inputs

Sample 1

        structure(list(region = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), 
        begin = c(0L, 2259252L, 5092077L, 9158205L, 0L, 135094L, 
        941813L, 5901391L, 6061324L), finish = c(2259252L, 5092077L, 
        9158205L, 20463033L, 135094L, 941813L, 5901391L, 6061324L, 
        7092402L), sed = c(3.98106154985726, 7.51649828394875, 5.15440228627995, 
        2.67456624889746, 7.54309412557632, 4.17413910385221, 7.47043058509007, 
        6.13362524658442, 1.00084994221106)), .Names = c("region", 
        "begin", "finish", "sed"), class = "data.frame", row.names = c(NA, 
        -9L))

Sample 2

        structure(list(region = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), 
        begin = c(0L, 2253252L, 7091077L, 9120205L, 0L, 135094L, 
        941813L, 5901391L, 6061324L), finish = c(2253252L, 7091077L, 
        9120205L, 17463033L, 135094L, 941813L, 5901391L, 6061324L, 
        7092402L), sed = c(3.31830840984048, 1.38014704208403, 6.13049140975458, 
        2.10349875097134, 0.48170587509345, 0.13058713509175, 9.13509713513509, 
        6.13047153058701, 3.81734081501503)), .Names = c("region", 
        "begin", "finish", "sed"), class = "data.frame", row.names = c(NA, 
        -9L))

Unvectorized Code

matchLocationsToRegions <- function(path) {     
# get list of data files (around 500 of these; only dput of 2 given: sample262519 and sample252519)
setwd(path,sep="",collapse=NULL)
data_files <- list.files()

# read in template file with complete regional boundaries
standRef <- read.table(paste(path, "StandRef.txt",sep="",collapse=NULL), header=TRUE, sep="\t")

# pre-allocate a df with row dimensions of standRef and num of columns according to num of data files
sediment.df <- as.data.frame(matrix(NA,nrow=nrow(standRef),ncol=length(data_files)))
colnames(sediment.df) <- data_files
rownames(sediment.df) <- standRef[,1]

# create a counter for columns filled
col_counter <- 1    

for (file in data_files) {
    # read in current, processed data
    sample <- read.table(file, header=TRUE, sep="\t")          

    # pre-allocate vectors for sedimentation data vector
    sed <- rep(NA, nrow(standRef))

    # create a variable to track end boundary for a particular sample_ID
    end_tracker <- 1

    index <- unlist(lapply (unique(standRef$region), function(reg) {
             reg.filter <- which(standRef$region == reg)
             samp.filter <- which(sample$region == reg)
             samp.filter[cut(standRef$location[reg.filter],c(0L,sample$finish[samp.filter]),labels=F)]
        }))
    sed <- sample$sed[index]

    # fill in next, unfilled column of relevant df with data from relevant vector
    sediment.df[col_counter] <- sed   

    # update column counter variable 
    col_counter <- col_counter + 1
}       

# save df as a table
write.table(sediment.df,file="samples_sed.txt", row.names=TRUE, sep="\t") 
}

Running Rprof showed that "scan" "read.table" "matchLocationsToRegions" and "type.convert" "read.table" "matchLocationsToRegions" predominate runtime. Presumably, there's a bottleneck by for-looping over this line:

sample <- read.table(file, header=TRUE, sep="\t")      

Update: for loop over the regions has been replaced with much faster-executing code (h/t Simon Urbanek). However, the rest is quite slow.

Upvotes: 4

Views: 631

Answers (1)

Simon Urbanek
Simon Urbanek

Reputation: 13932

You can remove the loop easily:

sediment.df <- as.data.frame(lapply(data_files, function(file) {
   sample <- read.table(file, header=TRUE, sep="\t")          
   index <- unlist(lapply (unique(standRef$region), function(reg) {
         reg.filter <- which(standRef$region == reg)
         samp.filter <- which(sample$region == reg)
          samp.filter[cut(standRef$location[reg.filter],c(0L,sample$finish[samp.filter]),labels=F)]
    }))
    sample$sed[index]
}))
colnames(sediment.df) <- data_files
rownames(sediment.df) <- standRef[,1]

However, it is not unlikely that a lot of time is spent in read.table so you may consider a) using scan, b) creating just one file with all samples (e.g. use an extra column to define the sample) so you don't need to load many files.

Upvotes: 1

Related Questions