duHaas
duHaas

Reputation: 428

Data.table loop efficiency

I have two data.tables that have 34 columns, where the columns are exactly the same.

Month SpId1 SpId2 ... SpId33

Edit:Here is sample data using the reproduce function from Reproducible Example

AltSuitSp1 <- data.table(structure(list(Month = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,12L, 12L, 12L), .Label = c("1", "10", "11", "12", "2", "3", "4","5", "6", "7", "8", "9"), class = "factor"), SpdSpSuit = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), SpdIncSuit = c(0, 0, 0,0, 0, 0, 0, 0, 0, 0), SpdGrowSuit = c(0.4625, 0.4625, 0.4625, 0.4625, 0.4625, 0.4625, 0.4625,0.4625, 0.4625, 0.4625), RzbSpSuit = c(0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333,0.283333333, 0.283333333, 0.283333333, 0.283333333), RzbIncSuit = c(0.34,0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34), RzbGrowSuit = c(0.283333333,0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333), FMSSpSuit = c(0.34,0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34), FMSIncSuit = c(0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425), FMSGrowSuit = c(0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333), BhsSpSuit = c(0.283333333,0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.233333333, 0.233333333, 0.233333333), BhsIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), BhsGrowSuit = c(0.283333333,0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.233333333, 0.233333333, 0.233333333), BrtSpSuit = c(0.866666667,0.866666667, 0.866666667, 0.866666667, 0.866666667, 0.866666667,0.866666667, 0.54, 0.54, 0.54), BrtIncSuit = c(0.8, 0.8, 0.8, 0.8, 0.8,0.8, 0.8, 0.43, 0.43, 0.43), BrtGrSuit = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8,0.8, 0.86, 0.86, 0.86), CcfSpSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), CcfIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),CcfGrSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), GsfSpSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), GsfIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), GsfGrSuit = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), RbtSpSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), RbtIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), RbtGrSuit = c(0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.925, 0.925, 0.925), SmbSpSuit = c(0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.675, 0.675,0.675), SmbIncSuit = c(0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.766666667,0.766666667, 0.766666667), SmbGrSuit = c(0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.0875, 0.0875, 0.0875), StbSpSuit = c(0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425), StbIncSuit = c(0, 0,0, 0, 0, 0, 0, 0, 0, 0), StbGrSuit = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), HbcSpSuit = c(0, 0, 0, 0, 0, 0, 0,0, 0, 0), HbcIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), HbcGrSuit = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.425, 0.425, 0.425)), .Names = c("Month", "SpdSpSuit", "SpdIncSuit", "SpdGrowSuit","RzbSpSuit", "RzbIncSuit", "RzbGrowSuit", "FMSSpSuit", "FMSIncSuit","FMSGrowSuit", "BhsSpSuit", "BhsIncSuit", "BhsGrowSuit", "BrtSpSuit","BrtIncSuit", "BrtGrSuit", "CcfSpSuit", "CcfIncSuit", "CcfGrSuit","GsfSpSuit", "GsfIncSuit", "GsfGrSuit", "RbtSpSuit", "RbtIncSuit","RbtGrSuit", "SmbSpSuit", "SmbIncSuit", "SmbGrSuit", "StbSpSuit","StbIncSuit", "StbGrSuit", "HbcSpSuit", "HbcIncSuit", "HbcGrSuit"), class = c("data.table", "data.frame"), row.names = c(NA, -10L))) 

AltSuitDates <- data.table(structure(list(Month = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 10L, 11L,12L), SpdSpT = c(NA, NA, NA, NA, NA, 1L, 1L, NA, NA, NA), SpdIncT = c(NA,NA, NA, NA, NA, 1L, 1L, NA, NA, NA), SpdGrT = c(1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L), RzbSpT = c(NA, NA, NA, 1L, 1L, 1L, NA,NA, NA, NA), RzbIncT = c(NA, NA, NA, 1L, 1L, 1L, NA, NA, NA,NA), RzbGrT = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), FmsSpT = c(NA,NA, 1L, 1L, NA, NA, NA, NA, NA, NA), FmsIncT = c(NA, NA, 1L,1L, 1L, NA, NA, NA, NA, NA), FMSGrT = c(1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L), BhsSpT = c(NA, NA, NA, 1L, 1L, 1L, NA, NA, NA,NA), BhsIncT = c(NA, NA, NA, 1L, 1L, 1L, NA, NA, NA, NA), BhsGrT = c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BRTsp = c(1L, 1L, 1L, NA,NA, NA, NA, 1L, 1L, 1L), BRTinc = c(1L, 1L, 1L, 1L, NA, NA, NA,1L, 1L, 1L), BRTgr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), CCFsp = c(NA, NA, NA, NA, 1L, 1L, 1L, NA, NA, NA), CCFinc = c(NA,NA, NA, NA, 1L, 1L, 1L, NA, NA, NA), CCFgr = c(1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L), GSFsp = c(NA, NA, NA, NA, 1L, 1L, 1L,NA, NA, NA), GSFinc = c(NA, NA, NA, NA, 1L, 1L, 1L, NA, NA, NA), GSFgr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), RBTsp = c(1L,1L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA), RBTinc = c(1L, 1L, 1L, 1L,1L, 1L, 1L, NA, NA, NA), RBTgr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L), SMBsp = c(NA, NA, NA, 1L, 1L, 1L, 1L, NA, NA, NA), SMBinc = c(NA, NA, NA, 1L, 1L, 1L, 1L, NA, NA, NA), SMBgr = c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), STBsp = c(NA, NA, NA, NA,NA, 1L, 1L, NA, NA, NA), STBinc = c(NA, NA, NA, NA, NA, 1L, 1L,NA, NA, NA), STBgr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), HBCsp = c(NA, NA, NA, 1L, 1L, 1L, NA, NA, NA, NA), HBCinc = c(NA,NA, NA, 1L, 1L, 1L, NA, NA, NA, NA), HBCgr = c(1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("Month", "SpdSpT", "SpdIncT","SpdGrT", "RzbSpT", "RzbIncT", "RzbGrT", "FmsSpT", "FmsIncT","FMSGrT", "BhsSpT", "BhsIncT", "BhsGrT", "BRTsp", "BRTinc", "BRTgr","CCFsp", "CCFinc", "CCFgr", "GSFsp", "GSFinc", "GSFgr", "RBTsp","RBTinc", "RBTgr", "SMBsp", "SMBinc", "SMBgr", "STBsp", "STBinc","STBgr", "HBCsp", "HBCinc", "HBCgr"), class = c("data.table","data.frame"), row.names = c(NA, -10L)))

Where SpId is a species identifier. One DT is 5+ million rows long (AltSuitSp1), the other is 12 (AltSuitDates). I am using the DT that is 12 rows long (corresponding to 12 months) to update the larger DT. Currently I am using a nested if, else if structure in a for loop to check a condition and update the larger DT based off the smaller DT (see code below)

h <- 1
n <- length(AltSuitSp1[,Month])
stm <- AltSuitSp1[,Month]  #  AltSuitSp1 is the 5+ million row DT

cond1 <- which(stm == 1)  #  list of all rows of AltSuitSp1 where the Month is = 1
cond2 <- which(stm == 2)  #  list of all rows of AltSuitSp1 where the Month is = 2
...
cond12 <- which(stm == 12)

for (h in seq(n)){
    if (any(cond1 == h)){
        set(AltSuitSp1,h,2:34,(AltSuitSp1[h,2:34,with=F] * AltSuitDates[1,2:34,with=F]))
    }else if (any(cond2 == h)){
        set(AltSuitSp1,h,2:34,(AltSuitSp1[h,2:34,with=F] * AltSuitDates[2,2:34,with=F]))
    }else if ...
    }else if (any(cond12)){
        set(AltSuitSp1,h,2:34,(AltSuitSp1[h,2:34,with=F] * AltSuitDates[12,2:34,with=F]))
    }else
        break
}

Now, I have run this code for 1 minute and checked to see how far h had advanced. Currently I am seeing about 29-30 loops a second and h has advanced to about 1800 iterations. However, even at 30 loops per second (which is rather slow: Using Set in DT), this code will take approximately 2 days to finish. However, as the output below shows, it is doing what I want/expect it to do.

AltSuitSp1Results <- data.table(structure(list(Month = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,12L, 12L, 12L), .Label = c("1", "10", "11", "12", "2", "3", "4","5", "6", "7", "8", "9"), class = "factor"), SpdSpSuit = c(NA,NA, NA, NA, NA, NA, NA, 0, 0, 0), SpdIncSuit = c(NA, NA, NA,NA, NA, NA, NA, 0, 0, 0), SpdGrowSuit = c(0, 0, 0, 0, 0, 0, 0,0.4625, 0.4625, 0.4625), RzbSpSuit = c(NA, NA, NA, NA, NA, NA,NA, 0.283333333, 0.283333333, 0.283333333), RzbIncSuit = c(NA,NA, NA, NA, NA, NA, NA, 0.34, 0.34, 0.34), RzbGrowSuit = c(0,0, 0, 0, 0, 0, 0, 0.283333333, 0.283333333, 0.283333333), FMSSpSuit = c(NA,NA, NA, NA, NA, NA, NA, 0.34, 0.34, 0.34), FMSIncSuit = c(NA,NA, NA, NA, NA, NA, NA, 0.425, 0.425, 0.425), FMSGrowSuit = c(0,0, 0, 0, 0, 0, 0, 0.283333333, 0.283333333, 0.283333333), BhsSpSuit = c(NA,NA, NA, NA, NA, NA, NA, 0.233333333, 0.233333333, 0.233333333), BhsIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0, 0, 0), BhsGrowSuit = c(0,0, 0, 0, 0, 0, 0, 0.233333333, 0.233333333, 0.233333333), BrtSpSuit = c(0.866666667,0.866666667, 0.866666667, 0.866666667, 0.866666667, 0.866666667,0.866666667, 0, 0, 0), BrtIncSuit = c(0.8, 0.8, 0.8, 0.8, 0.8,0.8, 0.8, 0.43, 0.43, 0.43), BrtGrSuit = c(0, 0, 0, 0, 0, 0,0, 0.86, 0.86, 0.86), CcfSpSuit = c(NA, NA, NA, NA, NA, NA, NA,0, 0, 0), CcfIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0, 0, 0),CcfGrSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), GsfSpSuit = c(NA,NA, NA, NA, NA, NA, NA, 0, 0, 0), GsfIncSuit = c(NA, NA,NA, NA, NA, NA, NA, 0, 0, 0), GsfGrSuit = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), RbtSpSuit = c(NA, NA, NA, NA, NA, NA, NA,0, 0, 0), RbtIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0, 0,0), RbtGrSuit = c(0, 0, 0, 0, 0, 0, 0, 0.925, 0.925, 0.925), SmbSpSuit = c(NA, NA, NA, NA, NA, NA, NA, 0.675, 0.675,0.675), SmbIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0.766666667,0.766666667, 0.766666667), SmbGrSuit = c(0, 0, 0, 0, 0, 0,0, 0.0875, 0.0875, 0.0875), StbSpSuit = c(NA, NA, NA, NA,NA, NA, NA, 0.425, 0.425, 0.425), StbIncSuit = c(NA, NA,NA, NA, NA, NA, NA, 0, 0, 0), StbGrSuit = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), HbcSpSuit = c(NA, NA, NA, NA, NA, NA, NA,0, 0, 0), HbcIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0, 0,0), HbcGrSuit = c(0, 0, 0, 0, 0, 0, 0, 0.425, 0.425, 0.425)), .Names = c("Month", "SpdSpSuit", "SpdIncSuit", "SpdGrowSuit","RzbSpSuit", "RzbIncSuit", "RzbGrowSuit", "FMSSpSuit", "FMSIncSuit","FMSGrowSuit", "BhsSpSuit", "BhsIncSuit", "BhsGrowSuit", "BrtSpSuit","BrtIncSuit", "BrtGrSuit", "CcfSpSuit", "CcfIncSuit", "CcfGrSuit","GsfSpSuit", "GsfIncSuit", "GsfGrSuit", "RbtSpSuit", "RbtIncSuit","RbtGrSuit", "SmbSpSuit", "SmbIncSuit", "SmbGrSuit", "StbSpSuit","StbIncSuit", "StbGrSuit", "HbcSpSuit", "HbcIncSuit", "HbcGrSuit"), class = c("data.table", "data.frame"), row.names = c(NA, -10L)))

Obviously, I am not going about this in an efficient manner and am doing some sloppy programming. However, I am struggling to figure out where exactly I can optimize my code. Am I trying to reinvent a built-in DT function DT? Am I in one of the Circles; Have I missed a place where I can vectorize: R Inferno?

Basically, I need to update columns 2:34 in the AltSuitSp1 DT, based off of the AltSuitDates DT, using the Month column as the condition to know which row to use from AltSuitDates DT to update AltSuitSp1. Any help is appreciated.

Upvotes: 0

Views: 206

Answers (3)

jlhoward
jlhoward

Reputation: 59355

Based in @BrodieG's comment, this has been reworked to accommodate 34 columns in SP2. On a dataset of your size (5e6 rows, 34 columns) it runs in about 4 min.

The basic approach is to append the reference columns (called Dates here), to SP1, one at a time. Then update the appropriate column of SP1, then repeat with the next column. This is fairly efficient in terms of memory (only 1 extra column at any point), and still takes advantage of data.table's set-by-reference.

library(data.table)
set.seed(1)
ncol <- 34
nrow <- 5e6
m    <- matrix(sample(10000:99999,nrow*ncol,replace=T),ncol=ncol)
SP1  <- data.table(Month=sample(1:12,nrow(m),replace=T),m)
m    <- matrix(sample(1:12,12*ncol,replace=T),ncol=ncol)
SP2  <- data.table(Month=sample(1:12,12),m)
cols <- paste0("SpId",(1:ncol(m)))
setnames(SP1,2:(ncol(m)+1),cols)
setnames(SP2,2:(ncol(m)+1),cols)

system.time({
  setkey(SP1,"Month")
  setkey(SP2,"Month")
  lapply(1:ncol,function(i){
    setnames(SP2,cols[i],"Dates")  # don't want colname collision in merge
    SP1[SP2[,c("Month","Dates")],Dates:=Dates]
    SP1[, cols[i] := lapply(.SD, `*`, Dates), .SDcols = cols[i]]
    setnames(SP2,"Dates",cols[i])  # set it back so next iteration works
  })
})
#    user  system elapsed 
#  219.54   22.45  242.59 

Upvotes: 0

BrodieG
BrodieG

Reputation: 52637

EDIT, updated to run with the data that was posted.

This should work:

AltSuitSp1$Month <- as.integer(levels(AltSuitSp1$Month))[AltSuitSp1$Month]
setkey(AltSuitDates, Month)
d.cols <- ncol(AltSuitDates) - 1L
AltSuitDates[AltSuitSp1, ][, 
  c(list(Month=Month), 
   mapply(
     `*`, 
     .SD[, 2:(d.cols + 1)],
     .SD[, (d.cols + 2):(2 * d.cols + 1)],
     SIMPLIFY=FALSE
) ) ]

Basically, you start by joining the two tables by Month (third line), then you use .SD, which is a special object that refers to the data table itself, to pass the first set of rows from AltSuitSp1 and the second set from AltSuitDates (these are now all in the same table) to mapply so that it can multiply them together. Here is the data I used:

library(data.table)
set.seed(1)
AltSuitSp1 <- do.call(rbind, replicate(3, data.table(Month=1:12, a=runif(12), b=runif(12), c=runif(12)), s=F))
AltSuitDates <- data.table(Month=1:12, a=runif(12, 5, 10), b=runif(12, 5, 10), c=runif(12, 5, 10))

Upvotes: 1

Dean MacGregor
Dean MacGregor

Reputation: 18426

Try this. I think it might be similar to BrodieG's but when I copy-pasted BrodieG's answer it didn't work for me and I don't know how to read through the mapply part to figure out what I did wrong....

comb<-merge(AltSuitSp1,AltSuitDates,by="Month")
sp<-colnames(AltSuitSp1)[2:NCOL(AltSuitSp1)]
dat<-colnames(AltSuitDates)[2:NCOL(AltSuitDates)]
comb[,eval(parse(text=paste0("list(Month,",paste0(sp,"=",sp,"*",dat,collapse=","),")")))]

I know the ol' eval(parse(text= isn't generally considered to be good practice but when all you have is a hammer it makes the screws go in the wood.

Upvotes: 0

Related Questions