Reputation: 729
I have a requirement, where in I have a huge database of around 2 million records where in I need to create new variables with codes based on information from another data frame for some specific variables. So the situation is -
So for example for an IBD 5 and variable var1a consider below information in inter1 file -
IBD var1a
5 11
5 18
5 30
5 63
Based on above information I want to create a new variable in smpl data frame such that -
if smpl$var1a <= 11 then var1a_INT = 1
if smpl$var1a > 11 & smpl$var1a <= 18 then var1a_INT = 2
if smpl$var1a > 18 & smpl$var1a <= 30 then var1a_INT = 3
if smpl$var1a > 30 & smpl$var1a <= 63 then var1a_INT = 4
if smpl$var1a > 63 then var1a_INT = 5
Since this needs to be done for multiple variables and by IBD, I have written my code using for loop. My sample code is as below -
set.seed(1200)
IBD <- sort(rep(1:10,4))
var1a <- c()
var2a <- c()
var3a <- c()
var4a <- c()
var5a <- c()
j=10
for (i in 1:10){
set.seed(1200)+(j*i)
var1 <- sort(sample(1:(10*i),4))
var2 <- sort(sample(11:(15*i),4))
var3 <- sort(sample(10:(17*i),4))
var4 <- sort(sample(11:(19*i),4))
var5 <- sort(sample(10:(16*i),4))
var1a <- c(var1a,var1)
var2a <- c(var2a,var2)
var3a <- c(var3a,var3)
var4a <- c(var4a,var4)
var5a <- c(var5a,var5)
}
inter1 <- data.frame(IBD,var1a,var2a,var3a,var4a,var5a)
sm=5000
ID <- seq(1:sm)
IBD <- sample(1:10,sm,replace = T)
CELL <- sample(1001:9999,sm)
var1a <- sample(1:150,sm,replace = T)
var2a <- sample(1:200,sm,replace = T)
var3a <- sample(1:200,sm,replace = T)
var4a <- sample(1:350,sm,replace = T)
var5a <- sample(1:250,sm,replace = T)
var6a <- sample(1:150,sm,replace = T)
var7a <- sample(1:250,sm,replace = T)
var8a <- sample(1:350,sm,replace = T)
var9a <- sample(1:450,sm,replace = T)
loc <- sample(1:20,sm,replace = T)
bill <- sample(1:2,sm,replace = T)
smpl <- data.frame(ID,IBD,CELL,var1a,var2a,var3a,var4a,var5a,var6a,var7a,var8a,var9a,loc,bill)
v0int <- c("var1a","var2a","var3a","var4a","var5a")
df_smpl <- data.frame(matrix(NA,nrow = 0,ncol = ncol(smpl)))
#l=1
start_time <- Sys.time()
for (l in (unique(inter1$IBD))){
df1 <- subset(smpl,IBD == l)
for (k in 1:length(v0int)){
#k=1
q0 <- v0int[k]
q1 <- sort(inter1[inter1$IBD == l,q0])
for (m in 1:nrow(df1)){
#print(q0)
#print(l)
#print(m)
if (length(q1) == 0){
df1[m,paste0(q0,"_INT")]=NA
} else if(length(q1) == 1){
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] <= q1[1]) df1[m,paste0(q0,"_INT")]=1
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[1]) df1[m,paste0(q0,"_INT")]=2
} else if(length(q1) == 2){
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] <= q1[1]) df1[m,paste0(q0,"_INT")]=1
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[1] & df1[m,q0] <= q1[2]) df1[m,paste0(q0,"_INT")]=2
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[2]) df1[m,paste0(q0,"_INT")]=3
} else if(length(q1) == 3) {
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] <= q1[1]) df1[m,paste0(q0,"_INT")]=1
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[1] & df1[m,q0] <= q1[2]) df1[m,paste0(q0,"_INT")]=2
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[2] & df1[m,q0] <= q1[3]) df1[m,paste0(q0,"_INT")]=3
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[3]) df1[m,paste0(q0,"_INT")]=4
} else if(length(q1) == 4) {
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] <= q1[1]) df1[m,paste0(q0,"_INT")]=1
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[1] & df1[m,q0] <= q1[2]) df1[m,paste0(q0,"_INT")]=2
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[2] & df1[m,q0] <= q1[3]) df1[m,paste0(q0,"_INT")]=3
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[3] & df1[m,q0] <= q1[4]) df1[m,paste0(q0,"_INT")]=4
if(!is.null(df1[m,q0]) & df1[m,"IBD"]==l & df1[m,q0] > q1[4]) df1[m,paste0(q0,"_INT")]=5
}
}
#q1 <- NULL
}
df_smpl <- rbind(df_smpl,df1)
#q0 <- NULL
}
time_taken <- as.numeric(difftime(Sys.time(), start_time, units = 'secs'))
For sample data of 5000 records this takes 5.859623 seconds on my machine which is having 16GB RAM SSD HDD with 2 cores.
When tried for a data with 500000 records this takes 752.7261 seconds.
My actual data is having 2 million records and I need to run this in a iterative manner multiple times so the time needed would increase in a big way.
On doing some search I understand data.table is much faster and saves huge amount of time. I do not know data.table very well and want to seek your help on this.
It would be a huge help and huge time saving if we can optimize this code.
Upvotes: 3
Views: 57
Reputation: 42564
There are two alternative approaches, rolling join and update in a non-equi join. Both are four to five times faster for the given sample dataset and less memory consuming than minem's solution.
It requires to create start
- end
intervals which is best done in long format
# create intervals in long format
long <- setDT(melt(inter1, "IBD", variable.name = "var"))
long <- rbind(long,
long[, CJ(IBD = IBD, var = var,
value = c(-.Machine$integer.max, .Machine$integer.max),
unique = TRUE)])[
order(IBD, var, value)]
long <- long[, .(start = head(value, -1L),
end = tail(value, -1L),
INT = 1:(.N - 1L)),
by = .(IBD, var)]
long
IBD var start end INT 1: 1 var1a -2147483647 2 1 2: 1 var1a 2 4 2 3: 1 var1a 4 8 3 4: 1 var1a 8 9 4 5: 1 var1a 9 2147483647 5 --- 246: 10 var5a -2147483647 29 1 247: 10 var5a 29 44 2 248: 10 var5a 44 45 3 249: 10 var5a 45 80 4 250: 10 var5a 80 2147483647 5
Note that the largest integer was used instead of Inf
in order to avoid coersion from integer to double.
Now, we loop over the specified columns and do the non-equi join on each column. Each iteration adds a new result column:
v0int <- c("var1a","var2a","var3a","var4a","var5a")
setDT(smpl)
for (col in v0int) {
smpl[long[var == col],
on = c("IBD", paste0(col, ">start"), paste0(col, "<=end")),
paste0(col, "_INT") := i.INT]
}
smpl[]
ID IBD CELL var1a var2a var3a var4a var5a var6a var7a var8a var9a loc bill var1a_INT var2a_INT var3a_INT var4a_INT var5a_INT 1: 1 7 6849 93 38 151 203 63 70 35 8 7 17 2 5 1 5 5 4 2: 2 9 2517 109 130 97 296 15 97 79 267 422 4 2 5 5 1 5 1 3: 3 10 9322 65 18 160 156 80 132 33 41 387 8 1 5 1 5 4 4 4: 4 10 7377 105 8 87 263 101 110 207 224 331 11 2 5 1 1 5 5 5: 5 4 6991 72 144 187 144 117 125 123 84 60 3 1 5 5 5 5 5 --- 4996: 4996 6 5129 56 188 21 74 105 133 192 45 284 5 1 5 5 1 3 5 4997: 4997 2 2657 8 50 127 6 119 81 60 250 209 3 2 2 5 5 1 5 4998: 4998 2 1473 128 90 156 74 203 5 198 63 10 17 1 5 5 5 5 5 4999: 4999 9 2120 66 141 170 256 151 68 205 97 8 9 2 5 5 5 5 5 5000: 5000 2 4555 109 102 92 98 11 107 104 210 266 14 2 5 5 5 5 1
Note that the join conditions (on =
) are created dynamically as character strings.
Frank has pointed out that a rolling join would also be applicable here as there are no gaps in the intervals.
The OP has specified right closed intervals, e.g.,
if smpl$var1a > 11 & smpl$var1a <= 18 then var1a_INT = 2
Therefore, we need a backward rolling join which uses the end
values of the intervals.
In a regular join, the join parameters must match exactly. In a backward rolling join, if there is no exact match so that the value falls in the gap between two end
values, then the next observation is carried backwards (NOCB).
long <- setDT(melt(inter1, "IBD", variable.name = "var", value.name = "end"))
long <- rbind(long,
long[, CJ(IBD = IBD, var = var, end = .Machine$integer.max,
unique = TRUE)])
setorder(long, IBD, var, end)
long[, INT := rowid(IBD, var)]
v0int <- c("var1a","var2a","var3a","var4a","var5a")
setDT(smpl)
for (col in v0int) {
smpl[, paste0(col, "_INT") := long[var == col][
smpl, on = c("IBD", paste0("end==", col)),
roll = -Inf, x.INT]]
}
The non-equi join and the rolling join is compared against the two fastest variants of minem's answer, the ones which update smpl
by reference to avoid repeated calls to rbind()
.
The results are equal except for a different order of rows.
As all solutions update smpl
by reference, all benchmark runs start with a fresh copy()
of the original dataset.
library(bench)
my_check <- function(x, y) {
all.equal(x[order(ID)], y[order(ID)])
}
v0int <- c("var1a","var2a","var3a","var4a","var5a")
bm <- mark(
rj = {
smpl <- copy(smpl0)
long <- setDT(melt(inter1, "IBD", variable.name = "var", value.name = "end"))
long <- rbind(long,
long[, CJ(IBD = IBD, var = var, end = .Machine$integer.max,
unique = TRUE)])
setorder(long, IBD, var, end)
long[, INT := rowid(IBD, var)]
setDT(smpl)
for (col in v0int) {
smpl[, paste0(col, "_INT") := long[var == col][
smpl, on = c("IBD", paste0("end==", col)),
roll = -Inf, x.INT]]
}
smpl[]
},
nej = {
smpl <- copy(smpl0)
long <- setDT(melt(inter1, "IBD", variable.name = "var"))
long <- rbind(long,
long[, CJ(IBD = IBD, var = var,
value = c(-.Machine$integer.max, .Machine$integer.max),
unique = TRUE)])[
order(IBD, var, value)]
long <- long[, .(start = head(value, -1L),
end = tail(value, -1L),
INT = 1:(.N - 1L)),
by = .(IBD, var)]
setDT(smpl)
for (col in v0int) {
smpl[long[var == col],
on = c("IBD", paste0(col, ">start"), paste0(col, "<=end")),
paste0(col, "_INT") := i.INT]
}
smpl[]
},
minem1 = {
smpl <- copy(smpl0)
setDT(smpl) # convert smpl to data.table
setkey(smpl, IBD) # setkey on IBD for faster `IBD == l` operation
for (l in (unique(inter1$IBD))) {
for (k in 1:length(v0int)) {
q0 <- v0int[k]
q1 <- sort(inter1[inter1$IBD == l, q0])
smpl[IBD == l, paste0(q0, "_INT") := as.integer(cut(get(q0), c(0, q1, Inf)))]
}
}
smpl[]
},
minem2 = {
smpl <- copy(smpl0)
setDT(smpl) # convert smpl to data.table
setkey(smpl, IBD) # setkey on IBD for faster `IBD == l` operation
for (l in (unique(inter1$IBD))) {
for (k in 1:length(v0int)) {
q0 <- v0int[k]
q1 <- sort(inter1[inter1$IBD == l, q0])
smpl[IBD == l, paste0(q0, "_INT") := cut(get(q0), c(0, q1, Inf), labels = FALSE)]
}
}
smpl[]
},
check = my_check,
min_time = 1
)
bm
# A tibble: 4 x 14 expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc <chr> <bch:tm> <bch:tm> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <lis> <lis> 1 rj 20ms 22.5ms 22ms 28ms 44.5 1.73MB 3 41 921ms <data~ <Rpro~ <bch~ <tib~ 2 nej 25.3ms 28.3ms 28.6ms 30.9ms 35.4 2.31MB 2 20 566ms <data~ <Rpro~ <bch~ <tib~ 3 minem1 106.2ms 113.8ms 110.3ms 129.9ms 8.79 6.4MB 2 7 797ms <data~ <Rpro~ <bch~ <tib~ 4 minem2 98.8ms 101.8ms 101.6ms 106.3ms 9.83 5.66MB 3 7 712ms <data~ <Rpro~ <bch~ <tib~
The rolling join is about five times faster than minem's solutions, the non-equi join is four times faster. In addition, two to four times less memory is allocated.
ggplot2::autoplot(bm)
Upvotes: 2
Reputation: 3650
For your example data I got the same results using this loop:
for (l in (unique(inter1$IBD))){
df1 <- subset(smpl, IBD == l)
for (k in 1:length(v0int)){
q0 <- v0int[k]
q1 <- sort(inter1[inter1$IBD == l,q0])
x <- as.integer(cut(df1[, q0], c(0, q1, Inf)))
df1[, paste0(q0,"_INT")] <- x
}
df_smpl <- rbind(df_smpl, df1)
}
0.42 sek vs 10 sek
Using data.table
we can easily add the results straight to the original data table. Which will be mush faster than using rbind
.
setDT(smpl) # convert smpl to data.table
setkey(smpl, IBD) # setkey on IBD for faster `IBD == l` operation
start_time <- Sys.time()
for (l in (unique(inter1$IBD))) {
for (k in 1:length(v0int)) {
q0 <- v0int[k]
q1 <- sort(inter1[inter1$IBD == l, q0])
smpl[IBD == l, paste0(q0, "_INT") := as.integer(cut(get(q0), c(0, q1, Inf)))]
}
}
smpl # end result data.table
The major difference will be that the end result will have different row order than your original result.
Using this line it should be faster:
smpl[IBD == l, paste0(q0, "_INT") := cut(get(q0), c(0, q1, Inf), labels = F)]
Upvotes: 1