marnix
marnix

Reputation: 1172

How to optimize this for loop in R for a large data.table

I am working on a large data.table (2.5 million rows) of interbank loans. Here is an extract of the first 20:

> dput(head(clean,20))
structure(list(time = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 
4L, 4L, 1L, 2L, 3L, 4L, 3L, 4L, 4L, 4L), bal = structure(c(2L, 
4L, 4L, 4L, 4L, 4L, 3L, 3L, 9L, 4L, 2L, 3L, 3L, 3L, 3L, 2L, 4L, 
5L, 2L, 15L), .Label = c("32001", "32002", "32003", "32004", 
"32005", "32006", "32007", "32008", "32009", "32010", "32201", 
"32202", "32203", "32204", "32205", "32206", "32207", "32208", 
"32209", "32210"), class = "factor"), lender = c(2003L, 2547L, 
2547L, 574L, 574L, 574L, 2984L, 3015L, 812L, 3278L, 3124L, 3124L, 
41L, 354L, 3156L, 3156L, 735L, 735L, 1421L, 3319L), borrower = c(2285L, 
2285L, 2285L, 2285L, 2285L, 2285L, 2285L, 2285L, 269L, 2839L, 
2839L, 2839L, 2839L, 2897L, 2399L, 2399L, 1816L, 1816L, 2476L, 
3033L), obm = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0), obd = c(1, 0.3, 0.2, 0.35, 0.7, 0.5, 0.4, 1.2, 
4, 0.16, 4, 4, 0.5, 0.1, 1.4, 1.4, 4, 1, 3.25, 0.4), obk = c(1, 
0, 0, 0, 0, 0, 0, 0.5, 0, 0, 0, 4, 0.5, 0.1, 0, 0, 0, 0, 3.25, 
0), oem = c(0, 0.3, 0.2, 0.35, 0.7, 0.5, 0.4, 0.7, 4, 0.16, 4, 
0, 0, 0, 1.4, 1.4, 4, 1, 0, 0.4), r = c(35, 63, 63, 63, 63, 63, 
60, 60, 3, 55, 25, 12, 34, 0, 5, 4, 60, 60, 60, 35), type = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L), .Label = c("loan", "deposit"), class = "factor"), 
    term = structure(c(2L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 9L, 4L, 
    2L, 3L, 3L, 3L, 3L, 2L, 4L, 5L, 2L, 5L), .Label = c("overdraft", 
    "<1d", "2-7d", "8-30d", "31-90d", "91-180d", "0.5-1y", "1-3y", 
    ">3y", "demand"), class = "factor"), reported = structure(c(10561, 
    10561, 10561, 10561, 10561, 10561, 10561, 10561, 10531, 10561, 
    10561, 10561, 10470, 10500, 10531, 10561, 10531, 10561, 10561, 
    10561), class = "Date"), issued = structure(c(10542, 10543.5, 
    10550, 10556.5, 10553.5, 10555.5, 10558, 10558, 10515, 10557.5, 
    10560, 10555, 10465, 10488, 10527, 10560, 10515.5, 10545.5, 
    10541, 10544), class = "Date"), issued_radius = c(0, 10.5, 
    10, 3.5, 6.5, 4.5, 2, 2, 15, 2.5, 0, 2, 2, 2, 2, 0, 10.5, 
    14.5, 0, 13), due = structure(c(10543, 10563, 10570, 10583, 
    10577, 10581, 10563, 10563, 11966, 10585, 10561, 10560, 10470, 
    10493, 10532, 10561, 10535, 10611, 10542, 10589), class = "Date"), 
    month = c(4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 1, 2, 3, 4, 
    3, 4, 4, 4), week = c(14, 14, 15, 16, 16, 16, 17, 17, 10, 
    16, 17, 16, 3, 7, 12, 17, 10, 15, 14, 15)), .Names = c("time", 
"bal", "lender", "borrower", "obm", "obd", "obk", "oem", "r", 
"type", "term", "reported", "issued", "issued_radius", "due", 
"month", "week"), class = c("data.table", "data.frame"), row.names = c(NA, 
-20L), .internal.selfref = <pointer: 0x2960818>)

The three columns of interest in clean are issued, issued_radius and week, but I have included all columns since they affect the loop's performance.

Every row stands for a loan that I want to estimate the Date of issuance for, with weekly resolution. This issuance date lies in the interval [issued - issued_radius, issued + issued_radius]. This interval could span 1 day, or maybe several weeks (with a maximum of one month, or a maximum of 5 weeks). The code generates this interval and counts how many weeks, from an offset date, are included in the interval. Each of these weeks is assigned a weight consistent with the overlap. For example, one loan in clean that could be issued in week 17 and week 18, as derived from the interval, gets expanded into two loans in patch, and the loan volumes (columns oem, obd, etc.) are scaled with this weight.

library(data.table)

START_DATE <- as.Date("1998-8-1")

# Number of whole weeks elapsed from start_date to each date in t
# (floored, so partial weeks round down). Vectorized over t.
elapsed_weeks <- function(t, start_date) {
  delta <- difftime(t, start_date, units = "weeks")
  floor(as.numeric(delta))
}

#load("clean.Rda")

# One-day intervals (issued_radius == 0) need no expansion and are
# copied to the result immediately; only the remaining rows are expanded.
patch <- clean[issued_radius == 0]
clean <- clean[issued_radius != 0]

N <- nrow(clean)
write_index <- nrow(patch) + 1

# Preallocate space in patch so rows are written in place instead of
# growing the table. A one-month interval can straddle up to 6 distinct
# floor-week bins (e.g. days d..d+30 with d %% 7 == 6 hit 6 weeks), so
# allocate 6 rows per loan rather than 5; unused rows have type == NA
# and are dropped at the end.
dummy <- data.table(time = rep(0, N * 6))
patch <- rbindlist(list(patch, dummy), use.names = TRUE, fill = TRUE)

# NOTE(review): this row-by-row loop is the bottleneck — each iteration
# subsets clean and writes a single row into patch. seq_len(N) (not 1:N)
# ensures the body is skipped entirely when N == 0.
for (k in seq_len(N)) {
  entry <- clean[k]

  # Recover the Date interval [i, j] of possible issuance days.
  i <- entry$issued - entry$issued_radius
  j <- entry$issued + entry$issued_radius

  # Enumerate every day in the interval, map each day to its week offset
  # from START_DATE, and count days per week. (Renamed from `T`, which
  # shadowed the TRUE alias.)
  days <- seq.Date(i, j, by = "day")
  week_counts <- table(elapsed_weeks(days, START_DATE))

  for (name in names(week_counts)) {
    week_number <- as.numeric(name)
    # Weight = share of the interval's days falling in this week.
    week_weight <- as.numeric(week_counts[name]) / length(days)

    new_entry <- entry

    new_entry$week <- week_number
    new_entry$obm <- entry$obm * week_weight
    new_entry$obd <- entry$obd * week_weight
    new_entry$obk <- entry$obk * week_weight
    new_entry$oem <- entry$oem * week_weight

    patch[write_index] <- new_entry

    write_index <- write_index + 1
  }
}

# Drop the unused preallocated rows (their type is NA).
patch <- patch[!is.na(type)]

print(nrow(patch) / nrow(clean)) # expansion factor, bounded by bins/loan

edit 2: Adding another example.

> clean[2]
   time   bal lender borrower obm obd obk oem  r type  term   reported     issued issued_radius        due
1:    4 32004   2547     2285   0 0.3   0 0.3 63 loan 8-30d 1998-12-01 1998-11-13          10.5 1998-12-03
   month week
1:     4   14

For this loan, it could be issued on any day in [1998-11-03, 1998-11-24]. Every day in this interval is mapped to the number of weeks it is offset from START_DATE:

> x
 [1] "1998-11-03" "1998-11-04" "1998-11-05" "1998-11-06" "1998-11-07" "1998-11-08" "1998-11-09" "1998-11-10"
 [9] "1998-11-11" "1998-11-12" "1998-11-13" "1998-11-14" "1998-11-15" "1998-11-16" "1998-11-17" "1998-11-18"
[17] "1998-11-19" "1998-11-20" "1998-11-21" "1998-11-22" "1998-11-23" "1998-11-24"
> elapsed_weeks(x, START_DATE)
 [1] 13 13 13 13 14 14 14 14 14 14 14 15 15 15 15 15 15 15 16 16 16 16

Now we make a frequency table to deduce the weight for each possible week of issuance for the loan.

> table(elapsed_weeks(x, START_DATE))

13 14 15 16 
 4  7  7  4 

So this loan will be expanded into four loans with week column values {13, 14, 15, 16}. The volumes of this loan are scaled with the frequency weights of the set of possible weekly offsets.

> table(elapsed_weeks(x, START_DATE)) / length(x)

       13        14        15        16 
0.1818182 0.3181818 0.3181818 0.1818182 

Thus we end up with patch looking like this:

> patch
   time   bal lender borrower obm        obd obk        oem  r type  term   reported     issued
1:    4 32004   2547     2285   0 0.05454545   0 0.05454545 63 loan 8-30d 1998-12-01 1998-11-13
2:    4 32004   2547     2285   0 0.09545455   0 0.09545455 63 loan 8-30d 1998-12-01 1998-11-13
3:    4 32004   2547     2285   0 0.09545455   0 0.09545455 63 loan 8-30d 1998-12-01 1998-11-13
4:    4 32004   2547     2285   0 0.05454545   0 0.05454545 63 loan 8-30d 1998-12-01 1998-11-13
   issued_radius        due month week
1:          10.5 1998-12-03     4   13
2:          10.5 1998-12-03     4   14
3:          10.5 1998-12-03     4   15
4:          10.5 1998-12-03     4   16

I already did some optimization thanks to @David (How to speed up rbind?), but the result is still very slow. After ten hours of nightly computation I had processed 4% of the clean data table.

So my question is: how can I scale this loop to a large data.table?

Thank you all for your time.

edit: R version 3.3.1 (2016-06-21).

Upvotes: 2

Views: 169

Answers (1)

Roland
Roland

Reputation: 132706

If I've understood your explanation correctly, you should use an overlaps join in data.table.

# Define the start and end date of each loan's issuance interval.
# NOTE(review): issued can be fractional (e.g. 10543.5 days since epoch);
# fractional days could be an issue here, but I have not checked further.
DT[, c("start", "end") := .(issued - issued_radius, issued + issued_radius)]
# Create a per-row ID so weights can later be normalized within each loan.
DT[, id := .I]

# Build a lookup table with the first and last day of every week of the
# year. NOTE(review): week() numbers weeks from Jan 1, not from the
# questioner's START_DATE offset — confirm the numbering matches before use.
weeks <- data.table(date = seq(as.Date("1998-01-01"), as.Date("1998-12-31"), by = "1 day"))
weeks[, week := week(date)]
weeks <- weeks[, .(start = min(date), end = max(date)), by = week]
# foverlaps() requires the lookup table to be keyed on its interval columns.
setkey(weeks, start, end)

# Overlaps join: one output row per (loan, overlapping week) pair.
DT1 <- foverlaps(DT, weeks)
# Days of each loan interval falling inside each week: start from a full
# week (7) and trim the parts cut off at either end of the loan interval.
# Special handling of the last and first week of the year might be
# necessary here (those weeks can be shorter than 7 days).
DT1[, overlap := 7 - (i.start > start) * (i.start - start) -  (i.end < end) * (end - i.end)]
# Per-loan weights: each week's share of the loan's total overlap days.
DT1[, weight := as.numeric(overlap) / sum(as.numeric(overlap)), by = id]
# Apply the weights to the volume columns, writing new *_w columns.
DT1[, c("obm_w",  "obd_w",  "obk_w",  "oem_w") := lapply(.SD, function(x) x * DT1[["weight"]]), 
    .SDcols = c("obm",  "obd",  "obk",  "oem")]

Please check carefully if this does what you need and adjust as necessary.

Upvotes: 5

Related Questions