Reputation: 16089
I have a large data table (1 billion rows × 50 columns), similar to flights from library(nycflights13), where multiple columns can be combined to form a date-time. The code I am currently using to create this date-time column is:
library(data.table)
library(nycflights13)
library(fasttime)
flights <- as.data.table(flights)
flights[, DepDateTime := fastPOSIXct(
  paste0(year, "-",
         formatC(month, width = 2, format = "d", flag = "0"), "-",
         formatC(day, width = 2, format = "d", flag = "0"), " ",
         # replace e.g. 903 with '09:03:00'
         gsub("([0-9]{2})([0-9]{2})", "\\1:\\2:00",
              formatC(dep_time, width = 4, format = "d", flag = "0"))))]
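To make the transformation above concrete, the 903 example from the comment goes through the formatC padding and the gsub substitution like this:
# 903 is padded to "0903", then rewritten to "09:03:00",
# so fastPOSIXct ultimately receives e.g. "2013-01-01 09:03:00"
gsub("([0-9]{2})([0-9]{2})", "\\1:\\2:00",
     formatC(903L, width = 4, format = "d", flag = "0"))
# [1] "09:03:00"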
For the flights data, this takes around 0.6 s. Is there any way to improve this performance? I am interested in timing primarily; memory usage is a secondary concern.
Here is a candidate data table:
flights.big <- data.table(
  year   = sample(1980:2015, size = 1e9, replace = TRUE),
  month  = sample(1:12, size = 1e9, replace = TRUE),
  day    = sample(1:28, size = 1e9, replace = TRUE),
  hour   = sample(1:12, size = 1e9, replace = TRUE),
  minute = sample(0:59, size = 1e9, replace = TRUE)
)
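Note that flights.big has hour and minute columns rather than a dep_time; to run the expression above on it, one option (my assumption about the intended HHMM encoding) is to derive dep_time first:
# Assumption: encode dep_time in the same HHMM integer form as
# flights$dep_time, so hour 9 / minute 3 becomes 903
flights.big[, dep_time := hour * 100L + minute]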
Upvotes: 2
Views: 335
Reputation: 16089
A considerable speed increase was achieved by using a join and sprintf inside a function (create_DepDateTime, wrapped as create_fn in the benchmark below): the date-time is computed only for the unique (year, month, day, dep_time) combinations and then joined back onto the full table, so the string formatting and parsing run on far fewer rows. The increase is more modest for the smaller dataset:
library(data.table)
library(nycflights13)
library(fasttime)
library(microbenchmark)
library(ggplot2) # for autoplot
create_DepDateTime <- function(DT){
  setkey(DT, year, month, day, dep_time)
  # format and parse only the unique date/time combinations
  unique_dates <- unique(DT[, list(year, month, day, dep_time)])
  unique_dates[, DepDateTime := fastPOSIXct(
    sprintf("%d-%02d-%02d %s", year, month, day,
            # replace e.g. "0903" with "09:03:00"
            sub("([0-9]{2})([0-9]{2})", "\\1:\\2:00",
                sprintf("%04d", dep_time), perl = TRUE)),
    tz = "GMT")]
  # join the result back onto the full (keyed) table
  DT[unique_dates]
}
flights <- as.data.table(flights)
BENCHMARK <- function(){
  flights[, DepDateTime := fastPOSIXct(
    paste0(year, "-",
           formatC(month, width = 2, format = "d", flag = "0"), "-",
           formatC(day, width = 2, format = "d", flag = "0"), " ",
           # replace e.g. 903 with '09:03:00'
           gsub("([0-9]{2})([0-9]{2})", "\\1:\\2:00",
                formatC(dep_time, width = 4, format = "d", flag = "0"))))]
}
NGaffney_lubridate <- function(){
  flights[, DepDateTime := lubridate::ymd_hm(
    paste(year, month, day,
          stringr::str_pad(dep_time, width = 4, side = "left", pad = "0"),
          sep = "-"))]
}
create_fn <- function(){
flights <- create_DepDateTime(flights)
}
autoplot(
microbenchmark(
BENCHMARK(),
NGaffney_lubridate(),
create_fn(),
times=50L
)
)
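As a rough usage sketch on the large table (assuming flights.big has been given an integer dep_time column in HHMM form, as in the question), the join-based function can be timed directly:
# Sketch: run the join-based approach on the 1e9-row table and time it;
# fastPOSIXct/sprintf only touch the unique date/time combinations
system.time(flights.big <- create_DepDateTime(flights.big))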
Upvotes: 0
Reputation: 1532
I used lubridate and stringr to get around a 25% performance boost on the flights data. Unfortunately I'm not currently on a computer that can handle a data set as big as your full one, so hopefully it scales.
library(data.table)
library(nycflights13)
library(fasttime)
library(microbenchmark)
library(lubridate)
library(stringr)
flights <- as.data.table(flights)
op1 <- microbenchmark(
  flights[, DepDateTime := fastPOSIXct(
    paste0(year, "-",
           formatC(month, width = 2, format = "d", flag = "0"), "-",
           formatC(day, width = 2, format = "d", flag = "0"), " ",
           # replace e.g. 903 with '09:03:00'
           gsub("([0-9]{2})([0-9]{2})", "\\1:\\2:00",
                formatC(dep_time, width = 4, format = "d", flag = "0"))))],
  times = 50L)
op2 <- microbenchmark(
  flights[, DepDateTime := ymd_hm(
    paste(year, month, day,
          str_pad(dep_time, width = 4, side = "left", pad = "0"),
          sep = "-"))],
  times = 50L)
The benchmarks on my computer are:
> op1
       min       lq     mean   median       uq      max neval
  3.385542 3.526347 3.739545 3.679273 3.855418 4.594314    50
> op2
       min       lq     mean   median       uq      max neval
  2.536882 2.589711 2.733829 2.715038 2.835111 3.194575    50
Upvotes: 2