HCAI

Reputation: 2263

How to combine two factors to make filtering faster?

I have a data.frame of 1e8 rows which has a column of results that I would like to filter by the following two columns: Model and subModel. I would like to figure out how to combine Model and subModel to make the filtering of "results" as fast as possible.

Example data

df <- data.frame(results = runif(600),
                 Model = rep(c("BLO", "LR", "DG"), each = 200),
                 subModel = rep(rep(c("Fast", "VeryFast", "Slow", "VerySlow"), each = 5), 3))

Filtering

library(dplyr)

simple <- function(df){
  df %>%
    filter(Model == "BLO" & subModel == "VeryFast") %>%
    select(results)
}

Combining Model and subModel

df1 <- df %>%
  mutate(CombinedModelSubModel = paste0(Model, ".", subModel))

combined <- function(df1){
  df1 %>%
    filter(CombinedModelSubModel == "BLO.VeryFast") %>%
    select(results)
}

Quick benchmark for only 600 rows:

> microbenchmark::microbenchmark(
+     simple(df),combined(df1),times=1000)
Unit: milliseconds
          expr      min       lq     mean   median       uq      max neval
    simple(df) 1.637144 2.099333 2.739295 2.414313 2.884119 13.09450  1000
 combined(df1) 1.600907 2.066368 2.771701 2.404103 2.976401 11.33212  1000

There's no real difference, but is there a better option?

Upvotes: 1

Views: 269

Answers (1)

David

Reputation: 10192

If you already have a database such as SQLite, you could also use SQL.

You can use, for example, the RSQLite package.

But if speed is your main goal, always create a benchmark, so here goes...

I am comparing base R, dplyr, data.table, SQLite, and dbplyr (which lets you write dplyr code against your database).

If you want the performance of data.table but prefer to write dplyr code, check out dtplyr.
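A minimal dtplyr sketch (not part of the benchmark below, so treat its relative speed as an assumption; the translation step itself adds a little overhead):

library(dtplyr)
library(dplyr)

# lazy_dt() wraps the data.frame; the dplyr verbs are translated to data.table
# code and only executed once the result is collected
df %>%
  lazy_dt() %>%
  filter(Model == "BLO" & subModel == "VeryFast") %>%
  select(results) %>%
  collect()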

Benchmark

library(tidyverse)
library(data.table)
library(microbenchmark)
library(DBI)
library(dbplyr)

small dataset

#################################################
### Small Data

# create data
set.seed(15412)
df <- data.frame(
  results = runif(600),
  Model = rep(c("BLO", "LR", "DG"), each = 200),
  subModel = rep(rep(c("Fast", "VeryFast", "Slow", "VerySlow"), each = 5), 3)
)
# create data copies in correct formats
dti <- as_tibble(df)
dt <- data.table(df)
db_con <- dbConnect(RSQLite::SQLite(), "small.sqlite")
dbWriteTable(db_con, "small", df, overwrite = TRUE)
tbl <- tbl(db_con, "small")

# microbenchmark the data
microbenchmark(
  base = {
    idx <- df$Model == "BLO" & df$subModel == "VeryFast"
    data.frame(results = df[idx, "results"])
  },
  dplyr = {
    dti %>%
      filter(Model == "BLO" & subModel == "VeryFast") %>%
      select(results)
  },
  data.table = {
    dt[Model == "BLO" & subModel == "VeryFast", .(results)]
  },
  SQLite = {
    dbGetQuery(db_con, "SELECT results FROM small WHERE Model = 'BLO' AND subModel = 'VeryFast'")
  },
  dbplyr = {
    tbl %>% 
      filter(Model == "BLO" & subModel == "VeryFast") %>%
      select(results) %>% 
      collect()
  }
)
# Unit: microseconds
# expr            min       lq      mean   median       uq      max neval cld
# base          117.0   180.85   318.689   203.55   292.35   2049.6   100  a 
# dplyr        1990.6  2505.20  4612.249  3036.30  5441.35  26954.4   100  a 
# data.table   1499.6  1811.95  4314.757  2059.70  3933.65 104310.4   100  a 
# SQLite        456.4   575.50  1105.427   745.60  1026.25   6610.4   100  a 
# dbplyr      17229.9 18604.15 32387.380 19980.60 36810.35 186945.5   100   b

We see that for a small dataset, base R can be the fastest (which kind of surprised me, but well, here is the benchmark...), followed by SQLite.

larger dataset


###############################
### Larger Data

# create data = upsample smaller dataset
set.seed(1231)
N <- 1e7
idx <- sample.int(nrow(df), N, replace = TRUE)
df_large <- df[idx, ]

# create data copies in correct formats
dti_large <- as_tibble(df_large)
dt_large <- data.table(df_large)
db_con_large <- dbConnect(RSQLite::SQLite(), "large.sqlite")
dbWriteTable(db_con_large, "large", df_large, overwrite = TRUE)
tbl_large <- tbl(db_con_large, "large")

# start microbenchmark
microbenchmark(
  base = {
    idx <- df_large$Model == "BLO" & df_large$subModel == "VeryFast"
    data.frame(results = df_large[idx, "results"])
  },
  dplyr = {
    dti_large %>%
      filter(Model == "BLO" & subModel == "VeryFast") %>%
      select(results)
  },
  data.table = {
    dt_large[Model == "BLO" & subModel == "VeryFast", .(results)]
  },
  SQLite = {
    dbGetQuery(db_con_large, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")
  },
  dbplyr = {
    tbl_large %>% 
      filter(Model == "BLO" & subModel == "VeryFast") %>%
      select(results) %>% 
      collect()
  }
)

# Unit: milliseconds
# expr             min        lq       mean    median        uq       max neval  cld
# base        177.9852  205.2101  503.28675  247.8158  806.4475 2050.5940   100  b  
# dplyr       249.3802  272.4234  652.64045  608.5260  874.4070 3256.8231   100   c 
# data.table   13.8028   15.2065   43.57601   16.7459   21.1673  596.9686   100 a   
# SQLite     1004.3114 1085.9076 1256.08739 1153.1701 1295.7749 2944.5220   100    d
# dbplyr     1039.4028 1115.9282 1286.90739 1191.5628 1308.8002 4903.7834   100    d

Clear winner: data.table followed by base and dplyr.
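If you filter on the same columns many times, keying the data.table should make it even faster, because keyed lookups use a binary search instead of a vector scan. A small sketch I did not include in the benchmark above:

# key once (this sorts the table by Model and subModel in place)
setkey(dt_large, Model, subModel)

# keyed lookup via binary search
dt_large[.("BLO", "VeryFast"), .(results)]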

Edit

Comparing against data that is already loaded into a data.table is probably not entirely fair. The following benchmark is a bit fairer, as it includes the time to load the data from SQL into R in the data.table timings:

## Fair comparison
microbenchmark(
  data.table_load = {
    dd <- dbGetQuery(db_con_large, "SELECT results, Model, subModel FROM large;")
    setDT(dd)
    dd[Model == "BLO" & subModel == "VeryFast", .(results)]
  },
  SQLite = {
    dbGetQuery(db_con_large, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")
  }
)
# Unit: seconds
# expr            min       lq     mean   median       uq       max neval cld
# data.table 4.976999 5.640749 6.713015 6.471158 7.583160 10.595039   100   b
# SQLite     1.009312 1.059838 1.250528 1.128358 1.269739  3.188415   100  a 

So depending on your needs (read the data once, filter many times => data.table; read the data once, filter once => SQL), you might want to choose SQL after all.
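The SQL-side equivalent for the "filter many times" case is an index on the two filter columns; this is not included in the timings above, so the payoff here is an assumption:

# one-time cost: build an index on the filter columns
dbExecute(db_con_large, "CREATE INDEX IF NOT EXISTS idx_model_submodel ON large (Model, subModel)")

# later filters can use the index instead of a full table scan
dbGetQuery(db_con_large, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")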

Edit 2: duckdb, integer values, and filesize

As promised, here is a quick comparison of using SQLite vs duckdb, as well as a comparison between character and integer lookups.

Setup Data

library(duckdb)

df_large_int <- df_large
df_large_int$Model <- factor(df_large_int$Model)
(model_levels <- levels(df_large_int$Model))
# [1] "BLO" "DG"  "LR"
summary(df_large_int$Model)
#     BLO      DG      LR 
# 3331088 3335538 3333374
df_large_int$Model <- as.integer(df_large_int$Model)
summary(df_large_int$Model)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#      1       1       2       2       3       3 

# replace subModel
df_large_int$subModel <- factor(df_large_int$subModel)
(sub_model_levels <- levels(df_large_int$subModel))
# [1] "Fast"     "Slow"     "VeryFast" "VerySlow"
df_large_int$subModel <- as.integer(df_large_int$subModel)

# write large_int to sqlite
db_con_large_int <- dbConnect(RSQLite::SQLite(), "large_int.sqlite")
dbWriteTable(db_con_large_int, "large_int", df_large_int, overwrite = TRUE)

# write large table to duckdb
db_con_duck <- dbConnect(duckdb::duckdb(), dbdir = "large.duckdb", readonly = FALSE)
dbWriteTable(db_con_duck, "large", df_large, overwrite = TRUE)

# write large int table to duckdb
db_con_duck_int <- dbConnect(duckdb::duckdb(), dbdir = "large_int.duckdb", readonly = FALSE)
dbWriteTable(db_con_duck_int, "large_int", df_large_int, overwrite = TRUE)

Actual Benchmark

microbenchmark(
  sqlite_char = {
    dbGetQuery(db_con_large, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")
  },
  sqlite_int = {
    dbGetQuery(db_con_large_int, "SELECT results FROM large_int WHERE Model = 1 AND subModel = 3")
  },
  duckdb_char = {
    dbGetQuery(db_con_duck, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")
  },
  duckdb_int = {
    dbGetQuery(db_con_duck_int, "SELECT results FROM large_int WHERE Model = 1 AND subModel = 3")
  }
)
# Unit: milliseconds
#        expr      min        lq      mean     median         uq       max neval  cld
# sqlite_char 947.4932 985.96300 1031.0347 1006.05580 1024.39820 1460.3388   100    d
# sqlite_int  722.4098 755.60875  804.6377  773.17705  806.51095 1290.4743   100   c 
# duckdb_char 485.6975 501.28400  531.7413  513.73495  529.36840 1119.0317   100  b  
# duckdb_int   44.4940  46.66485   70.5380   48.07785   54.06795  643.6255   100 a   
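A practical note on the integer lookups: hard-coding Model = 1 and subModel = 3 is brittle, so I would build the query from the saved factor levels instead (a small sketch using model_levels and sub_model_levels from the setup above):

# translate the labels into the stored integer codes
model_code <- which(model_levels == "BLO")               # 1
sub_model_code <- which(sub_model_levels == "VeryFast")  # 3

dbGetQuery(
  db_con_duck_int,
  sprintf("SELECT results FROM large_int WHERE Model = %d AND subModel = %d",
          model_code, sub_model_code)
)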

Filesize

# Check Database sizes
dbDisconnect(db_con_large)
dbDisconnect(db_con_large_int)
dbDisconnect(db_con_duck)
dbDisconnect(db_con_duck_int)

# Check Filesizes
db_files <- list.files(pattern = "(sqlite|duckdb|duckdb.wal)$")
file.size(db_files) %>% prettyunits::pretty_bytes() %>% set_names(db_files)
# large.duckdb     large.duckdb.wal         large.sqlite     large_int.duckdb 
#  " 12.29 kB"          "243.70 MB"          "273.37 MB"          " 12.29 kB" 
# large_int.duckdb.wal     large_int.sqlite         small.sqlite 
#          "160.37 MB"          "256.53 MB"          " 24.58 kB"

To conclude: duckdb is around 2x faster than SQLite on this dataset. As expected, integer filtering is ~10x faster than character filtering (in duckdb at least). duckdb also saves some space! Good to know for future projects!

Upvotes: 2
