Reputation: 2263
I have a data.frame of 1e8 rows which has a column of results that I would like to filter by the following two columns: Model and subModel. I would like to figure out how to join Model and subModel so that the filtering of "results" is as fast as possible.
library(dplyr)
df <- data.frame(
  results = runif(600),
  Model = rep(c("BLO", "LR", "DG"), each = 200),
  subModel = rep(rep(c("Fast", "VeryFast", "Slow", "VerySlow"), each = 5), 3)
)
# option 1: filter on the two separate columns
simple <- function(df) {
  df %>%
    filter(Model == "BLO" & subModel == "VeryFast") %>%
    select(results)
}
# option 2: pre-combine the two columns and filter on the combined one
df1 <- df %>%
  mutate(CombinedModelSubModel = paste0(Model, ".", subModel))
combined <- function(df1) {
  df1 %>%
    filter(CombinedModelSubModel == "BLO.VeryFast") %>%
    select(results)
}
Quick benchmark for only 600 rows:
> microbenchmark::microbenchmark(
+ simple(df),combined(df1),times=1000)
Unit: milliseconds
expr min lq mean median uq max neval
simple(df) 1.637144 2.099333 2.739295 2.414313 2.884119 13.09450 1000
combined(df1) 1.600907 2.066368 2.771701 2.404103 2.976401 11.33212 1000
There's no real difference, but is there a better option?
Upvotes: 1
Views: 269
Reputation: 10192
If you already have a database such as SQLite, you could also use SQL, for example via the RSQLite package.
But if speed is your main goal, always create a benchmark, so here goes... I am comparing base R, dplyr, data.table, SQLite, and dbplyr (which lets you write dplyr code against your database). If you want the performance of data.table but prefer writing dplyr code, check out dtplyr; a minimal sketch is below.
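A minimal, untimed dtplyr sketch, using the small df from the question (dtplyr translates the dplyr verbs into data.table code under the hood):
library(dtplyr)
library(dplyr)
library(data.table)
# wrap the data in a lazy data.table; dplyr verbs get translated to data.table code
dt_lazy <- lazy_dt(data.table(df))
dt_lazy %>%
  filter(Model == "BLO" & subModel == "VeryFast") %>%
  select(results) %>%
  as_tibble()  # forces the computation and returns a regular tibble
Now to the benchmark.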
library(tidyverse)
library(data.table)
library(microbenchmark)
library(DBI)
library(dbplyr)
#################################################
### Small Data
# create data
set.seed(15412)
df <- data.frame(
results = runif(600),
Model = rep(c("BLO", "LR", "DG"), each = 200),
subModel = rep(rep(c("Fast", "VeryFast", "Slow", "VerySlow"), each = 5), 3)
)
# create data copies in correct formats
dti <- as_tibble(df)
dt <- data.table(df)
db_con <- dbConnect(RSQLite::SQLite(), "small.sqlite")
dbWriteTable(db_con, "small", df, overwrite = TRUE)
tbl <- tbl(db_con, "small")
# microbenchmark the data
microbenchmark(
base = {
idx <- df$Model == "BLO" & df$subModel == "VeryFast"
data.frame(results = df[idx, "results"])
},
dplyr = {
dti %>%
filter(Model == "BLO" & subModel == "VeryFast") %>%
select(results)
},
data.table = {
dt[Model == "BLO" & subModel == "VeryFast", .(results)]
},
SQLite = {
dbGetQuery(db_con, "SELECT results FROM small WHERE Model = 'BLO' AND subModel = 'VeryFast'")
},
dbplyr = {
tbl %>%
filter(Model == "BLO" & subModel == "VeryFast") %>%
select(results) %>%
collect()
}
)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# base 117.0 180.85 318.689 203.55 292.35 2049.6 100 a
# dplyr 1990.6 2505.20 4612.249 3036.30 5441.35 26954.4 100 a
# data.table 1499.6 1811.95 4314.757 2059.70 3933.65 104310.4 100 a
# SQLite 456.4 575.50 1105.427 745.60 1026.25 6610.4 100 a
# dbplyr 17229.9 18604.15 32387.380 19980.60 36810.35 186945.5 100 b
We see that for a small dataset, base R can be the fastest (which kind of surprised me, but well, here is the benchmark...), followed by SQLite.
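As a side note (not timed above): if all you need is the vector of results rather than a data.frame, the base approach can be trimmed down even further.
# plain vector extraction, skipping the data.frame() wrapper
df$results[df$Model == "BLO" & df$subModel == "VeryFast"]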
###############################
### Larger Data
# create data = upsample smaller dataset
set.seed(1231)
N <- 1e7
idx <- sample.int(nrow(df), N, replace = TRUE)
df_large <- df[idx, ]
# create data copies in correct formats
dti_large <- as_tibble(df_large)
dt_large <- data.table(df_large)
db_con_large <- dbConnect(RSQLite::SQLite(), "large.sqlite")
dbWriteTable(db_con_large, "large", df_large, overwrite = TRUE)
tbl_large <- tbl(db_con_large, "large")
# start microbenchmark
microbenchmark(
base = {
idx <- df_large$Model == "BLO" & df_large$subModel == "VeryFast"
data.frame(results = df_large[idx, "results"])
},
dplyr = {
dti_large %>%
filter(Model == "BLO" & subModel == "VeryFast") %>%
select(results)
},
data.table = {
dt_large[Model == "BLO" & subModel == "VeryFast", .(results)]
},
SQLite = {
dbGetQuery(db_con_large, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")
},
dbplyr = {
tbl_large %>%
filter(Model == "BLO" & subModel == "VeryFast") %>%
select(results) %>%
collect()
}
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# base 177.9852 205.2101 503.28675 247.8158 806.4475 2050.5940 100 b
# dplyr 249.3802 272.4234 652.64045 608.5260 874.4070 3256.8231 100 c
# data.table 13.8028 15.2065 43.57601 16.7459 21.1673 596.9686 100 a
# SQLite 1004.3114 1085.9076 1256.08739 1153.1701 1295.7749 2944.5220 100 d
# dbplyr 1039.4028 1115.9282 1286.90739 1191.5628 1308.8002 4903.7834 100 d
Clear winner: data.table, followed by base and dplyr.
To be fair, comparing against data that already sits in a data.table is probably not a fair fight for the databases. The following benchmark is a bit fairer, as it includes the time to load the data from SQL into R in the data.table timings:
## Fair comparison
microbenchmark(
data.table = {
dd <- dbGetQuery(db_con_large, "SELECT results, Model, subModel FROM large;")
setDT(dd)
dd[Model == "BLO" & subModel == "VeryFast", .(results)]
},
SQLite = {
dbGetQuery(db_con_large, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")
}
)
# Unit: seconds
# expr min lq mean median uq max neval cld
# data.table 4.976999 5.640749 6.713015 6.471158 7.583160 10.595039 100 b
# SQLite 1.009312 1.059838 1.250528 1.128358 1.269739 3.188415 100 a
So depending on your needs (read data once, filter many times => data.table; read data once, filter once => SQL), you might want to choose SQL after all.
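If you are in the "filter many times" camp, both tools can be sped up further with a one-time setup step: data.table can filter via binary search once a key is set, and SQLite can use an index on the filter columns. A rough, untimed sketch (the benchmarks and file sizes below do not include this):
# data.table: set a key once (sorts the table), then filter via binary search
setkey(dt_large, Model, subModel)
dt_large[.("BLO", "VeryFast"), .(results)]
# SQLite: create an index on the filter columns once, then the same query can use it
dbExecute(db_con_large, "CREATE INDEX IF NOT EXISTS idx_model_submodel ON large (Model, subModel)")
dbGetQuery(db_con_large, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")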
As promised, here is a quick comparison of using SQLite vs duckdb, as well as a comparison between character and integer lookups.
library(duckdb)
df_large_int <- df_large
df_large_int$Model <- factor(df_large_int$Model)
(model_levels <- levels(df_large_int$Model))
# [1] "BLO" "DG" "LR"
summary(df_large_int$Model)
# BLO DG LR
# 3331088 3335538 3333374
df_large_int$Model <- as.integer(df_large_int$Model)
summary(df_large_int$Model)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1 1 2 2 3 3
# replace subModel
df_large_int$subModel <- factor(df_large_int$subModel)
(sub_model_levels <- levels(df_large_int$subModel))
# [1] "Fast" "Slow" "VeryFast" "VerySlow"
df_large_int$subModel <- as.integer(df_large_int$subModel)
# write large_int to sqlite
db_con_large_int <- dbConnect(RSQLite::SQLite(), "large_int.sqlite")
dbWriteTable(db_con_large_int, "large_int", df_large_int, overwrite = TRUE)
# write large table to duckdb
db_con_duck <- dbConnect(duckdb::duckdb(), dbdir = "large.duckdb", readonly = FALSE)
dbWriteTable(db_con_duck, "large", df_large, overwrite = TRUE)
# write large int table to duckdb
db_con_duck_int <- dbConnect(duckdb::duckdb(), dbdir = "large_int.duckdb", readonly = FALSE)
dbWriteTable(db_con_duck_int, "large_int", df_large_int, overwrite = TRUE)
microbenchmark(
sqlite_char = {
dbGetQuery(db_con_large, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")
},
sqlite_int = {
dbGetQuery(db_con_large_int, "SELECT results FROM large_int WHERE Model = 1 AND subModel = 3")
},
duckdb_char = {
dbGetQuery(db_con_duck, "SELECT results FROM large WHERE Model = 'BLO' AND subModel = 'VeryFast'")
},
duckdb_int = {
dbGetQuery(db_con_duck_int, "SELECT results FROM large_int WHERE Model = 1 AND subModel = 3")
}
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# sqlite_char 947.4932 985.96300 1031.0347 1006.05580 1024.39820 1460.3388 100 d
# sqlite_int 722.4098 755.60875 804.6377 773.17705 806.51095 1290.4743 100 c
# duckdb_char 485.6975 501.28400 531.7413 513.73495 529.36840 1119.0317 100 b
# duckdb_int 44.4940 46.66485 70.5380 48.07785 54.06795 643.6255 100 a
# Disconnect from the databases before checking file sizes
dbDisconnect(db_con_large)
dbDisconnect(db_con_large_int)
dbDisconnect(db_con_duck)
dbDisconnect(db_con_duck_int)
# Check Filesizes
db_files <- list.files(pattern = "(sqlite|duckdb|duckdb.wal)$")
file.size(db_files) %>% prettyunits::pretty_bytes() %>% set_names(db_files)
# large.duckdb large.duckdb.wal large.sqlite large_int.duckdb
# " 12.29 kB" "243.70 MB" "273.37 MB" " 12.29 kB"
# large_int.duckdb.wal large_int.sqlite small.sqlite
# "160.37 MB" "256.53 MB" " 24.58 kB"
To conclude: duckdb is around 2x faster than SQLite on this dataset. As expected, integer filtering is ~10x faster than character filtering (in duckdb at least). duckdb also saves some space! Good to know for future projects!
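One practical note on the integer route: you do not have to hard-code the codes 1 and 3. The factor levels saved above (model_levels, sub_model_levels) let you translate the character labels into the integer codes at query time, for example (while the connections are still open):
# look up the integer codes for the labels we want to filter on
model_code <- match("BLO", model_levels)               # 1
sub_model_code <- match("VeryFast", sub_model_levels)  # 3
# parameterised query against the integer duckdb table
dbGetQuery(
  db_con_duck_int,
  "SELECT results FROM large_int WHERE Model = ? AND subModel = ?",
  params = list(model_code, sub_model_code)
)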
Upvotes: 2