Faster way to capture regex

Question

I want to use regex to capture substrings - I already have a working solution, but I wonder if there is a faster solution. I am applying applyCaptureRegex on a vector with about 400.000 entries.

 # Example input: a one-column data frame holding four bracketed identifier strings.
 exampleData <- as.data.frame(c("[hg19:21:34809787-34809808:+]","[hg19:11:105851118-105851139:+]","[hg19:17:7482245-7482266:+]","[hg19:6:19839915-19839936:+]"))

# Extract the capture groups of every match of `captRegEx` found in the
# first element of `str`.
#
# captRegEx: regular expression containing capture groups.
# str:       character input; only its first element is searched.
#
# Returns the result of sapply() over the matched substrings: for each
# match, the full match followed by its capture groups, keyed by the
# matched text.
captureRegex <- function(captRegEx, str) {
  # Step 1: locate every complete match in the first input string.
  whole_matches <- regmatches(str, gregexpr(captRegEx, str))[[1]]

  # Step 2: re-match each hit with regexec() to expose its capture groups.
  extract_groups <- function(hit) {
    regmatches(hit, regexec(captRegEx, hit))
  }

  sapply(whole_matches, extract_groups)
}

# Run captureRegex() on the first column of every row of `mir` and arrange
# the extracted pieces in a character matrix.
#
# mir: data frame (or matrix) whose first column holds the strings to parse.
# r:   regular expression passed on to captureRegex().
#
# Returns a matrix with 5 columns filled row by row, so the pattern is
# expected to yield five values (full match + four groups) per match.
applyCaptureRegex <- function(mir, r) {
  per_row <- apply(mir, 1, function(row) captureRegex(r, row[1]))
  matrix(unlist(per_row), ncol = 5, byrow = TRUE)
}

Usage and results:

> captureRegex("\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]","[hg19:12:125627828-125627847:-]")
$`[hg19:12:125627828-125627847:-]`
[1] "[hg19:12:125627828-125627847:-]" "12" "125627828" "125627847" "-"   

> applyCaptureRegex(exampleData,"\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]")
     [,1]                              [,2] [,3]        [,4]        [,5]
[1,] "[hg19:21:34809787-34809808:+]"   "21" "34809787"  "34809808"  "+" 
[2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+" 
[3,] "[hg19:17:7482245-7482266:+]"     "17" "7482245"   "7482266"   "+" 
[4,] "[hg19:6:19839915-19839936:+]"    "6"  "19839915"  "19839936"  "+"

Thank you!

hwnd · Accepted Answer

Why reinvent the wheel? You have several library packages to choose from with functions that return a character matrix with one column for each capturing group in your pattern.

stri_match_all_regex — stringi

library(stringi)

# Sample identifiers to parse.
x <- c(
  "[hg19:21:34809787-34809808:+]",
  "[hg19:11:105851118-105851139:+]",
  "[hg19:17:7482245-7482266:+]",
  "[hg19:6:19839915-19839936:+]"
)

# Backslashes must be doubled inside an R string literal: "\\[" and "\\d"
# reach the regex engine as \[ and \d. The per-string results are stacked
# into a single character matrix with rbind().
do.call(
  rbind,
  stri_match_all_regex(x, "\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]")
)
#      [,1]                              [,2] [,3]        [,4]        [,5]
# [1,] "[hg19:21:34809787-34809808:+]"   "21" "34809787"  "34809808"  "+" 
# [2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+" 
# [3,] "[hg19:17:7482245-7482266:+]"     "17" "7482245"   "7482266"   "+" 
# [4,] "[hg19:6:19839915-19839936:+]"    "6"  "19839915"  "19839936"  "+"

str_match — stringr

library(stringr)

# Backslashes are doubled because the pattern is written as an R string
# literal ("\\d" is the regex \d). `x` is the character vector of
# identifiers defined in the stringi example above.
str_match(x, "\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]")

strapplyc — gsubfn

library(gsubfn)

# Backslashes are doubled because the pattern is written as an R string
# literal. The extra outer group wraps the whole pattern, presumably so the
# full match is returned alongside the four inner groups (confirm against
# the gsubfn documentation). simplify = rbind stacks the per-string results
# into one matrix.
strapplyc(x, "(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])", simplify = rbind)

Below is a benchmark comparison of all combined solutions.

# Benchmark input: the four sample identifiers repeated 1000 times
# (4000 strings in total).
x <- rep(
  c(
    "[hg19:21:34809787-34809808:+]",
    "[hg19:11:105851118-105851139:+]",
    "[hg19:17:7482245-7482266:+]",
    "[hg19:6:19839915-19839936:+]"
  ),
  times = 1000
)

# Base-R variant used in the benchmark: match each element of `mir`
# separately with regexec() and stack the results row-wise.
#
# mir: character vector of strings to parse.
# r:   regular expression with capture groups.
#
# Returns the rbind() of the per-element results, i.e. one row per matching
# element holding the full match followed by its capture groups.
applyCaptureRegex <- function(mir, r) {
  extract_one <- function(entry) {
    hit <- regexec(r, entry)
    regmatches(entry, hit)[[1]]
  }
  rows <- lapply(mir, extract_one)
  do.call(rbind, rows)
}

# One thin wrapper per approach so that microbenchmark() can time them on the
# same input. Each takes a character vector `x1`; the gsubfn, stringr and
# stringi packages must already be attached, and regmtch() relies on the
# applyCaptureRegex() helper defined above.
# Backslashes are doubled because the patterns are R string literals
# ("\\[" and "\\d" are the regex tokens \[ and \d).
gsubfn <- function(x1) {
  strapplyc(x1, "(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])", simplify = rbind)
}
regmtch <- function(x1) {
  applyCaptureRegex(x1, "\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]")
}
stringr <- function(x1) {
  str_match(x1, "\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]")
}
stringi <- function(x1) {
  do.call(
    rbind,
    stri_match_all_regex(x1, "\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]")
  )
}

# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and would let the next line fail less clearly.
library(microbenchmark)

# Time the four wrappers on the same input vector `x`.
microbenchmark(gsubfn(x), regmtch(x), stringr(x), stringi(x))

Result

Unit: milliseconds
       expr       min        lq      mean    median        uq       max neval
  gsubfn(x) 372.27072 382.82179 391.21837 388.32396 396.27361 449.03091   100
 regmtch(x) 394.03164 409.87523 419.42936 417.76770 427.08208 456.92460   100
 stringr(x)  65.81644  70.28327  76.02298  75.43162  78.92567 116.18026   100
 stringi(x)  15.88171  16.53047  17.52434  16.96127  17.76007  23.94449   100

Faster way to capture regex

Answers (1)

Related Questions