Reputation: 584
I want to use a regex to capture substrings. I already have a working solution, but I wonder whether there is a faster one. I am applying applyCaptureRegex
to a vector with about 400,000 entries.
# Four sample strings to parse, held in a one-column data frame.
exampleData <- as.data.frame(
  c(
    "[hg19:21:34809787-34809808:+]",
    "[hg19:11:105851118-105851139:+]",
    "[hg19:17:7482245-7482266:+]",
    "[hg19:6:19839915-19839936:+]"
  )
)
# Find every match of `captRegEx` in `str` and split each match into its
# capture groups.
#
# captRegEx: regular expression containing capture groups.
# str:       string to search (only the matches of its first element are used).
#
# Returns a list named by the matched substrings; each element is a character
# vector holding the full match followed by its capture groups.
captureRegex <- function(captRegEx, str) {
  hits <- regmatches(str, gregexpr(captRegEx, str))[[1]]
  groups <- lapply(hits, function(hit) {
    regmatches(hit, regexec(captRegEx, hit))[[1]]
  })
  # Name each entry after the substring it was extracted from.
  names(groups) <- hits
  groups
}
# Apply captureRegex() to the first column of `mir`, row by row, and stack
# every match into a character matrix.
#
# mir: data frame (or matrix) whose first column holds the strings to parse.
# r:   regular expression with capture groups.
#
# Returns a character matrix with one row per match: column 1 is the full
# match and the remaining columns are the capture groups. The column count is
# taken from the matches themselves instead of being fixed at 5, so patterns
# with any number of capture groups are laid out correctly. When nothing
# matches, an empty 0 x 0 character matrix is returned.
applyCaptureRegex <- function(mir, r) {
  per_row <- apply(mir, 1, function(x) captureRegex(r, x[1]))
  # Flatten one level: list of rows (each a list of matches) -> list of matches.
  matches <- unlist(per_row, recursive = FALSE)
  if (length(matches) == 0L) {
    return(matrix(character(0), nrow = 0L, ncol = 0L))
  }
  # Each match vector holds the full match plus one entry per capture group,
  # so the length of the first one gives the matrix width.
  matrix(
    unlist(matches, use.names = FALSE),
    ncol = length(matches[[1]]),
    byrow = TRUE
  )
}
Usage and results:
> captureRegex("\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]","[hg19:12:125627828-125627847:-]")
$`[hg19:12:125627828-125627847:-]`
[1] "[hg19:12:125627828-125627847:-]" "12" "125627828" "125627847" "-"
> applyCaptureRegex(exampleData,"\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]")
[,1] [,2] [,3] [,4] [,5]
[1,] "[hg19:21:34809787-34809808:+]" "21" "34809787" "34809808" "+"
[2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+"
[3,] "[hg19:17:7482245-7482266:+]" "17" "7482245" "7482266" "+"
[4,] "[hg19:6:19839915-19839936:+]" "6" "19839915" "19839936" "+"
Thank you!
Upvotes: 3
Views: 272
Reputation: 70732
Why reinvent the wheel? Several packages provide functions that return a character matrix with one column for each capturing group in your pattern.
stri_match_all_regex — stringi
# Sample input: four strings to parse, as a plain character vector.
x <- c(
  '[hg19:21:34809787-34809808:+]',
  '[hg19:11:105851118-105851139:+]',
  '[hg19:17:7482245-7482266:+]',
  '[hg19:6:19839915-19839936:+]'
)
# Row-bind the per-string results into a single matrix.
do.call(
  rbind,
  stri_match_all_regex(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
)
# [,1] [,2] [,3] [,4] [,5]
# [1,] "[hg19:21:34809787-34809808:+]" "21" "34809787" "34809808" "+"
# [2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+"
# [3,] "[hg19:17:7482245-7482266:+]" "17" "7482245" "7482266" "+"
# [4,] "[hg19:6:19839915-19839936:+]" "6" "19839915" "19839936" "+"
str_match — stringr
# stringr: str_match() is called once on the whole vector `x`.
str_match(
  x,
  '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'
)
strapplyc — gsubfn
# gsubfn: the pattern has an extra outer group around the whole expression;
# simplify = rbind combines the per-string results.
strapplyc(
  x,
  "(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])",
  simplify = rbind
)
Below is a benchmark comparison of all combined solutions.
# Benchmark input: the four sample strings repeated 1000 times (4000 strings).
x <- rep(
  c(
    '[hg19:21:34809787-34809808:+]',
    '[hg19:11:105851118-105851139:+]',
    '[hg19:17:7482245-7482266:+]',
    '[hg19:6:19839915-19839936:+]'
  ),
  times = 1000
)
# Extract the full match and capture groups of `r` from each element of `mir`
# and row-bind the results.
#
# mir: character vector (or list) of strings to parse.
# r:   regular expression with capture groups.
#
# Returns the row-bound per-element match vectors (full match first, then each
# capture group).
applyCaptureRegex <- function(mir, r) {
  match_groups <- function(s) {
    found <- regexec(r, s)
    regmatches(s, found)[[1]]
  }
  rows <- lapply(mir, match_groups)
  do.call(rbind, rows)
}
# Benchmark candidate: strapplyc() with the whole expression wrapped in an
# outer capture group, results combined with rbind.
gsubfn <- function(x1) {
  pattern <- '(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])'
  strapplyc(x1, pattern, simplify = rbind)
}
# Benchmark candidate: the regexec()/regmatches() based applyCaptureRegex().
regmtch <- function(x1) {
  pattern <- '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'
  applyCaptureRegex(x1, pattern)
}
# Benchmark candidate: str_match() applied to the whole vector.
stringr <- function(x1) {
  pattern <- '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'
  str_match(x1, pattern)
}
# Benchmark candidate: stri_match_all_regex(), with its per-string results
# row-bound into one matrix.
stringi <- function(x1) {
  pattern <- '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'
  per_string <- stri_match_all_regex(x1, pattern)
  do.call(rbind, per_string)
}
# Attach the packages the benchmark candidates call. library() stops with an
# error if a package is missing, whereas require() would only return FALSE and
# let a later call fail less clearly.
library(gsubfn)
library(stringr)
library(stringi)
library(microbenchmark)
# Time the four candidate implementations on the benchmark input `x`.
microbenchmark(gsubfn(x), regmtch(x), stringr(x), stringi(x))
Result
Unit: milliseconds
expr min lq mean median uq max neval
gsubfn(x) 372.27072 382.82179 391.21837 388.32396 396.27361 449.03091 100
regmtch(x) 394.03164 409.87523 419.42936 417.76770 427.08208 456.92460 100
stringr(x) 65.81644 70.28327 76.02298 75.43162 78.92567 116.18026 100
stringi(x) 15.88171 16.53047 17.52434 16.96127 17.76007 23.94449 100
Upvotes: 8