Henry David Thorough
Henry David Thorough

Reputation: 900

Segment vector according to whether or not values are above a threshold in R

I have a long vector and I need to divide it into segments according to a threshold. A segment is consecutive values over the threshold. When values drop below the threshold, the segment ends and the next segment begins where the values once again cross above the threshold. I need to record the start and end indices of each segment.

Below is an inefficient implementation. What's the fastest and most appropriate way to write this? This is pretty ugly, I have to assume that there's a cleaner implementation.

set.seed(10)
test.vec <- rnorm(100, 8, 10)
threshold <- 0
segments <- list()
in.segment <- FALSE
for(i in 1:length(test.vec)){

    # If we're in a segment
    if(in.segment){
        if(test.vec[i] > threshold){
            next
        }else{
            end.ind <- i - 1
            in.segment <- FALSE
            segments[[length(segments) + 1]] <- c(start.ind, end.ind)
        }
    }

    # if not in segment
    else{
        if(test.vec[i] > threshold){        
            start.ind <- i
            in.segment <- TRUE
        }
    }
}

EDIT: Runtime of all solutions

Thanks for all the replies, this has been helpful and very instructive. A small test of all five solutions is below (the four provided plus the original example). As you can see, all four are a huge improvement over the original solution, but Khashaa's solution is by far the fastest.

set.seed(1)
test.vec <- rnorm(1e6, 8, 10);threshold <- 0

originalFunction <- function(x, threshold){
    segments <- list()
    in.segment <- FALSE
    for(i in 1:length(test.vec)){

    # If we're in a segment
        if(in.segment){
            if(test.vec[i] > threshold){
                next
            }else{
                end.ind <- i - 1
                in.segment <- FALSE
                segments[[length(segments) + 1]] <- c(start.ind, end.ind)
            }
        }

    # if not in segment
        else{
            if(test.vec[i] > threshold){        
                start.ind <- i
                in.segment <- TRUE
            }
        }
    }
    segments
}

SimonG <- function(x, threshold){

  hit <- which(x > threshold)
  n <- length(hit)

  ind <- which(hit[-1] - hit[-n] > 1)

  starts <- c(hit[1], hit[ ind+1 ])
  ends <- c(hit[ ind ], hit[n])

  cbind(starts,ends)
}

Rcpp::cppFunction('DataFrame Khashaa(NumericVector x, double threshold) {
  x.push_back(-1);
  int n = x.size(), startind, endind; 
  std::vector<int> startinds, endinds;
  bool insegment = false;
  for(int i=0; i<n; i++){
    if(!insegment){
      if(x[i] > threshold){        
        startind = i + 1;
        insegment = true;          }
    }else{
      if(x[i] < threshold){
        endind = i;
        insegment = false;
        startinds.push_back(startind); 
        endinds.push_back(endind);
      }
    }
  }
  return DataFrame::create(_["start"]= startinds, _["end"]= endinds);
}')

bgoldst <- function(x, threshold){
    with(rle(x>threshold),
         t(matrix(c(0L,rep(cumsum(lengths),2L)[-length(lengths)]),2L,byrow=T)+1:0)[values,])   
}

ClausWilke <- function(x, threshold){
    suppressMessages(require(dplyr, quietly = TRUE))
    in.segment <- (x > threshold)
    start <- which(c(FALSE, in.segment) == TRUE & lag(c(FALSE, in.segment) == FALSE)) - 1
    end <- which(c(in.segment, FALSE) == TRUE & lead(c(in.segment, FALSE) == FALSE))
    data.frame(start, end)    
}

system.time({ originalFunction(test.vec, threshold); })
 ## user  system elapsed 
 ## 66.539   1.232  67.770 
system.time({ SimonG(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.028   0.008   0.036 
system.time({ Khashaa(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.008   0.000   0.008 
system.time({ bgoldst(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.065   0.000   0.065 
system.time({ ClausWilke(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.274   0.012   0.285 

Upvotes: 8

Views: 1946

Answers (4)

Khashaa
Khashaa

Reputation: 7373

I like for loops for translation to Rcpp is straightforward.

Rcpp::cppFunction('DataFrame findSegment(NumericVector x, double threshold) {
  x.push_back(-1);
  int n = x.size(), startind, endind; 
  std::vector<int> startinds, endinds;
  bool insegment = false;
  for(int i=0; i<n; i++){
    if(!insegment){
      if(x[i] > threshold){        
        startind = i + 1;
        insegment = true;          }
    }else{
      if(x[i] < threshold){
        endind = i;
        insegment = false;
        startinds.push_back(startind); 
        endinds.push_back(endind);
      }
    }
  }
  return DataFrame::create(_["start"]= startinds, _["end"]= endinds);
}')
set.seed(1); test.vec <- rnorm(1e7,8,10); threshold <- 0;
system.time(findSegment(test.vec, threshold))

#   user  system elapsed 
#  0.045   0.000   0.045 

# @SimonG's solution
system.time(findSegments(test.vec, threshold))
#   user  system elapsed 
#  0.533   0.012   0.548 

Upvotes: 5

Claus Wilke
Claus Wilke

Reputation: 17820

Here is another solution that I think is simpler. Note that you have to use set.seed(10), not set.seed <- 10, to set the seed of the random number generator.

require(dplyr) # for lead() and lag()

set.seed(10)
test.vec <- rnorm(100, 8, 10)
threshold <- 0

in.segment <- (test.vec > threshold)
start <- which(c(FALSE, in.segment) == TRUE & lag(c(FALSE, in.segment) == FALSE)) - 1
end <- which(c(in.segment, FALSE) == TRUE & lead(c(in.segment, FALSE) == FALSE))
segments <- data.frame(start, end)

head(segments)
##   start end
## 1     1   2
## 2     4   6
## 3     8   8
## 4    10  16
## 5    18  21
## 6    23  23

In general, in R, if you find yourself writing complicated loops and if statements you're probably doing it wrong. Most problems can be solved in a vectorized form.

Upvotes: 3

SimonG
SimonG

Reputation: 4881

Here's another option, mostly using which. The start and end points are determined by finding the non-consecutive elements of the hit sequence.

test.vec <- rnorm(100, 8, 10)
threshold <- 0


findSegments <- function(x, threshold){

  hit <- which(x > threshold)
  n <- length(hit)

  ind <- which(hit[-1] - hit[-n] > 1)

  starts <- c(hit[1], hit[ ind+1 ])
  ends <- c(hit[ ind ], hit[n])

  cbind(starts,ends)

}

findSegments(test.vec, threshold=0)

This gives something like:

> findSegments(test.vec, threshold=0)
      starts ends
 [1,]      1    3
 [2,]      5    7
 [3,]      9   11
 [4,]     13   28
 [5,]     30   30
 [6,]     32   32
 [7,]     34   36
 [8,]     38   39
 [9,]     41   41
[10,]     43   43
[11,]     46   51
[12,]     54   54
[13,]     56   61
[14,]     63   67
[15,]     69   72
[16,]     76   77
[17,]     80   81
[18,]     83   84
[19,]     86   88
[20,]     90   92
[21,]     94   95
[22,]     97   97
[23,]    100  100

Compare that to the original sequence:

> round(test.vec,1)
  [1]  20.7  15.7   4.3 -15.1  24.6   9.4  23.2  -4.5  16.9  20.9  13.2  -1.2
 [13]  22.6   7.7   6.0   6.6   4.1  21.3   5.3  16.7  11.4  16.7  19.6  16.7
 [25]  11.6   7.3   3.7   8.4  -4.5  11.7  -7.1   8.4 -18.5  12.8  22.5  11.0
 [37]  -3.3  11.1   6.9  -7.9  22.9  -3.7   3.5  -7.1  -5.9   3.5  13.2  20.0
 [49]  13.2  23.4  15.9  -5.0  -6.3  10.0  -6.2   4.7   2.1  26.4   5.9  27.3
 [61]  14.3 -12.4  28.4  30.9  18.2  11.4   5.7  -4.5   6.2  12.0  10.9  11.1
 [73]  -2.0  -9.0  -1.4  15.4  19.1  -1.6  -5.4   5.4   7.8  -5.6  15.2  13.8
 [85] -18.8   7.1  17.1   9.3  -3.9  22.6   1.7  28.9 -21.3  21.2   8.2 -15.4
 [97]   3.2 -10.2  -6.2  14.1

Upvotes: 6

bgoldst
bgoldst

Reputation: 35324

with(rle(test.vec>threshold),t(matrix(c(0L,rep(cumsum(lengths),2L)[-length(lengths)]),2L,byrow=T)+1:0)[values,]);
##       [,1] [,2]
##  [1,]    1    8
##  [2,]   10   13
##  [3,]   16   17
##  [4,]   20   26
##  [5,]   28   28
##  [6,]   30   34
##  [7,]   36   38
##  [8,]   41   46
##  [9,]   48   49
## [10,]   51   53
## [11,]   55   81
## [12,]   84   90
## [13,]   92  100

Explanation

test.vec>threshold
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

Compute which elements in the input vector are above the threshold using vectorized comparison.


rle(...)
## Run Length Encoding
##   lengths: int [1:25] 8 1 4 2 2 2 7 1 1 1 ...
##   values : logi [1:25] TRUE FALSE TRUE FALSE TRUE FALSE ...

Compute the run-length encoding of the logical vector. This returns a list classed as 'rle' which contains two named components: lengths, containing the lengths of each run-length, and values, containing the value that ran that length, which in this case will be TRUE or FALSE, with the former representing a segment of interest, and the latter representing a non-segment run length.


with(...,...)

The first argument is the run-length encoding as described above. This will evaluate the second argument in a virtual environment consisting of the 'rle'-classed list, thus making the lengths and values components accessible as lexical variables.

Below I dive into the contents of the second argument.


cumsum(lengths)
##  [1]   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91 100

Compute the cumulative sum of the lengths. This will form the basis for computing both the start indexes and end indexes of each run-length. Critical point: Each element of the cumsum represents the end index of that run-length.


rep(...,2L)
##  [1]   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91 100   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91 100

Duplicate the cumulative sum. The first repetition will serve as the basis for the start indexes, the second the end. I will henceforth refer to these repetitions as the "start-index repetition" and the "end-index repetition".


c(0L,...[-length(lengths)])
##  [1]   0   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91 100

This removes the last element at the end of the start-index repetition, and prepends a zero to the beginning of it. This effectively lags the start-index repetition by one element. This is necessary because we need to compute each start index by adding one to the previous run-length's end index, taking zero as the end index of the non-existent run-length prior to the first.


matrix(...,2L,byrow=T)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
## [1,]    0    8    9   13   15   17   19   26   27    28    29    34    35    38    40    46    47    49    50    53    54    81    83    90    91
## [2,]    8    9   13   15   17   19   26   27   28    29    34    35    38    40    46    47    49    50    53    54    81    83    90    91   100

This builds a two-row matrix out of the previous result. The lagged start-index repetition is the top row, the end-index repetition is the bottom row.


...+1:0
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
## [1,]    1    9   10   14   16   18   20   27   28    29    30    35    36    39    41    47    48    50    51    54    55    82    84    91    92
## [2,]    8    9   13   15   17   19   26   27   28    29    34    35    38    40    46    47    49    50    53    54    81    83    90    91   100

R cycles this two-element addend across rows first, then across columns, thus this adds one to the top row. This completes the computation of the start indexes.


t(...)
##       [,1] [,2]
##  [1,]    1    8
##  [2,]    9    9
##  [3,]   10   13
##  [4,]   14   15
##  [5,]   16   17
##  [6,]   18   19
##  [7,]   20   26
##  [8,]   27   27
##  [9,]   28   28
## [10,]   29   29
## [11,]   30   34
## [12,]   35   35
## [13,]   36   38
## [14,]   39   40
## [15,]   41   46
## [16,]   47   47
## [17,]   48   49
## [18,]   50   50
## [19,]   51   53
## [20,]   54   54
## [21,]   55   81
## [22,]   82   83
## [23,]   84   90
## [24,]   91   91
## [25,]   92  100

Transpose to a two-column matrix. This is not entirely necessary, if you're ok with getting the result as a two-row matrix.


...[values,]
##       [,1] [,2]
##  [1,]    1    8
##  [2,]   10   13
##  [3,]   16   17
##  [4,]   20   26
##  [5,]   28   28
##  [6,]   30   34
##  [7,]   36   38
##  [8,]   41   46
##  [9,]   48   49
## [10,]   51   53
## [11,]   55   81
## [12,]   84   90
## [13,]   92  100

Subset just the segments of interest. Since values is a logical vector representing which run-lengths surpassed the threshold, we can use it directly as a row index vector.


Performance

I guess I'm screwing myself here, but SimonG's solution performs about twice as well as mine:

bgoldst <- function() with(rle(test.vec>threshold),t(matrix(c(0L,rep(cumsum(lengths),2L)[-length(lengths)]),2L,byrow=T)+1:0)[values,]);
simong <- function() findSegments(test.vec,threshold);
set.seed(1); test.vec <- rnorm(1e7,8,10); threshold <- 0;
identical(bgoldst(),unname(simong()));
## [1] TRUE
system.time({ bgoldst(); })
##    user  system elapsed
##   1.344   0.204   1.551
system.time({ simong(); })
##    user  system elapsed
##   0.656   0.109   0.762

+1 from me...

Upvotes: 4

Related Questions