Reputation: 5898

Split overlapping intervals into non-overlapping intervals, within values of an identifier

I would like to take a set of intervals, possibly overlapping, within categories of an identifier and create new intervals that are either exactly overlapping (ie same start/end values) or completely non-overlapping. These new intervals should collectively span the range of the original intervals and not include any ranges not in the original intervals.

This needs to be a relatively fast operation because I'm working with lots of data.

Here is some example data:

library(data.table)
set.seed(1113)
start1 <- c(1,7,9, 17, 18,1,3,20)
end1 <- c(10,12,15, 20, 23,3,5,25)
id1 <- c(1,1,1,1,1,2,2,2)
obs <- rnorm(length(id1))
x <- data.table(start1,end1,id1,obs)

    > x
   start1 end1 id1         obs
1:      1   10   1 -0.79701638
2:      7   12   1 -0.09251333
3:      9   15   1 -0.08118742
4:     17   20   1 -2.33312797
5:     18   23   1  0.26581138
6:      1    3   2 -0.34314127
7:      3    5   2 -0.17196880
8:     20   25   2  0.11614842

The output should be something like this:

    id1 start1 end1 i.start1 i.end1         obs
 1:   1      1    6        1     10 -0.79701638
 2:   1      7    8        1     10 -0.79701638
 3:   1      7    8        7     12 -0.09251333
 4:   1      9   10        1     10 -0.79701638
 5:   1      9   10        7     12 -0.09251333
 6:   1      9   10        9     15 -0.08118742
 7:   1     11   12        7     12 -0.09251333
 8:   1     11   12        9     15 -0.08118742
 9:   1     13   15        9     15 -0.08118742
10:   1     17   17       17     20 -2.33312797
11:   1     18   20       17     20 -2.33312797
12:   1     18   20       18     23  0.26581138
13:   1     21   23       18     23  0.26581138
14:   2      1    2        1      3 -0.34314127
15:   2      3    3        1      3 -0.34314127
16:   2      3    3        3      5 -0.17196880
17:   2      4    5        3      5 -0.17196880
18:   2     20   25       20     25  0.11614842

I found this algorithm that corresponds to what I want: https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541

I tried programming it directly but it was quite slow.

Upvotes: 2

Answers (3)

Michael

Reputation: 5898

I wrote a package, intervalaverage, for this and some related functions:

library(data.table)
set.seed(1113)
start1 <- c(1,7,9, 17, 18,1,3,20)
end1 <- c(10,12,15, 20, 23,3,5,25)
id1 <- c(1,1,1,1,1,2,2,2)
obs <- rnorm(length(id1))
x <- data.table(start1,end1,id1,obs)

library(intervalaverage)

x[, start1:=as.integer(start1)]
x[, end1:=as.integer(end1)]
isolateoverlaps(x,interval_vars = c("start1","end1"),group_vars = "id1")

    id1 start end start1 end1         obs
 1:   1     1   6      1   10 -0.79701638
 2:   1     7   8      1   10 -0.79701638
 3:   1     9  10      1   10 -0.79701638
 4:   1     7   8      7   12 -0.09251333
 5:   1     9  10      7   12 -0.09251333
 6:   1    11  12      7   12 -0.09251333
 7:   1     9  10      9   15 -0.08118742
 8:   1    11  12      9   15 -0.08118742
 9:   1    13  15      9   15 -0.08118742
10:   1    17  17     17   20 -2.33312797
11:   1    18  20     17   20 -2.33312797
12:   1    18  20     18   23  0.26581138
13:   1    21  23     18   23  0.26581138
14:   2     1   2      1    3 -0.34314127
15:   2     3   3      1    3 -0.34314127
16:   2     3   3      3    5 -0.17196880
17:   2     4   5      3    5 -0.17196880
18:   2    20  25     20   25  0.11614842

y <- data.table(start1=c(1L,5L,5L),end1=c(5L,5L,10L),id=c(1L,1L,1L)) 
isolateoverlaps(y,interval_vars = c("start1","end1"),group_vars = "id")

   id start end start1 end1
1:  1     1   4      1    5
2:  1     5   5      1    5
3:  1     5   5      5    5
4:  1     5   5      5   10
5:  1     6  10      5   10

Upvotes: 0

chinsoon12

Reputation: 25225

Here is another option.

#borrowing idea from https://stackoverflow.com/a/28938694/1989480
#group overlapping intervals together
x[, g := c(0L, cumsum(shift(start, -1L) > cummax(end))[-.N]), by=.(id)]

#cut those intervals into non-overlapping ones
itvl <- x[, {
    s <- sort(c(start - 1L, start, end, end + 1L))
    as.data.table(matrix(s[s %between% c(min(start), max(end))], ncol=2L, byrow=TRUE))
    }, by=.(id, g)]

#get OP's desired output using non-equi join
x[itvl, on=.(id, start<=V1, end>=V1),
    .(id1=id, start1=V1, end1=V2, i.start1=x.start, i.end1=x.end, obs),
    allow.cartesian=TRUE]

output:

    id1 start1 end1 i.start1 i.end1         obs
 1:   1      1    6        1     10 -0.79701638
 2:   1      7    8        1     10 -0.79701638
 3:   1      7    8        7     12 -0.09251333
 4:   1      9   10        1     10 -0.79701638
 5:   1      9   10        7     12 -0.09251333
 6:   1      9   10        9     15 -0.08118742
 7:   1     11   12        7     12 -0.09251333
 8:   1     11   12        9     15 -0.08118742
 9:   1     13   15        9     15 -0.08118742
10:   1     17   17       17     20 -2.33312797
11:   1     18   20       17     20 -2.33312797
12:   1     18   20       18     23  0.26581138
13:   1     21   23       18     23  0.26581138
14:   2      1    2        1      3 -0.34314127
15:   2      3    3        1      3 -0.34314127
16:   2      3    3        3      5 -0.17196880
17:   2      4    5        3      5 -0.17196880
18:   2     20   25       20     25  0.11614842

data:

library(data.table)
set.seed(1113)
id <- c(1,1,1,1,1,2,2,2)
x <- data.table(start=c(1,7,9, 17, 18,1,3,20),
    end=c(10,12,15, 20, 23,3,5,25),
    id=id,
    obs=rnorm(length(id)))

addressing comment:

library(data.table)
set.seed(1113)
x2 <- data.table(start=c(1,5,5),end=c(5,5,10),id=c(1,1,1),obs=rnorm(3))
x2[, g := c(0L, cumsum(shift(start, -1L) > cummax(end))[-.N]), by=.(id)]
itvl <- x2[, {
    s <- sort(c(start - 1L, start, end, end + 1L))
    as.data.table(matrix(s[s %between% c(min(start), max(end))], ncol=2L, byrow=TRUE))
    }, by=.(id, g)]
ans <- x2[itvl, on=.(id, start<=V1, end>=V1),
    .(id1=id, start1=V1, end1=V2, i.start1=x.start, i.end1=x.end, obs),
    allow.cartesian=TRUE]
ans[start1 >= i.start1 & end1 <= i.end1]

output:

   id1 start1 end1 i.start1 i.end1         obs
1:   1      1    4        1      5 -0.79701638
2:   1      4    5        1      5 -0.79701638
3:   1      5    5        1      5 -0.79701638
4:   1      5    5        5      5 -0.09251333
5:   1      5    5        5     10 -0.08118742
6:   1      5    6        5     10 -0.08118742
7:   1      6   10        5     10 -0.08118742

Upvotes: 1

Michael

Reputation: 5898

Here's my solution. It's based on the algorithm here (https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541), but uses data.table, shift, and vectorized ifelse statements for efficiency. It also differs from the alrgorithm in that my code allows this operation to be performed separately for multiple datasets identified by an id_column. My approach also ignores keeping track of rows (ie "attribute") since it's not necessary to define this when the intervals can be easily merged back to the original data using foverlaps anyway. foverlaps also serves the purpose of excluding gaps

Please tell me whether you see any inefficiencies

remove_overlaps <- function(x, start_column, end_column, id_column=NULL){

  xd <- melt(x[,c(start_column,end_column,id_column),with=FALSE],id=id_column)

  xd[variable==start_column,end:=FALSE]
  xd[variable==end_column,end:=TRUE]
  setorderv(xd,c(id_column, "value","end"))

  xd[,end_next:=shift(end,type="lead"),by=id_column]
  xd[,value_next:=shift(value,type="lead"),by=id_column]


  #excluding end_next when missing should cause this to ignore the last row in each group
  #because this element will be NA as defined by shift
  temp <- xd[,.SD[!is.na(end_next),list(
    start=ifelse(!end,value,value+1),
    end=ifelse(!end_next,value_next-1,value_next)
  )],by=id_column]

  temp <- temp[end>=start]

  setnames(temp , c("start","end"),c(start_column,end_column))

  setkeyv(temp,c(id_column,start_column,end_column))

  out <- foverlaps(x,temp)
  setorderv(out, c(id_column,start_column,
                   paste0("i.",start_column),
                   paste0("i.",end_column)
  ))

  out
}
remove_overlaps(x, start_column="start1",end_column="end1",id_column="id1")

Also, for what it's worth I don't think the suggestion linked on that page is correct on how to exclude gaps.

This answer doesn't take account of gaps (gaps should not appear in output), so I refined it: * If e=false, add a to S. If e=true, take away a from S. * Define n'=n if e=false or n'=n+1 if e=true * Define m'=m-1 if f=false or m'=m if f=true * If n' <= m' and (e and not f) = false, output (n',m',S), otherwise output nothing. – silentman.it Aug 23 '18 at 12:19

Here is a second version of this code algorithm implemented in R: remove_overlaps doesn't explicitly use silentman.it's suggestion to exclude gaps, whereas remove_overlaps1 uses that suggestion. Note that both functions do exclude gaps via the subsequent call to foverlaps, which only returns intervals if they partially match to those in x (the original data).

library(data.table)



remove_overlaps1 <- function(x, start_column, end_column, id_column=NULL){

  xd <- melt(x[,c(start_column,end_column,id_column),with=FALSE],id=id_column)

  xd[variable==start_column,end:=FALSE]
  xd[variable==end_column,end:=TRUE]
  setorderv(xd,c(id_column, "value","end"))

  xd[,end_next:=shift(end,type="lead"),by=id_column]
  xd[,value_next:=shift(value,type="lead"),by=id_column]

###subset to rows where (e & !f) = FALSE, as per comment suggestion on linked answer
  temp <- xd[,.SD[!is.na(end_next)&!(end & !end_next),list(
    start=ifelse(!end,value,value+1),
    end=ifelse(!end_next,value_next-1,value_next)
  )],by=id_column]

  temp <- temp[end>=start]

  setnames(temp , c("start","end"),c(start_column,end_column))

  setkeyv(temp,c(id_column,start_column,end_column))


  out <- foverlaps(x,temp) #this should exclude gaps since foverlaps by default subsets to 
  setorderv(out, c(id_column,start_column,
                   paste0("i.",start_column),
                   paste0("i.",end_column)
  ))

  out
}

Example data:

library(data.table)
x <-
  structure(
    list(
      native_id = c(
        "1",
        "1",
        "1",
        "1",
        "1"
      ),
      n_start_date = c(14761, 14775,
                       14789, 14803, 14817),
      n_end_date = c(14776, 14790, 14804, 14818,
                     14832),
      obs = c(
        31.668140525481,
        34.8623263656539,
        35.0841466093899,
        37.2281249364127,
        36.3726151694052
      )
    ),
    row.names = c(NA,-5L),
    class = "data.frame",
    .Names = c("native_id",
               "n_start_date", "n_end_date", "obs")
  )

setDT(x)

> x
   native_id n_start_date n_end_date      obs
1:         1        14761      14776 31.66814
2:         1        14775      14790 34.86233
3:         1        14789      14804 35.08415
4:         1        14803      14818 37.22812
5:         1        14817      14832 36.37262

Results:

> remove_overlaps(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
    native_id n_start_date n_end_date i.n_start_date i.n_end_date      obs
 1:         1        14761      14774          14761        14776 31.66814
 2:         1        14775      14776          14761        14776 31.66814
 3:         1        14775      14776          14775        14790 34.86233
 4:         1        14777      14788          14775        14790 34.86233
 5:         1        14789      14790          14775        14790 34.86233
 6:         1        14789      14790          14789        14804 35.08415
 7:         1        14791      14802          14789        14804 35.08415
 8:         1        14803      14804          14789        14804 35.08415
 9:         1        14803      14804          14803        14818 37.22812
10:         1        14805      14816          14803        14818 37.22812
11:         1        14817      14818          14803        14818 37.22812
12:         1        14817      14818          14817        14832 36.37262
13:         1        14819      14832          14817        14832 36.37262

Seemingly incorrect, excludes too many intervals:

>  remove_overlaps1(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
    native_id n_start_date n_end_date i.n_start_date i.n_end_date      obs
 1:         1        14761      14774          14761        14776 31.66814
 2:         1        14775      14776          14761        14776 31.66814
 3:         1        14775      14776          14775        14790 34.86233
 4:         1        14789      14790          14775        14790 34.86233
 5:         1        14789      14790          14789        14804 35.08415
 6:         1        14803      14804          14789        14804 35.08415
 7:         1        14803      14804          14803        14818 37.22812
 8:         1        14817      14818          14803        14818 37.22812
 9:         1        14817      14818          14817        14832 36.37262
10:         1        14819      14832          14817        14832 36.37262

Upvotes: 0

Split overlapping intervals into non-overlapping intervals, within values of an identifier

Answers (3)

Related Questions