Reputation: 5898
I would like to take a set of intervals, possibly overlapping, within categories of an identifier and create new intervals that are either exactly overlapping (ie same start/end values) or completely non-overlapping. These new intervals should collectively span the range of the original intervals and not include any ranges not in the original intervals.
This needs to be a relatively fast operation because I'm working with lots of data.
Here is some example data:
library(data.table)
set.seed(1113)
start1 <- c(1,7,9, 17, 18,1,3,20)
end1 <- c(10,12,15, 20, 23,3,5,25)
id1 <- c(1,1,1,1,1,2,2,2)
obs <- rnorm(length(id1))
x <- data.table(start1,end1,id1,obs)
> x
start1 end1 id1 obs
1: 1 10 1 -0.79701638
2: 7 12 1 -0.09251333
3: 9 15 1 -0.08118742
4: 17 20 1 -2.33312797
5: 18 23 1 0.26581138
6: 1 3 2 -0.34314127
7: 3 5 2 -0.17196880
8: 20 25 2 0.11614842
The output should be something like this:
id1 start1 end1 i.start1 i.end1 obs
1: 1 1 6 1 10 -0.79701638
2: 1 7 8 1 10 -0.79701638
3: 1 7 8 7 12 -0.09251333
4: 1 9 10 1 10 -0.79701638
5: 1 9 10 7 12 -0.09251333
6: 1 9 10 9 15 -0.08118742
7: 1 11 12 7 12 -0.09251333
8: 1 11 12 9 15 -0.08118742
9: 1 13 15 9 15 -0.08118742
10: 1 17 17 17 20 -2.33312797
11: 1 18 20 17 20 -2.33312797
12: 1 18 20 18 23 0.26581138
13: 1 21 23 18 23 0.26581138
14: 2 1 2 1 3 -0.34314127
15: 2 3 3 1 3 -0.34314127
16: 2 3 3 3 5 -0.17196880
17: 2 4 5 3 5 -0.17196880
18: 2 20 25 20 25 0.11614842
I found this algorithm that corresponds to what I want: https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541
I tried programming it directly but it was quite slow.
Upvotes: 2
Views: 2785
Reputation: 5898
I wrote a package, intervalaverage
, for this and some related functions:
library(data.table)
set.seed(1113)
start1 <- c(1,7,9, 17, 18,1,3,20)
end1 <- c(10,12,15, 20, 23,3,5,25)
id1 <- c(1,1,1,1,1,2,2,2)
obs <- rnorm(length(id1))
x <- data.table(start1,end1,id1,obs)
library(intervalaverage)
x[, start1:=as.integer(start1)]
x[, end1:=as.integer(end1)]
isolateoverlaps(x,interval_vars = c("start1","end1"),group_vars = "id1")
id1 start end start1 end1 obs
1: 1 1 6 1 10 -0.79701638
2: 1 7 8 1 10 -0.79701638
3: 1 9 10 1 10 -0.79701638
4: 1 7 8 7 12 -0.09251333
5: 1 9 10 7 12 -0.09251333
6: 1 11 12 7 12 -0.09251333
7: 1 9 10 9 15 -0.08118742
8: 1 11 12 9 15 -0.08118742
9: 1 13 15 9 15 -0.08118742
10: 1 17 17 17 20 -2.33312797
11: 1 18 20 17 20 -2.33312797
12: 1 18 20 18 23 0.26581138
13: 1 21 23 18 23 0.26581138
14: 2 1 2 1 3 -0.34314127
15: 2 3 3 1 3 -0.34314127
16: 2 3 3 3 5 -0.17196880
17: 2 4 5 3 5 -0.17196880
18: 2 20 25 20 25 0.11614842
y <- data.table(start1=c(1L,5L,5L),end1=c(5L,5L,10L),id=c(1L,1L,1L))
isolateoverlaps(y,interval_vars = c("start1","end1"),group_vars = "id")
id start end start1 end1
1: 1 1 4 1 5
2: 1 5 5 1 5
3: 1 5 5 5 5
4: 1 5 5 5 10
5: 1 6 10 5 10
Upvotes: 0
Reputation: 25225
Here is another option.
#borrowing idea from https://stackoverflow.com/a/28938694/1989480
#group overlapping intervals together
x[, g := c(0L, cumsum(shift(start, -1L) > cummax(end))[-.N]), by=.(id)]
#cut those intervals into non-overlapping ones
itvl <- x[, {
s <- sort(c(start - 1L, start, end, end + 1L))
as.data.table(matrix(s[s %between% c(min(start), max(end))], ncol=2L, byrow=TRUE))
}, by=.(id, g)]
#get OP's desired output using non-equi join
x[itvl, on=.(id, start<=V1, end>=V1),
.(id1=id, start1=V1, end1=V2, i.start1=x.start, i.end1=x.end, obs),
allow.cartesian=TRUE]
output:
id1 start1 end1 i.start1 i.end1 obs
1: 1 1 6 1 10 -0.79701638
2: 1 7 8 1 10 -0.79701638
3: 1 7 8 7 12 -0.09251333
4: 1 9 10 1 10 -0.79701638
5: 1 9 10 7 12 -0.09251333
6: 1 9 10 9 15 -0.08118742
7: 1 11 12 7 12 -0.09251333
8: 1 11 12 9 15 -0.08118742
9: 1 13 15 9 15 -0.08118742
10: 1 17 17 17 20 -2.33312797
11: 1 18 20 17 20 -2.33312797
12: 1 18 20 18 23 0.26581138
13: 1 21 23 18 23 0.26581138
14: 2 1 2 1 3 -0.34314127
15: 2 3 3 1 3 -0.34314127
16: 2 3 3 3 5 -0.17196880
17: 2 4 5 3 5 -0.17196880
18: 2 20 25 20 25 0.11614842
data:
library(data.table)
set.seed(1113)
id <- c(1,1,1,1,1,2,2,2)
x <- data.table(start=c(1,7,9, 17, 18,1,3,20),
end=c(10,12,15, 20, 23,3,5,25),
id=id,
obs=rnorm(length(id)))
addressing comment:
library(data.table)
set.seed(1113)
x2 <- data.table(start=c(1,5,5),end=c(5,5,10),id=c(1,1,1),obs=rnorm(3))
x2[, g := c(0L, cumsum(shift(start, -1L) > cummax(end))[-.N]), by=.(id)]
itvl <- x2[, {
s <- sort(c(start - 1L, start, end, end + 1L))
as.data.table(matrix(s[s %between% c(min(start), max(end))], ncol=2L, byrow=TRUE))
}, by=.(id, g)]
ans <- x2[itvl, on=.(id, start<=V1, end>=V1),
.(id1=id, start1=V1, end1=V2, i.start1=x.start, i.end1=x.end, obs),
allow.cartesian=TRUE]
ans[start1 >= i.start1 & end1 <= i.end1]
output:
id1 start1 end1 i.start1 i.end1 obs
1: 1 1 4 1 5 -0.79701638
2: 1 4 5 1 5 -0.79701638
3: 1 5 5 1 5 -0.79701638
4: 1 5 5 5 5 -0.09251333
5: 1 5 5 5 10 -0.08118742
6: 1 5 6 5 10 -0.08118742
7: 1 6 10 5 10 -0.08118742
Upvotes: 1
Reputation: 5898
Here's my solution.
It's based on the algorithm here (https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541), but uses data.table, shift, and vectorized ifelse statements for efficiency. It also differs from the alrgorithm in that my code allows this operation to be performed separately for multiple datasets identified by an id_column. My approach also ignores keeping track of rows (ie "attribute") since it's not necessary to define this when the intervals can be easily merged back to the original data using foverlaps
anyway. foverlaps also serves the purpose of excluding gaps
Please tell me whether you see any inefficiencies
remove_overlaps <- function(x, start_column, end_column, id_column=NULL){
xd <- melt(x[,c(start_column,end_column,id_column),with=FALSE],id=id_column)
xd[variable==start_column,end:=FALSE]
xd[variable==end_column,end:=TRUE]
setorderv(xd,c(id_column, "value","end"))
xd[,end_next:=shift(end,type="lead"),by=id_column]
xd[,value_next:=shift(value,type="lead"),by=id_column]
#excluding end_next when missing should cause this to ignore the last row in each group
#because this element will be NA as defined by shift
temp <- xd[,.SD[!is.na(end_next),list(
start=ifelse(!end,value,value+1),
end=ifelse(!end_next,value_next-1,value_next)
)],by=id_column]
temp <- temp[end>=start]
setnames(temp , c("start","end"),c(start_column,end_column))
setkeyv(temp,c(id_column,start_column,end_column))
out <- foverlaps(x,temp)
setorderv(out, c(id_column,start_column,
paste0("i.",start_column),
paste0("i.",end_column)
))
out
}
remove_overlaps(x, start_column="start1",end_column="end1",id_column="id1")
Also, for what it's worth I don't think the suggestion linked on that page is correct on how to exclude gaps.
This answer doesn't take account of gaps (gaps should not appear in output), so I refined it: * If e=false, add a to S. If e=true, take away a from S. * Define n'=n if e=false or n'=n+1 if e=true * Define m'=m-1 if f=false or m'=m if f=true * If n' <= m' and (e and not f) = false, output (n',m',S), otherwise output nothing. – silentman.it Aug 23 '18 at 12:19
Here is a second version of this code algorithm implemented in R: remove_overlaps doesn't explicitly use silentman.it's suggestion to exclude gaps, whereas remove_overlaps1 uses that suggestion. Note that both functions do exclude gaps via the subsequent call to foverlaps, which only returns intervals if they partially match to those in x (the original data).
library(data.table)
remove_overlaps1 <- function(x, start_column, end_column, id_column=NULL){
xd <- melt(x[,c(start_column,end_column,id_column),with=FALSE],id=id_column)
xd[variable==start_column,end:=FALSE]
xd[variable==end_column,end:=TRUE]
setorderv(xd,c(id_column, "value","end"))
xd[,end_next:=shift(end,type="lead"),by=id_column]
xd[,value_next:=shift(value,type="lead"),by=id_column]
###subset to rows where (e & !f) = FALSE, as per comment suggestion on linked answer
temp <- xd[,.SD[!is.na(end_next)&!(end & !end_next),list(
start=ifelse(!end,value,value+1),
end=ifelse(!end_next,value_next-1,value_next)
)],by=id_column]
temp <- temp[end>=start]
setnames(temp , c("start","end"),c(start_column,end_column))
setkeyv(temp,c(id_column,start_column,end_column))
out <- foverlaps(x,temp) #this should exclude gaps since foverlaps by default subsets to
setorderv(out, c(id_column,start_column,
paste0("i.",start_column),
paste0("i.",end_column)
))
out
}
Example data:
library(data.table)
x <-
structure(
list(
native_id = c(
"1",
"1",
"1",
"1",
"1"
),
n_start_date = c(14761, 14775,
14789, 14803, 14817),
n_end_date = c(14776, 14790, 14804, 14818,
14832),
obs = c(
31.668140525481,
34.8623263656539,
35.0841466093899,
37.2281249364127,
36.3726151694052
)
),
row.names = c(NA,-5L),
class = "data.frame",
.Names = c("native_id",
"n_start_date", "n_end_date", "obs")
)
setDT(x)
> x
native_id n_start_date n_end_date obs
1: 1 14761 14776 31.66814
2: 1 14775 14790 34.86233
3: 1 14789 14804 35.08415
4: 1 14803 14818 37.22812
5: 1 14817 14832 36.37262
Results:
> remove_overlaps(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
native_id n_start_date n_end_date i.n_start_date i.n_end_date obs
1: 1 14761 14774 14761 14776 31.66814
2: 1 14775 14776 14761 14776 31.66814
3: 1 14775 14776 14775 14790 34.86233
4: 1 14777 14788 14775 14790 34.86233
5: 1 14789 14790 14775 14790 34.86233
6: 1 14789 14790 14789 14804 35.08415
7: 1 14791 14802 14789 14804 35.08415
8: 1 14803 14804 14789 14804 35.08415
9: 1 14803 14804 14803 14818 37.22812
10: 1 14805 14816 14803 14818 37.22812
11: 1 14817 14818 14803 14818 37.22812
12: 1 14817 14818 14817 14832 36.37262
13: 1 14819 14832 14817 14832 36.37262
Seemingly incorrect, excludes too many intervals:
> remove_overlaps1(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
native_id n_start_date n_end_date i.n_start_date i.n_end_date obs
1: 1 14761 14774 14761 14776 31.66814
2: 1 14775 14776 14761 14776 31.66814
3: 1 14775 14776 14775 14790 34.86233
4: 1 14789 14790 14775 14790 34.86233
5: 1 14789 14790 14789 14804 35.08415
6: 1 14803 14804 14789 14804 35.08415
7: 1 14803 14804 14803 14818 37.22812
8: 1 14817 14818 14803 14818 37.22812
9: 1 14817 14818 14817 14832 36.37262
10: 1 14819 14832 14817 14832 36.37262
Upvotes: 0