Reputation: 149
I need to find overlapping ranges within a single data set but need to find them for each ID or factor level. Any assistance would be appreciated!
library(dplyr)
df_foo = read.table(
textConnection("Class Min Max
A 500 630
A 100 200
B 100 200
A 210 310
A 200 210
B 210 310
A 510 530
B 200 210
A 705 800
B 500 630
B 510 530
B 705 800"), header = TRUE
)
c = outer(df_foo$Max, df_foo$Min, ">")
d = outer(df_foo$Min, df_foo$Max, "<")
df_foo %>%
mutate(Overlap = apply(c & d, 1, sum) > 1
)
The result I am getting is as follows:
Class Min Max Overlap
1 A 500 630 TRUE
2 A 100 200 TRUE
3 B 100 200 TRUE
4 A 210 310 TRUE
5 A 200 210 TRUE
6 B 210 310 TRUE
7 A 510 530 TRUE
8 B 200 210 TRUE
9 A 705 800 TRUE
10 B 500 630 TRUE
11 B 510 530 TRUE
12 B 705 800 TRUE
But I want to find the overlaps between each level of A and B like this:
Class Min Max Overlap
1 A 500 630 TRUE
2 A 100 200 FALSE
3 B 100 200 FALSE
4 A 210 310 FALSE
5 A 200 210 FALSE
6 B 210 310 FALSE
7 A 510 530 TRUE
8 B 200 210 FALSE
9 A 705 800 FALSE
10 B 500 630 TRUE
11 B 510 530 TRUE
12 B 705 800 FALSE
Upvotes: 3
Views: 248
Reputation: 27732
An other data.table
approach.
The order of the sample data / ranges is irrelevant in this answer... foverlaps()
does all the hard work for you.
sample data
library( data.table )
dt <- as.data.table( df_foo )
code
#set key for the data.table
setkey(dt, Min, Max)
#perform overlap join, keep only joined ranges where the class is the same, and Min and Max are not the same.
result <- foverlaps( dt, dt )[ Class == i.Class & !(Min == i.Min | Max == i.Max | Min == i.Max | Max == i.Min), ]
#create a logical vector (i.e. Overlap) by checking if the (pasted) combination of
#Class, Min and Max exists in both 'dt' and 'result'
dt[ , Overlap := paste0( Class, Min, Max ) %in% paste0( result$Class, result$Min, result$Max) ][]
# Class Min Max Overlap
# 1: A 100 200 FALSE
# 2: B 100 200 FALSE
# 3: A 200 210 FALSE
# 4: B 200 210 FALSE
# 5: A 210 310 FALSE
# 6: B 210 310 FALSE
# 7: A 500 630 TRUE
# 8: B 500 630 TRUE
# 9: A 510 530 TRUE
# 10: B 510 530 TRUE
# 11: A 705 800 FALSE
# 12: B 705 800 FALSE
Upvotes: 1
Reputation: 2467
with dplyr
df=df_foo%>%group_by(Class)%>%
mutate(Overlap=if_else(Min<lag(Max,order_by=Class),TRUE,FALSE))
df$Overlap[which(df$Overlap==TRUE)-1]=TRUE
df$Overlap[which(is.na(df$Overlap))]=FALSE
> df
# A tibble: 12 x 4
# Groups: Class [2]
Class Min Max Overlap
<fct> <dbl> <dbl> <lgl>
1 A 100 200 FALSE
2 A 200 210 FALSE
3 A 210 310 FALSE
4 A 500 630 TRUE
5 A 510 530 TRUE
6 A 705 800 FALSE
7 B 100 200 FALSE
8 B 200 210 FALSE
9 B 210 310 FALSE
10 B 500 630 TRUE
11 B 510 530 TRUE
12 B 705 800 FALSE
This code assumes your values are in ascending order, as it only checks with the previous row.
edit
Not the prettiest but works.
df_foo$Class=as.character.factor(df_foo$Class)
df_foo=as.data.frame(df_foo)
df_foo$Overlap=rep("FALSE",nrow(df_foo))
for (i in 1:nrow(df_foo)){
aux=FALSE
class=df_foo$Class[i]
df=df_foo[-i,]%>%filter(.,Class==class)
for (j in 1:nrow(df)){
if (df_foo[i,"Min"]<df[j,"Max"] & df_foo[i,"Max"] > df[j,"Min"]){
aux=TRUE
}
}
df_foo[i,"Overlap"]=aux
}
> df_foo
Class Min Max Overlap
1 A 500 630 TRUE
2 A 100 200 FALSE
3 B 100 200 FALSE
4 A 210 310 FALSE
5 A 200 210 FALSE
6 B 210 310 FALSE
7 A 510 530 TRUE
8 B 200 210 FALSE
9 A 705 800 FALSE
10 B 500 630 TRUE
11 B 510 530 TRUE
12 B 705 800 FALSE
There must be a way to do it with dplyr
but I could not figure it out. What is going on is that it loops through each row of df_foo
; it generates a dataframe
with all other rows of the same group and compares if there is any overlap (min<max
and max<min
)
Upvotes: 1
Reputation: 5673
I have the answer in data.table
, the translation into dplyr
should be straigtworfard. The idea is to create a vector of the previous cumulated maximum , per class :
df_foo <- setDT(df_foo)
df_foo[, shiftedmaxmax := c(NA,cummax(Max)[1:(.N-1)]),by = Class ]
Class Min Max shiftedmaxmax
1: A 100 200 NA
2: A 200 210 200
3: A 210 310 210
4: A 500 630 310
5: A 510 530 630
6: A 705 800 630
7: B 100 200 NA
8: B 200 210 200
9: B 210 310 210
10: B 500 630 310
11: B 510 530 630
12: B 705 800 630
And to compare the minimum with the previous maximum of Maximum (so if it is inferior to this value, the current line is in the range of one previous line)
df_foo[,superposed := Min < shiftedmaxmax]
Class Min Max shiftedmaxmax superposed
1: A 100 200 NA NA
2: A 200 210 200 FALSE
3: A 210 310 210 FALSE
4: A 500 630 310 FALSE
5: A 510 530 630 TRUE
6: A 705 800 630 FALSE
7: B 100 200 NA NA
8: B 200 210 200 FALSE
9: B 210 310 210 FALSE
10: B 500 630 310 FALSE
11: B 510 530 630 TRUE
12: B 705 800 630 FALSE
The first superposed is missing, and you can get it this way:
df_foo[,superposedsource := Max %in% shiftedmaxmax[superposed],by = Class]
df_foo[,superposedtot := ifelse((superposed | superposedsource) &,T,F)]
Class Min Max shiftedmaxmax superposed superposedsource superposedtot
1: A 100 200 NA NA FALSE NA
2: A 200 210 200 FALSE FALSE FALSE
3: A 210 310 210 FALSE FALSE FALSE
4: A 500 630 310 FALSE TRUE TRUE
5: A 510 530 630 TRUE FALSE TRUE
6: A 705 800 630 FALSE FALSE FALSE
7: B 100 200 NA NA FALSE NA
8: B 200 210 200 FALSE FALSE FALSE
9: B 210 310 210 FALSE FALSE FALSE
10: B 500 630 310 FALSE TRUE TRUE
11: B 510 530 630 TRUE FALSE TRUE
12: B 705 800 630 FALSE FALSE FALSE
Upvotes: 2