Reputation: 149

Find overlapping ranges by factor level using R

I need to find overlapping ranges within a single data set but need to find them for each ID or factor level. Any assistance would be appreciated!

    library(dplyr)

df_foo = read.table(
  textConnection("Class    Min  Max
A    500  630
A    100  200
B    100  200
A    210  310
A    200  210
B    210  310
A    510  530
B    200  210
A    705  800
B    500  630
B    510  530
B    705  800"), header = TRUE
)

c = outer(df_foo$Max, df_foo$Min, ">")
d = outer(df_foo$Min, df_foo$Max, "<")

df_foo %>% 
  mutate(Overlap = apply(c & d, 1, sum) > 1 
  )

The result I am getting is as follows:

   Class Min Max Overlap
1      A 500 630    TRUE
2      A 100 200    TRUE
3      B 100 200    TRUE
4      A 210 310    TRUE
5      A 200 210    TRUE
6      B 210 310    TRUE
7      A 510 530    TRUE
8      B 200 210    TRUE
9      A 705 800    TRUE
10     B 500 630    TRUE
11     B 510 530    TRUE
12     B 705 800    TRUE

But I want to find the overlaps between each level of A and B like this:

   Class Min Max Overlap
1      A 500 630    TRUE
2      A 100 200    FALSE
3      B 100 200    FALSE
4      A 210 310    FALSE
5      A 200 210    FALSE
6      B 210 310    FALSE
7      A 510 530    TRUE
8      B 200 210    FALSE
9      A 705 800    FALSE
10     B 500 630    TRUE
11     B 510 530    TRUE
12     B 705 800    FALSE

Upvotes: 3

Answers (3)

Wimpel

Reputation: 27732

An other data.table approach.
The order of the sample data / ranges is irrelevant in this answer... foverlaps() does all the hard work for you.

sample data

library( data.table )
dt <- as.data.table( df_foo )

code

#set key for the data.table
setkey(dt, Min, Max)
#perform overlap join, keep only joined ranges where the class is the same, and Min and Max are not the same.
result <- foverlaps( dt, dt )[ Class == i.Class & !(Min == i.Min | Max == i.Max | Min == i.Max | Max == i.Min), ]
#create a logical vector (i.e. Overlap) by checking if the (pasted) combination of
#Class, Min and Max exists in both 'dt' and 'result'
dt[ , Overlap := paste0( Class, Min, Max ) %in% paste0( result$Class, result$Min, result$Max) ][]

#     Class Min Max Overlap
#  1:     A 100 200   FALSE
#  2:     B 100 200   FALSE
#  3:     A 200 210   FALSE
#  4:     B 200 210   FALSE
#  5:     A 210 310   FALSE
#  6:     B 210 310   FALSE
#  7:     A 500 630    TRUE
#  8:     B 500 630    TRUE
#  9:     A 510 530    TRUE
# 10:     B 510 530    TRUE
# 11:     A 705 800   FALSE
# 12:     B 705 800   FALSE

Upvotes: 1

boski

Reputation: 2467

with dplyr

df=df_foo%>%group_by(Class)%>%
  mutate(Overlap=if_else(Min<lag(Max,order_by=Class),TRUE,FALSE))
df$Overlap[which(df$Overlap==TRUE)-1]=TRUE
df$Overlap[which(is.na(df$Overlap))]=FALSE

> df
# A tibble: 12 x 4
# Groups:   Class [2]
   Class   Min   Max Overlap
   <fct> <dbl> <dbl> <lgl>  
 1 A       100   200 FALSE  
 2 A       200   210 FALSE  
 3 A       210   310 FALSE  
 4 A       500   630 TRUE   
 5 A       510   530 TRUE   
 6 A       705   800 FALSE  
 7 B       100   200 FALSE  
 8 B       200   210 FALSE  
 9 B       210   310 FALSE  
10 B       500   630 TRUE   
11 B       510   530 TRUE   
12 B       705   800 FALSE

This code assumes your values are in ascending order, as it only checks with the previous row.

edit
Not the prettiest but works.

df_foo$Class=as.character.factor(df_foo$Class)
df_foo=as.data.frame(df_foo)
df_foo$Overlap=rep("FALSE",nrow(df_foo))
for (i in 1:nrow(df_foo)){
  aux=FALSE
  class=df_foo$Class[i]
  df=df_foo[-i,]%>%filter(.,Class==class)
  for (j in 1:nrow(df)){
    if (df_foo[i,"Min"]<df[j,"Max"] & df_foo[i,"Max"] > df[j,"Min"]){
      aux=TRUE
    }
  }
  df_foo[i,"Overlap"]=aux
}


> df_foo
   Class Min Max Overlap
1      A 500 630    TRUE
2      A 100 200   FALSE
3      B 100 200   FALSE
4      A 210 310   FALSE
5      A 200 210   FALSE
6      B 210 310   FALSE
7      A 510 530    TRUE
8      B 200 210   FALSE
9      A 705 800   FALSE
10     B 500 630    TRUE
11     B 510 530    TRUE
12     B 705 800   FALSE

There must be a way to do it with dplyr but I could not figure it out. What is going on is that it loops through each row of df_foo; it generates a dataframe with all other rows of the same group and compares if there is any overlap (min<max and max<min)

Upvotes: 1

denis

Reputation: 5673

I have the answer in data.table, the translation into dplyr should be straigtworfard. The idea is to create a vector of the previous cumulated maximum , per class :

df_foo <- setDT(df_foo)
df_foo[, shiftedmaxmax := c(NA,cummax(Max)[1:(.N-1)]),by = Class  ]


    Class Min Max shiftedmaxmax
 1:     A 100 200            NA
 2:     A 200 210           200
 3:     A 210 310           210
 4:     A 500 630           310
 5:     A 510 530           630
 6:     A 705 800           630
 7:     B 100 200            NA
 8:     B 200 210           200
 9:     B 210 310           210
10:     B 500 630           310
11:     B 510 530           630
12:     B 705 800           630

And to compare the minimum with the previous maximum of Maximum (so if it is inferior to this value, the current line is in the range of one previous line)

df_foo[,superposed := Min < shiftedmaxmax]

    Class Min Max shiftedmaxmax superposed
 1:     A 100 200            NA         NA
 2:     A 200 210           200      FALSE
 3:     A 210 310           210      FALSE
 4:     A 500 630           310      FALSE
 5:     A 510 530           630       TRUE
 6:     A 705 800           630      FALSE
 7:     B 100 200            NA         NA
 8:     B 200 210           200      FALSE
 9:     B 210 310           210      FALSE
10:     B 500 630           310      FALSE
11:     B 510 530           630       TRUE
12:     B 705 800           630      FALSE

The first superposed is missing, and you can get it this way:

df_foo[,superposedsource :=  Max %in% shiftedmaxmax[superposed],by = Class]
df_foo[,superposedtot := ifelse((superposed | superposedsource) &,T,F)]

    Class Min Max shiftedmaxmax superposed superposedsource superposedtot
 1:     A 100 200            NA         NA            FALSE            NA
 2:     A 200 210           200      FALSE            FALSE         FALSE
 3:     A 210 310           210      FALSE            FALSE         FALSE
 4:     A 500 630           310      FALSE             TRUE          TRUE
 5:     A 510 530           630       TRUE            FALSE          TRUE
 6:     A 705 800           630      FALSE            FALSE         FALSE
 7:     B 100 200            NA         NA            FALSE            NA
 8:     B 200 210           200      FALSE            FALSE         FALSE
 9:     B 210 310           210      FALSE            FALSE         FALSE
10:     B 500 630           310      FALSE             TRUE          TRUE
11:     B 510 530           630       TRUE            FALSE          TRUE
12:     B 705 800           630      FALSE            FALSE         FALSE

Upvotes: 2

Find overlapping ranges by factor level using R

Answers (3)

Related Questions