Callie
Callie

Reputation: 343

Find Overlapping Regions between two dataframes

I have two dataframes as follows:

Data1:

chr19   45770502    45770503    5.26315789473684
chr19   45770513    45770514    3.17460317460317
chr19   45770516    45770517    6.56063618290259
chr19   45770526    45770527    7.3558648111332
chr19   45770538    45770539    5.81162324649299
chr19   45770539    45770540    0
chr19   45770541    45770542    6.85483870967742
chr19   47430080    47430081    0
chr19   47430099    47430100    0
chr19   47430113    47430114    0
chr19   47430127    47430128    0
chr19   47430164    47430165    0
chr19   47430166    47430167    0
chr19   47430175    47430176    0
chr19   47430187    47430188    0
chr19   47430189    47430190    0
chr19   47430191    47430192    0
chr19   47430196    47430197    0
chr19   47430205    47430206    0
chr19   47430208    47430209    0
chr19   47430211    47430212    0
chr19   47430222    47430223    0
chr19   47430228    47430229    0
chr7    23904987    23904988    0
chr7    23904990    23904991    0

Data2:

chr19   45770509    45777447    uc061acd.1  0   -   45770509    45777447    0   5   131,98,112,86,121,  0,1058,2131,4439,6817,
chr19   45770921    45772712    uc061ace.1  0   -   45771157    45772712    0   4   475,98,158,72,  0,646,849,1719,
chr19   45770981    45772504    uc061acf.1  0   +   45770981    45770981    0   3   98,186,199, 0,508,1324,
chr19   45770995    45772504    uc061acg.1  0   +   45770995    45770995    0   3   84,95,199,  0,594,1310,
chr19   45771012    45772504    uc061ach.1  0   +   45771012    45771012    0   3   67,86,199,  0,577,1293,
chr19   45771532    45775268    uc061aci.1  0   -   45771532    45771532    0   4   133,158,112,320,    0,238,1108,3416,
chr19   45774947    45777037    uc061acj.1  0   -   45774947    45774947    0   2   87,379, 0,1711,

I want to create an output where overlapping start and end positions from Data1 and Data2 are extracted and the values in column4 from Data1 are added together for the overlapping regions.

Output Example:

chr19 45770513 45770542 35

I want to sum values from column4 of Data1 where the start and end position overlap with Data2.

How can I create output in this format for every possible overlap with every change in chr?

Thanks in advance.

Upvotes: 3

Views: 479

Answers (2)

Prem
Prem

Reputation: 11955

If I understood the question correctly then you can try data.table approach

library(data.table)

#convert sample data into data table
DT1 <- as.data.table(df1)
DT2 <- as.data.table(df2)

#identify rows of DT1 which fall under DT2's range (see 'pos_range' column) 
#In case of NA (i.e. not found) replace it with row_number so that proper summarisation happens at the end
DT1[DT2, pos_range := paste(V2, V3, sep = '-'), 
    on = .(col2 >= V2, col3 <= V3)][, .(col1, col2, col3, col4, pos_range)]
DT1[, pos_range := ifelse(is.na(pos_range), .I, pos_range)]

#summarise data
DT <- unique(DT1[, c("start_pos", "end_pos", "value_sum") := list(first(col2), last(col3), sum(col4)), 
                 .(col1, pos_range)][, .(col1, start_pos, end_pos, value_sum)])

Output is:

> DT
     col1 start_pos  end_pos value_sum
 1: chr19  45770502 45770503  5.263158
 2: chr19  45770513 45770542 29.757566
 3: chr19  47430080 47430081  0.000000
 4: chr19  47430099 47430100  0.000000
...



Update: In case you want to know overlapping rows only then you need to simply ignore NA in pos_range column of DT1

library(data.table)    
DT1 <- as.data.table(df1)
DT2 <- as.data.table(df2)

DT <- DT1[DT2, pos_range := paste(V2, V3, sep = '-'), 
          on = .(col2 >= V2, col3 <= V3)][!is.na(pos_range), .(col1, col2, col3, col4, pos_range)]
DT <- unique(DT[, c("start_pos", "end_pos", "value_sum") := list(first(col2), last(col3), sum(col4)), 
                .(col1, pos_range)][, .(col1, start_pos, end_pos, value_sum)])
DT
#    col1 start_pos  end_pos value_sum
#1: chr19  45770513 45770542  29.75757

Sample data:

df1 <- structure(list(col1 = c("chr19", "chr19", "chr19", "chr19", "chr19", 
"chr19", "chr19", "chr19", "chr19", "chr19", "chr19", "chr19", 
"chr19", "chr19", "chr19", "chr19", "chr19", "chr19", "chr19", 
"chr19", "chr19", "chr19", "chr19", "chr7", "chr7"), col2 = c(45770502L, 
45770513L, 45770516L, 45770526L, 45770538L, 45770539L, 45770541L, 
47430080L, 47430099L, 47430113L, 47430127L, 47430164L, 47430166L, 
47430175L, 47430187L, 47430189L, 47430191L, 47430196L, 47430205L, 
47430208L, 47430211L, 47430222L, 47430228L, 23904987L, 23904990L
), col3 = c(45770503L, 45770514L, 45770517L, 45770527L, 45770539L, 
45770540L, 45770542L, 47430081L, 47430100L, 47430114L, 47430128L, 
47430165L, 47430167L, 47430176L, 47430188L, 47430190L, 47430192L, 
47430197L, 47430206L, 47430209L, 47430212L, 47430223L, 47430229L, 
23904988L, 23904991L), col4 = c(5.26315789473684, 3.17460317460317, 
6.56063618290259, 7.3558648111332, 5.81162324649299, 0, 6.85483870967742, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("col1", 
"col2", "col3", "col4"), class = "data.frame", row.names = c(NA, 
-25L))

df2 <- structure(list(V1 = c("chr19", "chr19", "chr19", "chr19", "chr19", 
"chr19", "chr19"), V2 = c(45770509L, 45770921L, 45770981L, 45770995L, 
45771012L, 45771532L, 45774947L), V3 = c(45777447L, 45772712L, 
45772504L, 45772504L, 45772504L, 45775268L, 45777037L), V4 = c("uc061acd.1", 
"uc061ace.1", "uc061acf.1", "uc061acg.1", "uc061ach.1", "uc061aci.1", 
"uc061acj.1"), V5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), V6 = c("-", 
"-", "+", "+", "+", "-", "-"), V7 = c(45770509L, 45771157L, 45770981L, 
45770995L, 45771012L, 45771532L, 45774947L), V8 = c(45777447L, 
45772712L, 45770981L, 45770995L, 45771012L, 45771532L, 45774947L
), V9 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), V10 = c(5L, 4L, 3L, 3L, 
3L, 4L, 2L), V11 = c("131,98,112,86,121,", "475,98,158,72,", 
"98,186,199,", "84,95,199,", "67,86,199,", "133,158,112,320,", 
"87,379,"), V12 = c("0,1058,2131,4439,6817,", "0,646,849,1719,", 
"0,508,1324,", "0,594,1310,", "0,577,1293,", "0,238,1108,3416,", 
"0,1711,")), .Names = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", 
"V8", "V9", "V10", "V11", "V12"), class = "data.frame", row.names = c(NA, 
-7L))

Upvotes: 1

mt1022
mt1022

Reputation: 17289

You can try GenomicRanges from Bioconductor. Here is a not so clean solution:

suppressPackageStartupMessages(library(GenomicRanges))

gr1 <- GRanges(seqnames = dtt1$V1, ranges = IRanges(start = dtt1$V2, end = dtt1$V3), score = dtt1$V4)

gr2 <- GRanges(seqnames = dtt2$V1, ranges = IRanges(start = dtt2$V2, end = dtt2$V3))


x <- findOverlaps(gr1, gr2)

y <- lapply(split(gr1[queryHits(x)], subjectHits(x)), function(g){
    res <- reduce(g, min.gapwidth = max(end(g)) - min(start(g)))
    score(res) <- sum(score(g))
    res
})
as(y, 'GRangesList')

# GRangesList object of length 1:
# $1 
# GRanges object with 1 range and 1 metadata column:
#       seqnames               ranges strand |            score
#          <Rle>            <IRanges>  <Rle> |        <numeric>
#   [1]    chr19 [45770513, 45770542]      * | 29.7575661248094

You can also do this with non-equi join in data.table or command line tool bedtools.

Upvotes: 0

Related Questions