Reputation: 343
I have two dataframes as follows:
Data1:
chr19 45770502 45770503 5.26315789473684
chr19 45770513 45770514 3.17460317460317
chr19 45770516 45770517 6.56063618290259
chr19 45770526 45770527 7.3558648111332
chr19 45770538 45770539 5.81162324649299
chr19 45770539 45770540 0
chr19 45770541 45770542 6.85483870967742
chr19 47430080 47430081 0
chr19 47430099 47430100 0
chr19 47430113 47430114 0
chr19 47430127 47430128 0
chr19 47430164 47430165 0
chr19 47430166 47430167 0
chr19 47430175 47430176 0
chr19 47430187 47430188 0
chr19 47430189 47430190 0
chr19 47430191 47430192 0
chr19 47430196 47430197 0
chr19 47430205 47430206 0
chr19 47430208 47430209 0
chr19 47430211 47430212 0
chr19 47430222 47430223 0
chr19 47430228 47430229 0
chr7 23904987 23904988 0
chr7 23904990 23904991 0
Data2:
chr19 45770509 45777447 uc061acd.1 0 - 45770509 45777447 0 5 131,98,112,86,121, 0,1058,2131,4439,6817,
chr19 45770921 45772712 uc061ace.1 0 - 45771157 45772712 0 4 475,98,158,72, 0,646,849,1719,
chr19 45770981 45772504 uc061acf.1 0 + 45770981 45770981 0 3 98,186,199, 0,508,1324,
chr19 45770995 45772504 uc061acg.1 0 + 45770995 45770995 0 3 84,95,199, 0,594,1310,
chr19 45771012 45772504 uc061ach.1 0 + 45771012 45771012 0 3 67,86,199, 0,577,1293,
chr19 45771532 45775268 uc061aci.1 0 - 45771532 45771532 0 4 133,158,112,320, 0,238,1108,3416,
chr19 45774947 45777037 uc061acj.1 0 - 45774947 45774947 0 2 87,379, 0,1711,
I want to create an output where overlapping start and end positions from Data1 and Data2 are extracted and the values in column4 from Data1 are added together for the overlapping regions.
Output Example:
chr19 45770513 45770542 35
I want to sum values from column4 of Data1 where the start and end position overlap with Data2.
How can I create output in this format for every possible overlap with every change in chr?
Thanks in advance.
Upvotes: 3
Views: 479
Reputation: 11955
If I understood the question correctly then you can try data.table
approach
library(data.table)
#convert sample data into data table
DT1 <- as.data.table(df1)
DT2 <- as.data.table(df2)
#identify rows of DT1 which fall under DT2's range (see 'pos_range' column)
#In case of NA (i.e. not found) replace it with row_number so that proper summarisation happens at the end
DT1[DT2, pos_range := paste(V2, V3, sep = '-'),
on = .(col2 >= V2, col3 <= V3)][, .(col1, col2, col3, col4, pos_range)]
DT1[, pos_range := ifelse(is.na(pos_range), .I, pos_range)]
#summarise data
DT <- unique(DT1[, c("start_pos", "end_pos", "value_sum") := list(first(col2), last(col3), sum(col4)),
.(col1, pos_range)][, .(col1, start_pos, end_pos, value_sum)])
Output is:
> DT
col1 start_pos end_pos value_sum
1: chr19 45770502 45770503 5.263158
2: chr19 45770513 45770542 29.757566
3: chr19 47430080 47430081 0.000000
4: chr19 47430099 47430100 0.000000
...
Update: In case you want to know overlapping rows only then you need to simply ignore NA
in pos_range
column of DT1
library(data.table)
DT1 <- as.data.table(df1)
DT2 <- as.data.table(df2)
DT <- DT1[DT2, pos_range := paste(V2, V3, sep = '-'),
on = .(col2 >= V2, col3 <= V3)][!is.na(pos_range), .(col1, col2, col3, col4, pos_range)]
DT <- unique(DT[, c("start_pos", "end_pos", "value_sum") := list(first(col2), last(col3), sum(col4)),
.(col1, pos_range)][, .(col1, start_pos, end_pos, value_sum)])
DT
# col1 start_pos end_pos value_sum
#1: chr19 45770513 45770542 29.75757
Sample data:
df1 <- structure(list(col1 = c("chr19", "chr19", "chr19", "chr19", "chr19",
"chr19", "chr19", "chr19", "chr19", "chr19", "chr19", "chr19",
"chr19", "chr19", "chr19", "chr19", "chr19", "chr19", "chr19",
"chr19", "chr19", "chr19", "chr19", "chr7", "chr7"), col2 = c(45770502L,
45770513L, 45770516L, 45770526L, 45770538L, 45770539L, 45770541L,
47430080L, 47430099L, 47430113L, 47430127L, 47430164L, 47430166L,
47430175L, 47430187L, 47430189L, 47430191L, 47430196L, 47430205L,
47430208L, 47430211L, 47430222L, 47430228L, 23904987L, 23904990L
), col3 = c(45770503L, 45770514L, 45770517L, 45770527L, 45770539L,
45770540L, 45770542L, 47430081L, 47430100L, 47430114L, 47430128L,
47430165L, 47430167L, 47430176L, 47430188L, 47430190L, 47430192L,
47430197L, 47430206L, 47430209L, 47430212L, 47430223L, 47430229L,
23904988L, 23904991L), col4 = c(5.26315789473684, 3.17460317460317,
6.56063618290259, 7.3558648111332, 5.81162324649299, 0, 6.85483870967742,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("col1",
"col2", "col3", "col4"), class = "data.frame", row.names = c(NA,
-25L))
df2 <- structure(list(V1 = c("chr19", "chr19", "chr19", "chr19", "chr19",
"chr19", "chr19"), V2 = c(45770509L, 45770921L, 45770981L, 45770995L,
45771012L, 45771532L, 45774947L), V3 = c(45777447L, 45772712L,
45772504L, 45772504L, 45772504L, 45775268L, 45777037L), V4 = c("uc061acd.1",
"uc061ace.1", "uc061acf.1", "uc061acg.1", "uc061ach.1", "uc061aci.1",
"uc061acj.1"), V5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), V6 = c("-",
"-", "+", "+", "+", "-", "-"), V7 = c(45770509L, 45771157L, 45770981L,
45770995L, 45771012L, 45771532L, 45774947L), V8 = c(45777447L,
45772712L, 45770981L, 45770995L, 45771012L, 45771532L, 45774947L
), V9 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), V10 = c(5L, 4L, 3L, 3L,
3L, 4L, 2L), V11 = c("131,98,112,86,121,", "475,98,158,72,",
"98,186,199,", "84,95,199,", "67,86,199,", "133,158,112,320,",
"87,379,"), V12 = c("0,1058,2131,4439,6817,", "0,646,849,1719,",
"0,508,1324,", "0,594,1310,", "0,577,1293,", "0,238,1108,3416,",
"0,1711,")), .Names = c("V1", "V2", "V3", "V4", "V5", "V6", "V7",
"V8", "V9", "V10", "V11", "V12"), class = "data.frame", row.names = c(NA,
-7L))
Upvotes: 1
Reputation: 17289
You can try GenomicRanges
from Bioconductor. Here is a not so clean solution:
suppressPackageStartupMessages(library(GenomicRanges))
gr1 <- GRanges(seqnames = dtt1$V1, ranges = IRanges(start = dtt1$V2, end = dtt1$V3), score = dtt1$V4)
gr2 <- GRanges(seqnames = dtt2$V1, ranges = IRanges(start = dtt2$V2, end = dtt2$V3))
x <- findOverlaps(gr1, gr2)
y <- lapply(split(gr1[queryHits(x)], subjectHits(x)), function(g){
res <- reduce(g, min.gapwidth = max(end(g)) - min(start(g)))
score(res) <- sum(score(g))
res
})
as(y, 'GRangesList')
# GRangesList object of length 1:
# $1
# GRanges object with 1 range and 1 metadata column:
# seqnames ranges strand | score
# <Rle> <IRanges> <Rle> | <numeric>
# [1] chr19 [45770513, 45770542] * | 29.7575661248094
You can also do this with non-equi join in data.table
or command line tool bedtools
.
Upvotes: 0