Reputation: 10735
I'm trying to obtain a vector telling me which rows in a data.frame (transcriptcoords)
chr start end
NONHSAT000001 chr1 11868 14409
NONHSAT000002 chr1 11871 14412
NONHSAT000003 chr1 11873 14409
NONHSAT000004 chr1 12009 13670
NONHSAT000005 chr1 14777 16668
NONHSAT000006 chr1 15602 29370
have start/end coordinates loosely contained (with a +/- 10 tolerance) in another data.frame (genecoords)
chr start end
NONHSAG000001 chr1 11869 14412
NONHSAG000002 chr1 14778 29370
NONHSAG000003 chr1 29554 31109
NONHSAG000004 chr1 34554 36081
NONHSAG000005 chr1 36273 50281
NONHSAG000006 chr1 62948 63887
To do so, I am looping with sapply over the row indeces of the first data.frame, matching the coordinates with any row in the second data.frame. I have a solution (described below), but it seems to be rather slow (around six seconds with a slice of 2000 rows):
user system elapsed
6.02 0.00 6.04
I'm trying to understand which parts of the sapply could be optimized. Is it the if/else block? Or the comparison lines (==, <=, >=)? Or more simply, is it an intrinsically slow algorithm?
Thank you! The code I generated is below:
load(url("http://www.giorgilab.org/stuff/data.rda"))
# Pre-vectorize the data frames
g0<-rownames(genecoords)
g1<-genecoords[,1]
g2<-as.integer(genecoords[,2])
g3<-as.integer(genecoords[,3])
t0<-rownames(transcriptcoords)
t1<-transcriptcoords[,1]
t2<-as.integer(transcriptcoords[,2])
t3<-as.integer(transcriptcoords[,3])
system.time(gs<-sapply(1:2000,function(i){
t<-t0[i]
chr<-t1[i]
start<-t2[i]
end<-t3[i]
# Find a match (loose boundaries +/- 10)
right1<-which(g1==chr)
right2<-which(g2<=start+10)
right3<-which(g3>=end-10)
right<-intersect(right3,intersect(right1,right2))
right<-g0[right]
if(length(right)==1){
g<-right
} else if(length(right)>1){
# Get the smallest match
heregenecoords<-genecoords[right,]
size<-apply(heregenecoords,1,function(x){abs(as.numeric(x[3])-as.numeric(x[2]))})
g<-names(which.min(size))
} else {
g<-t
}
return(g)
}
))
Upvotes: 4
Views: 546
Reputation: 46866
With your data
tx0 <- read.table(textConnection("chr start end
NONHSAT000001 chr1 11868 14409
NONHSAT000002 chr1 11871 14412
NONHSAT000003 chr1 11873 14409
NONHSAT000004 chr1 12009 13670
NONHSAT000005 chr1 14777 16668
NONHSAT000006 chr1 15602 29370"))
gene0 <- read.table(textConnection("chr start end
NONHSAG000001 chr1 11869 14412
NONHSAG000002 chr1 14778 29370
NONHSAG000003 chr1 29554 31109
NONHSAG000004 chr1 34554 36081
NONHSAG000005 chr1 36273 50281
NONHSAG000006 chr1 62948 63887"))
The GenomicRanges package in Bioconductor does this easily and efficiently (for millions of overlaps).
library(GenomicRanges)
tx <- with(tx0, GRanges(chr, IRanges(start, end)))
gene <- with(gene0, GRanges(chr, IRanges(start, end)))
## increase width by 10 on both sides of the center of the gene range
gene <- resize(gene, width=width(gene) + 20, fix="center")
## find overlaps of 'query' tx and 'subject' gene, where query is within subject
olaps <- findOverlaps(tx, gene, type="within")
Showing, e.g., that 'query' (tx) 1, 2, 3, and 4 are within 'subject' (gene) 1.
> findOverlaps(tx, gene, type="within")
Hits of length 6
queryLength: 6
subjectLength: 6
queryHits subjectHits
<integer> <integer>
1 1 1
2 2 1
3 3 1
4 4 1
5 5 2
6 6 2
and that gene 1 is overlapped by 4 transcripts, gene 2 by 2 transcripts.
> table(subjectHits(olaps))
1 2
4 2
See also this publication. Using the larger data set:
tx <- with(transcriptcoords, GRanges(V1, IRanges(V2, V3, names=rownames(tx0))))
gene <- with(genecoords, GRanges(V1, IRanges(V2, V3, names=rownames(gene0))))
with some timings:
system.time(gene <- resize(gene, width=width(gene) + 20, fix="center"))
## user system elapsed
## 0.056 0.000 0.057
system.time(findOverlaps(tx, gene, type="within"))
## user system elapsed
## 2.248 0.000 2.250
I think this is approximately the time for the data.table solution from @danas.zuokos with just 1000 transcripts
system.time({
dt <- genecoords[transcriptcoords, allow.cartesian = TRUE];
res <- dt[start <= start.1 + tol & end >= end.1 - tol,
list(gene = gene[which.min(size)]), by = transcript]
})
## user system elapsed
## 2.148 0.244 2.400
Upvotes: 2
Reputation: 4643
I'm using data.table
library.
rm(list = ls())
load(url("http://www.giorgilab.org/stuff/data.rda"))
library(data.table)
tol <- 10 # tolerance
id <- 1:2000 # you can comment this out, but make sure you have big RAM
Convert to data.table
format. Additionaly compute size (I'm not sure why you take abs
, isn't end always bigger than start?).
genecoords <- data.table(genecoords, keep.rownames = TRUE)
setnames(genecoords, c("gene", "chr", "start", "end"))
genecoords[, size := end - start]
transcriptcoords <- data.table(transcriptcoords, keep.rownames = TRUE)[id] # comment out [id] when running on all trascripts
setnames(transcriptcoords, c("transcript", "chr", "start", "end"))
And this gives the result.
setkeyv(genecoords, "chr")
setkeyv(transcriptcoords, "chr")
dt <- genecoords[transcriptcoords, allow.cartesian = TRUE]
res <- dt[start <= start.1 + tol & end >= end.1 - tol, list(gene = gene[which.min(size)]), by = transcript]
Aware that this will not include transcripts, which do not meet the condition (g<-t
in your code).
Upvotes: 1
Reputation: 12468
Ha! Martin beat me to it with a better answer. It's practically always better to use working tested code in a well established library rather than roll your own. Definitely use Martin's solution, not this one.
But, just for laughs, here's another way to do it.
First, make up some genes and transcripts:
gs = 1:10*500
genes = data.frame(start=gs, end=gs+400)
rownames(genes) = sprintf('g%05d', 1:nrow(genes))
ts = sample(1:max(genes$end), size=10)
transcripts = data.frame(start=ts, end=ts+60)
rownames(transcripts) = sprintf('t%05d', 1:nrow(transcripts))
We can vectorize the comparison using outer which applies a function to each combination of its two vector arguments.
overlaps = function(genes, transcripts, min_overlap=1) {
b1 = outer(genes$end, transcripts$start, min_overlap=min_overlap,
function(e,s,min_overlap) e-s+1>min_overlap)
b2 = outer(genes$start, transcripts$end, min_overlap=min_overlap,
function(s,e,min_overlap) e-s+1>min_overlap)
result = b1 & b2
rownames(result) = rownames(genes)
colnames(result) = rownames(transcripts)
return(result)
}
For our genes and transcripts, we might get something like:
> genes
start end
g00001 500 900
g00002 1000 1400
g00003 1500 1900
g00004 2000 2400
g00005 2500 2900
g00006 3000 3400
g00007 3500 3900
g00008 4000 4400
g00009 4500 4900
g00010 5000 5400
> transcripts
start end
t00001 535 595
t00002 2502 2562
t00003 4757 4817
t00004 3570 3630
t00005 3094 3154
t00006 1645 1705
t00007 5202 5262
t00008 13 73
t00009 788 848
t00010 4047 4107
o1 = overlaps(genes, transcripts, 1)
The result is a boolean matrix that tells you whether each transcript overlaps with each gene.
> o1
t00001 t00002 t00003 t00004 t00005 t00006 t00007 t00008 t00009 t00010
g00001 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
g00002 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
g00003 FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
g00004 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
g00005 FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
g00006 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
g00007 FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
g00008 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
g00009 FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
g00010 FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
Upvotes: 1