Reputation: 21
I have two dataframes, x and y, both of which represent items and their respective locations (which are represented as integers). Dataframe x responds to Genes and their locations; dataframe y responds to Enhancers and their locations. For every Gene in dataframe x, I want to find the Enhancer in y whose location is closest. Here are the first five rows of both dataframes:
Gene: | Location:
----------------------------------
CORT | 10450031
LOC107985174 | 110639954
LOC105369199 | 120963648
CD1D | 158178030
EPHA2 | 16124337
Enhancer: | Location:
-----------------------------------------------------
genic|NC_000001.11:180541-181713 | 180541
genic|NC_000001.11:819650-823755 | 819650
genic|NC_000001.11:1290023-1294341 | 1290023
genic|NC_000001.11:2072541-2076498 | 2072541
genic|NC_000001.11:2132164-2134268 | 2132164
I have been using which.min()
like so: Enhancers[which.min(abs(x-Enhancers$location)),]
where x
corresponds to the given gene's location, which does appear to work, but it requires manually inputing the location of each individual gene. I am wondering if there is a way to accomplish this for all the genes at once. Any help would be appreciated, thank you.
Upvotes: 2
Views: 245
Reputation: 8863
This returns a vector of the matches:
l1=1e3;l2=2e3
x=data.frame(Gene=paste0(letters,1:l1),Location=round(runif(l1)*1e8))
y=data.frame(Enhancer=paste0(letters,1:l2),Location_enh=round(runif(l2)*1e8))
y[,1][sapply(x[,2],\(v)which.min(abs(y[,2]-v)))]
This adds the matches as a new column:
cbind(x,min=y[,1][sapply(x[,2],\(v)which.min(abs(y[,2]-v)))])
Benchmark (edit: added Biobase::matchpt
and an Rcpp option):
Rcpp::cppFunction("NumericVector xclosest(NumericVector x,NumericVector y){
int l1=x.length(),l2=y.length();
NumericVector out(l1);
for(int i=0;i<l1;i++){
double min=abs(x[i]-y[0]);
int minind=0;
for(int j=1;j<l2;j++)if(abs(x[i]-y[j])<min){min=abs(x[i]-y[j]);minind=j;}
out[i]=minind;
}
return out;
}")
l1=1e3;l2=2e3
x=data.frame(Gene=paste0(letters,1:l1),Location=round(runif(l1)*1e8))
y=data.frame(Enhancer=paste0(letters,1:l2),Location_enh=round(runif(l2)*1e8))
microbenchmark(times=10,
sapply=sapply(x[,2],\(v)which.min(abs(v-y[,2]))),
outer_with_apply=apply(abs(outer(x[,2],y[,2],`-`)),2,which.min),
outer_with_rowMins=Rfast::rowMins(abs(outer(x[,2],y[,2],`-`))),
tidyverse=expand_grid(x,y)%>%group_by(Gene)%>%slice_min(abs(Location-Location_enh))%>%ungroup(),
distance_left_join=fuzzyjoin::distance_left_join(x,y,by=c("Location"="Location_enh"),max_dist=Inf),
matchpt=matchpt(x$Location,y$Location_enh),
xclosest_cpp=xclosest(x$Location,y$Location_enh)
)
Unit: milliseconds
expr min lq mean median uq max neval
sapply 15.208904 15.293551 17.201444 15.981592 17.890364 22.120634 10
outer_with_apply 34.881275 57.996456 55.564075 59.004213 59.365812 66.721156 10
outer_with_rowMins 18.468577 18.702030 40.420237 29.428359 35.666042 162.829603 10
tidyverse 555.895686 572.182184 860.772970 739.746102 1111.044602 1525.000602 10
distance_left_join 11138.266735 12269.768424 13375.626662 13433.652742 14787.006978 15079.786089 10
matchpt 5.088065 5.103749 5.211020 5.229232 5.287886 5.399156 10
xclosest_cpp 2.199385 2.204919 2.341288 2.209788 2.220550 3.517052 10
Upvotes: 1
Reputation: 46856
Looks like genetic data. If you're already using Bioconductor packages, then Biobase::matchpt()
will do the trick quickly and efficiently (and in multiple dimensions).
Using the simple example from @seb09, we have
set.seed(2105)
x <- data.frame(Gene = letters[1:5], Location = 1:5)
y <- data.frame(Enhancer = letters[6:10], Location_enh = 5*runif(5))
index <- matchpt(x$Location, y$Location_enh)$index
index
## [1] 5 2 2 4 3
and (a little extra work to clean up some funky row names from cbind()
)
result <- cbind(x, y[index,,drop = FALSE])
rownames(result) <- NULL
result
## Gene Location Enhancer Location_enh
## 1 a 1 j 0.9555975
## 2 b 2 g 2.2874741
## 3 c 3 g 2.2874741
## 4 d 4 i 4.2017862
## 5 e 5 h 4.2954764
Using @nisetama's benchmark we have
l1 = 1e3
l2 = 2e3
x = data.frame(
gene = paste0(letters,1:l1),
Location = round(runif(l1)*1e8)
)
y = data.frame(
enhancer = paste0(letters,1:l2),
Location_enh = round(runif(l2)*1e8)
)
identical(
sapply(x[,2],\(v)which.min(abs(v-y[,2]))),
matchpt(x$Location, y$Location_enh)$index
)
## [1] TRUE
microbenchmark(
sapply = sapply(x[,2],\(v)which.min(abs(v-y[,2]))),
matchpt = matchpt(x$Location, y$Location_enh)
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## sapply 32.21390 33.19321 35.21607 33.35805 39.23552 41.05420 100
## matchpt 10.91162 10.93015 11.00443 11.01078 11.06492 11.19677 100
Upvotes: 3
Reputation: 31
You could expand a grid with both data sets, group by Gene and select the row with the minimum absolute difference in location for each Gene.
library(tidyr)
library(dplyr)
#>
#> Attache Paket: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
set.seed(2105)
x <- data.frame(Gene = letters[1:5], Location = 1:5)
y <- data.frame(Enhancer = letters[6:10], Location_enh = 5*runif(5))
x
#> Gene Location
#> 1 a 1
#> 2 b 2
#> 3 c 3
#> 4 d 4
#> 5 e 5
y
#> Enhancer Location_enh
#> 1 f 1.2275958
#> 2 g 2.2874741
#> 3 h 4.2954764
#> 4 i 4.2017862
#> 5 j 0.9555975
expand_grid(x, y) %>%
group_by(Gene) %>%
slice_min(abs(Location - Location_enh)) %>%
ungroup()
#> # A tibble: 5 x 4
#> Gene Location Enhancer Location_enh
#> <chr> <int> <chr> <dbl>
#> 1 a 1 j 0.956
#> 2 b 2 g 2.29
#> 3 c 3 g 2.29
#> 4 d 4 i 4.20
#> 5 e 5 h 4.30
Created on 2022-08-15 by the reprex package (v2.0.1)
Upvotes: 3
Reputation: 19870
You can use distance_join
set of functions from fuzzyjoin
R package to join 2 tables by distance. You can provide max_dist
argument to show NA if there is no data within the specified range.
For this demo I've used the test data from @seb09's answer.
library(tidyverse)
library(fuzzyjoin)
set.seed(2105)
x <- data.frame(Gene = letters[1:5], Location = 1:5)
y <- data.frame(Enhancer = letters[6:10], Location_enh = 5*runif(5))
distance_left_join(x, y, by = c("Location" = "Location_enh"), max_dist = 1)
#> Gene Location Enhancer Location_enh
#> 1 a 1 f 1.2275958
#> 2 a 1 j 0.9555975
#> 3 b 2 f 1.2275958
#> 4 b 2 g 2.2874741
#> 5 c 3 g 2.2874741
#> 6 d 4 h 4.2954764
#> 7 d 4 i 4.2017862
#> 8 e 5 h 4.2954764
#> 9 e 5 i 4.2017862
Created on 2022-08-15 by the reprex package (v2.0.0)
If you need to select a single enhancer per gene with closest distance:
distance_left_join(x, y, by = c("Location" = "Location_enh"), max_dist = 1) %>%
group_by(Gene) %>%
slice_min(abs(Location - Location_enh))
Upvotes: 1
Reputation: 160417
This data doesn't show enough similarity to be interesting, but I think this method will work for what you need:
apply(abs(outer(df1$Location, df2$Location, `-`)), 1, which.min)
# [1] 5 5 5 5 5
This is saying that df1$Location
s are all closest to the 5th value of df2$Location
.
Data
df1 <- structure(list(Gene = c("CORT", "LOC107985174", "LOC105369199", "CD1D", "EPHA2"), Location = c(10450031, 110639954, 120963648, 158178030, 16124337)), row.names = c(NA, -5L), class = "data.frame")
df2 <- structure(list(Enhancer = c("NC_000001.11:180541-181713", "NC_000001.11:819650-823755", "NC_000001.11:1290023-1294341", "NC_000001.11:2072541-2076498", "NC_000001.11:2132164-2134268"), Location = c(180541, 819650, 1290023, 2072541, 2132164)), row.names = c(NA, -5L), class = "data.frame")
Upvotes: 1