Reputation: 6578
I'm trying to use ddply to find the smallest distance between two positions pos
where the corresponding chrom is the same in two dataframes:
head(bps, 10)
chrom pos iteration
1 1 4 1
2 1 14 1
3 1 68 1
4 1 79 1
5 1 200 1
6 1 205 1
7 1 270 1
8 1 304 1
9 2 7 1
10 2 13 1
head(flocs)
chrom pos
1 1 100
2 1 200
3 1 220
4 1 312
5 2 500
6 2 501
As an example, for the first line in bps
, I want to find the closest pos
in flocs
where chrom
= 1
, which gives a value of -96.
The pseudocode for what I'm trying to do is:
foreach iteration (bps$iteration):
foreach chrom (bps$chrom):
foreach pos (bps$pos):
features_pos = pos in dataframe flocs closest to pos on the same chromosome
min_dist = feature_pos - pos
return features_pos, min_dist
I am trying to do this with ddply:
minDists <- ddply(bp_data, c("chrom", "pos"), function(x) {
index <- which.min(abs(flocs$pos[which(flocs$chrom==x$chrom)] - x$pos))
closestMotif <- flocs$pos[index]
chrom <- as.character(flocs$chrom[index])
dist <- (x$pos - closestMotif)
data.frame(features_pos = closestMotif, pos = x$pos, min_dist = dist, feature = feature)
})
But this doesn't constrain comparisons to the same chromosome:
head(minDists, 10)
chrom features_pos pos min_dist feature
1 1 100 4 -96 feature1
2 1 100 14 -86 feature1
3 1 100 68 -32 feature1
4 1 100 79 -21 feature1
5 1 200 200 0 feature1
6 1 200 205 5 feature1
7 1 312 270 -42 feature1
8 1 312 304 -8 feature1
9 2 100 7 -93 feature1 # bps chrom=2, flocs chrom=1
10 2 100 13 -87 feature1 # bps chrom=2, flocs chrom=1
The expected output here is:
chrom features_pos pos min_dist feature
1 1 100 4 -96 feature1
2 1 100 14 -86 feature1
3 1 100 68 -32 feature1
4 1 100 79 -21 feature1
5 1 200 200 0 feature1
6 1 200 205 5 feature1
7 1 312 270 -42 feature1
8 1 312 304 -8 feature1
9 2 500 7 -493 feature1 # bp1 chrom=2, flocs chrom=2
10 2 500 13 -487 feature1 # bp1 chrom=2, flocs chrom=2
I thought that by providing the columns c("chrom", "pos")
essentially performed a group_by
to the function call.
Is there any way that I can improve what I've written to achieve the desired result?
bps <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"), pos = c(4L, 14L, 68L, 79L, 200L, 205L,
270L, 304L, 7L, 13L, 23L, 39L, 100L, 150L, 17L, 55L, 75L, 79L,
102L, 109L, 123L, 155L, 157L, 200L, 260L, 299L, 300L, 320L, 323L,
345L, 450L, 550L), iteration = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor")), row.names = c(NA,
-32L), class = "data.frame")
flocs <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), pos = c(100L,
200L, 220L, 312L, 500L, 501L, 123L, 444L)), row.names = c(NA,
-8L), class = "data.frame")
Upvotes: 1
Views: 88
Reputation: 27792
data.table
approach using a rolling join...
updated answer
(initially forgot all about the by-reference joining, which is faster and most certainly shorter ;-) )
library( data.table )
#set data as data.table
setDT( bps, key = c("chrom", "pos") )
setDT( flocs, key = c("chrom", "pos") )
#perform by-reference rolling join
bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]
output
# chrom pos iteration mindist
# 1: 1 4 1 -96
# 2: 1 14 1 -86
# 3: 1 68 1 -32
# 4: 1 79 1 -21
# 5: 1 200 1 0
# 6: 1 205 1 5
# 7: 1 270 1 -42
# 8: 1 304 1 -8
# 9: 2 7 1 -493
# 10: 2 13 1 -487
# 11: 2 23 1 -477
# 12: 2 39 1 -461
# 13: 2 100 1 -400
# 14: 2 150 1 -350
# 15: 3 17 1 -106
# 16: 3 55 1 -68
# 17: 3 75 1 -48
# 18: 3 79 1 -44
# 19: 3 102 1 -21
# 20: 3 109 1 -14
# 21: 3 123 1 0
# 22: 3 155 1 32
# 23: 3 157 1 34
# 24: 3 200 1 77
# 25: 3 260 1 137
# 26: 3 299 1 -145
# 27: 3 300 1 -144
# 28: 3 320 1 -124
# 29: 3 323 1 -121
# 30: 3 345 1 -99
# 31: 3 450 1 6
# 32: 3 550 1 106
# chrom pos iteration mindist
# Unit: milliseconds
# expr min lq mean median uq max neval
# Ronak_base 2.355879 2.555768 2.973069 2.626415 2.773581 8.016016 100
# Wimpel_data.table 1.697921 2.035788 2.416199 2.209616 2.361001 17.724528 100
# Pawel_tidyverse 14.845354 15.310505 16.333158 15.814819 16.541618 24.077871 100
microbenchmark::microbenchmark(
Ronak_base = {
bps$min_dist <- unlist(mapply(return_min_value, unique(bps$chrom), split(bps$pos, bps$chrom)))
},
Wimpel_data.table = {
setDT( bps, key = c("chrom", "pos") )
setDT( flocs, key = c("chrom", "pos") )
#perform by-reference rolling join
bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]
},
Pawel_tidyverse = {
bps %>%
select(-iteration) %>%
unite('bps') %>%
crossing(flocs %>% unite('flocks')) %>%
separate(bps, c('chrom_bps', 'pos')) %>%
separate(flocks, c('chrom_flocks', 'features_pos')) %>%
filter(chrom_bps == chrom_flocks) %>%
select(-chrom_flocks) %>%
rename_at(1, ~'chrom') %>%
mutate_all(as.numeric) %>%
mutate(min_dist = pos - features_pos) %>%
group_by(chrom, pos) %>%
filter(abs(min_dist) == min(abs(min_dist)))
}
)
Looks like my data-table answer and the answer by Ronak Shah are pretty close together. I believe that data.table will gain the clear advantage when the data-sets are getting lager-huge (but I haven't tested)..
Upvotes: 4
Reputation: 2399
Check this solution:
library(tidyverse)
bps %>%
select(-iteration) %>%
unite('bps') %>%
crossing(flocs %>% unite('flocks')) %>%
separate(bps, c('chrom_bps', 'pos')) %>%
separate(flocks, c('chrom_flocks', 'features_pos')) %>%
filter(chrom_bps == chrom_flocks) %>%
select(-chrom_flocks) %>%
rename_at(1, ~'chrom') %>%
mutate_all(as.numeric) %>%
mutate(min_dist = pos - features_pos) %>%
group_by(chrom, pos) %>%
filter(abs(min_dist) == min(abs(min_dist)))
Output:
chrom pos features_pos min_dist
<dbl> <dbl> <dbl> <dbl>
1 1 4 100 -96
2 1 14 100 -86
3 1 68 100 -32
4 1 79 100 -21
5 1 200 200 0
6 1 205 200 5
7 1 270 312 -42
8 1 304 312 -8
9 2 7 500 -493
10 2 13 500 -487
# ... with 22 more rows
Upvotes: 0
Reputation: 389325
My base R attempt by creating a helper function (return_min_value
). This function subset flocs
based on current chrom
and then returns the minimum value after subtracting it from pos
. We split
the pos
column based on chrom
and pass these values along with unique
chrom
values in mapply
to return_min_value
function.
return_min_value <- function(x, y) {
sapply(y, function(p) {
vals = p - flocs$pos[flocs$chrom == x]
vals[which.min(abs(vals))]
})
}
bps$min_dist <- unlist(mapply(return_min_value,
unique(bps$chrom), split(bps$pos, bps$chrom)))
bps
# chrom pos iteration min_dist
#1 1 4 1 -96
#2 1 14 1 -86
#3 1 68 1 -32
#4 1 79 1 -21
#5 1 200 1 0
#6 1 205 1 5
#7 1 270 1 -42
#8 1 304 1 -8
#9 2 7 1 -493
#10 2 13 1 -487
#...
Upvotes: 2