Reputation: 6578

ddply using "group_by" logic

I'm trying to use ddply to find the smallest distance between two positions pos where the corresponding chrom is the same in two dataframes:

head(bps, 10)
   chrom pos iteration
1      1   4         1
2      1  14         1
3      1  68         1
4      1  79         1
5      1 200         1
6      1 205         1
7      1 270         1
8      1 304         1
9      2   7         1
10     2  13         1

head(flocs)
  chrom pos
1     1 100
2     1 200
3     1 220
4     1 312
5     2 500
6     2 501

As an example, for the first line in bps, I want to find the closest pos in flocs where chrom = 1, which gives a value of -96.

The pseudocode for what I'm trying to do is:

foreach iteration (bps$iteration):
  foreach chrom (bps$chrom):
    foreach pos (bps$pos):
      features_pos = pos in dataframe flocs closest to pos on the same chromosome
      min_dist     = feature_pos - pos

      return features_pos, min_dist

I am trying to do this with ddply:

minDists <- ddply(bp_data, c("chrom", "pos"), function(x) {
    index <-  which.min(abs(flocs$pos[which(flocs$chrom==x$chrom)] - x$pos))
    closestMotif <- flocs$pos[index]
    chrom <- as.character(flocs$chrom[index])
    dist <- (x$pos - closestMotif)
    data.frame(features_pos = closestMotif, pos = x$pos, min_dist = dist, feature = feature)
  })

But this doesn't constrain comparisons to the same chromosome:

head(minDists, 10)

   chrom features_pos pos min_dist  feature
1      1          100   4      -96 feature1
2      1          100  14      -86 feature1
3      1          100  68      -32 feature1
4      1          100  79      -21 feature1
5      1          200 200        0 feature1
6      1          200 205        5 feature1
7      1          312 270      -42 feature1
8      1          312 304       -8 feature1
9      2          100   7      -93 feature1 # bps chrom=2, flocs chrom=1
10     2          100  13      -87 feature1 # bps chrom=2, flocs chrom=1

The expected output here is:

   chrom features_pos pos min_dist  feature
1      1          100   4      -96 feature1
2      1          100  14      -86 feature1
3      1          100  68      -32 feature1
4      1          100  79      -21 feature1
5      1          200 200        0 feature1
6      1          200 205        5 feature1
7      1          312 270      -42 feature1
8      1          312 304       -8 feature1
9      2          500   7      -493 feature1 # bp1 chrom=2, flocs chrom=2
10     2          500  13      -487 feature1 # bp1 chrom=2, flocs chrom=2

I thought that by providing the columns c("chrom", "pos") essentially performed a group_by to the function call.

Is there any way that I can improve what I've written to achieve the desired result?

bps <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"), pos = c(4L, 14L, 68L, 79L, 200L, 205L, 
270L, 304L, 7L, 13L, 23L, 39L, 100L, 150L, 17L, 55L, 75L, 79L, 
102L, 109L, 123L, 155L, 157L, 200L, 260L, 299L, 300L, 320L, 323L, 
345L, 450L, 550L), iteration = structure(c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor")), row.names = c(NA, 
-32L), class = "data.frame")

flocs <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 
3L), .Label = c("1", "2", "3"), class = "factor"), pos = c(100L, 
200L, 220L, 312L, 500L, 501L, 123L, 444L)), row.names = c(NA, 
-8L), class = "data.frame")

Upvotes: 1

Answers (3)

Wimpel

Reputation: 27792

data.table approach using a rolling join...

updated answer

(initially forgot all about the by-reference joining, which is faster and most certainly shorter ;-) )

library( data.table )
#set data as data.table
setDT( bps, key = c("chrom", "pos") )
setDT( flocs, key = c("chrom", "pos") )
#perform by-reference rolling join
bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]

output

#     chrom pos iteration mindist
#  1:     1   4         1     -96
#  2:     1  14         1     -86
#  3:     1  68         1     -32
#  4:     1  79         1     -21
#  5:     1 200         1       0
#  6:     1 205         1       5
#  7:     1 270         1     -42
#  8:     1 304         1      -8
#  9:     2   7         1    -493
# 10:     2  13         1    -487
# 11:     2  23         1    -477
# 12:     2  39         1    -461
# 13:     2 100         1    -400
# 14:     2 150         1    -350
# 15:     3  17         1    -106
# 16:     3  55         1     -68
# 17:     3  75         1     -48
# 18:     3  79         1     -44
# 19:     3 102         1     -21
# 20:     3 109         1     -14
# 21:     3 123         1       0
# 22:     3 155         1      32
# 23:     3 157         1      34
# 24:     3 200         1      77
# 25:     3 260         1     137
# 26:     3 299         1    -145
# 27:     3 300         1    -144
# 28:     3 320         1    -124
# 29:     3 323         1    -121
# 30:     3 345         1     -99
# 31:     3 450         1       6
# 32:     3 550         1     106
#     chrom pos iteration mindist

Benchmarking answer until now

# Unit: milliseconds
#              expr       min        lq      mean    median        uq       max neval
#        Ronak_base  2.355879  2.555768  2.973069  2.626415  2.773581  8.016016   100
# Wimpel_data.table  1.697921  2.035788  2.416199  2.209616  2.361001 17.724528   100
#   Pawel_tidyverse 14.845354 15.310505 16.333158 15.814819 16.541618 24.077871   100


microbenchmark::microbenchmark(
  Ronak_base = {
    bps$min_dist <- unlist(mapply(return_min_value, unique(bps$chrom), split(bps$pos, bps$chrom)))
  },
  Wimpel_data.table = {
    setDT( bps, key = c("chrom", "pos") )
    setDT( flocs, key = c("chrom", "pos") )
    #perform by-reference rolling join
    bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]
  },
  Pawel_tidyverse = {
    bps %>%
      select(-iteration) %>%
      unite('bps') %>%
      crossing(flocs %>% unite('flocks')) %>%
      separate(bps, c('chrom_bps', 'pos')) %>%
      separate(flocks, c('chrom_flocks', 'features_pos')) %>%
      filter(chrom_bps == chrom_flocks) %>%
      select(-chrom_flocks) %>%
      rename_at(1, ~'chrom') %>%
      mutate_all(as.numeric) %>%
      mutate(min_dist = pos - features_pos) %>%
      group_by(chrom, pos) %>%
      filter(abs(min_dist) == min(abs(min_dist)))
  }
)

Looks like my data-table answer and the answer by Ronak Shah are pretty close together. I believe that data.table will gain the clear advantage when the data-sets are getting lager-huge (but I haven't tested)..

Upvotes: 4

Paweł Chabros

Reputation: 2399

Check this solution:

library(tidyverse)

bps %>%
  select(-iteration) %>%
  unite('bps') %>%
  crossing(flocs %>% unite('flocks')) %>%
  separate(bps, c('chrom_bps', 'pos')) %>%
  separate(flocks, c('chrom_flocks', 'features_pos')) %>%
  filter(chrom_bps == chrom_flocks) %>%
  select(-chrom_flocks) %>%
  rename_at(1, ~'chrom') %>%
  mutate_all(as.numeric) %>%
  mutate(min_dist = pos - features_pos) %>%
  group_by(chrom, pos) %>%
  filter(abs(min_dist) == min(abs(min_dist)))

Output:

  chrom   pos features_pos min_dist
   <dbl> <dbl>        <dbl>    <dbl>
 1     1     4          100      -96
 2     1    14          100      -86
 3     1    68          100      -32
 4     1    79          100      -21
 5     1   200          200        0
 6     1   205          200        5
 7     1   270          312      -42
 8     1   304          312       -8
 9     2     7          500     -493
10     2    13          500     -487
# ... with 22 more rows

Upvotes: 0

Ronak Shah

Reputation: 389325

My base R attempt by creating a helper function (return_min_value). This function subset flocs based on current chrom and then returns the minimum value after subtracting it from pos. We split the pos column based on chrom and pass these values along with unique chrom values in mapply to return_min_value function.

return_min_value <- function(x, y) {
   sapply(y, function(p) {
     vals = p - flocs$pos[flocs$chrom == x]
     vals[which.min(abs(vals))]
  })
}

bps$min_dist <- unlist(mapply(return_min_value,
                       unique(bps$chrom), split(bps$pos, bps$chrom)))


bps
#   chrom pos iteration min_dist
#1      1   4         1      -96
#2      1  14         1      -86
#3      1  68         1      -32
#4      1  79         1      -21
#5      1 200         1        0
#6      1 205         1        5
#7      1 270         1      -42
#8      1 304         1       -8
#9      2   7         1     -493
#10     2  13         1     -487
#...

Upvotes: 2

ddply using &quot;group_by&quot; logic

Answers (3)

Benchmarking answer until now

Related Questions

ddply using "group_by" logic