Leo Barlach
Leo Barlach

Reputation: 490

R dist: calculating distance of few to many

Say you have some participants and control in a given experiment that are evaluated in three characteristics, something like this:

part_A <- c(3, 5, 4)
part_B <- c(12, 15, 18)
part_C <- c(50, 40, 45)

ctrl_1 <- c(4, 5, 5)
ctrl_2 <- c(1, 0, 4)
ctrl_3 <- c(13, 16, 17)
ctrl_4 <- c(28, 30, 35)
ctrl_5 <- c(51, 43, 44)

I want to find for each participant which control case is the closest match.

If I used the dist() function, I could get it, but it would take a lot of time also calculating the distances between controls, which is useless to me (and in the real data, there are 1000 times more control cases than participant cases).

Is there a way to ask for the distances between each of these elements to each of those elements? And something that work for very large data sets?

In the example above, the result I want is:

  Participant Closest_Ctrl
1      part_A       ctrl_1
2      part_B       ctrl_3
3      part_C       ctrl_5

Upvotes: 2

Views: 82

Answers (2)

IceCreamToucan
IceCreamToucan

Reputation: 28675

Convert input to data frames

parts <- do.call(data.frame, mget(ls(pattern = "part_[A-C]")))
ctrl <- do.call(data.frame, mget(ls(pattern = "ctrl_[1-5]")))

Generate output

# calculate distances
dists <- outer(parts, ctrl, Vectorize(function(x, y) sqrt(sum((x - y)^2))))

# generate output by calculating column with min value (max negative value)
data.frame(Participant = names(parts), 
           Closest_Ctrl = names(ctrl)[max.col(-dists)])

#   Participant Closest_Ctrl
# 1      part_A       ctrl_1
# 2      part_B       ctrl_3
# 3      part_C       ctrl_5

Benchmark

parts <- do.call(data.frame, mget(ls(pattern = "part_[A-C]")))
ctrl <- do.call(data.frame, mget(ls(pattern = "ctrl_[1-5]")))
parts <- do.call(cbind, replicate(100, parts, simplify = F))
ctrl <- do.call(cbind, replicate(100, ctrl, simplify = F))

r1 <- f1()
r2 <- f2()

all.equal(r1 %>% lapply(as.factor) %>% setNames(1:2), 
          r2[2:1] %>% lapply(as.factor) %>% setNames(1:2))
# [1] TRUE


f1 <- function(x){
  dists <- outer(parts, ctrl, Vectorize(function(x, y) sqrt(sum((x - y)^2))))
  # generate output by calculating column with min value (max negative value)
  data.frame(Participant = names(parts), 
             Closest_Ctrl = names(ctrl)[max.col(-dists)])
}

f2 <- function(x){
  res <- vapply(parts, function(x)  colnames(ctrl)[which.min(sqrt(colSums(x - ctrl)^2))], 
            FUN.VALUE = character(1))

  stack(res)
}

microbenchmark::microbenchmark(f1(), f2(), times = 5)        
# Unit: milliseconds
#  expr        min         lq       mean     median         uq        max neval
#  f1()   305.7324   314.8356   435.3961   324.6116   461.4788   770.3221     5
#  f2() 12359.6995 12831.7995 13567.8296 13616.5216 14244.0836 14787.0438     5

Benchmark 2

parts <- do.call(data.frame, mget(ls(pattern = "part_[A-C]")))
ctrl <- do.call(data.frame, mget(ls(pattern = "ctrl_[1-5]")))
parts <- do.call(cbind, replicate(10, parts, simplify = F))
ctrl <- do.call(cbind, replicate(10*1000, ctrl, simplify = F))

r1 <- f1()
r2 <- f2()

all.equal(r1 %>% lapply(as.factor) %>% setNames(1:2), 
          r2[2:1] %>% lapply(as.factor) %>% setNames(1:2))
# [1] TRUE


f1 <- function(x){
  dists <- outer(parts, ctrl, Vectorize(function(x, y) sqrt(sum((x - y)^2))))
  # generate output by calculating column with min value (max negative value)
  data.frame(Participant = names(parts), 
             Closest_Ctrl = names(ctrl)[max.col(-dists)])
}

f2 <- function(x){
  res <- vapply(parts, function(x)  colnames(ctrl)[which.min(sqrt(colSums(x - ctrl)^2))], 
            FUN.VALUE = character(1))

  stack(res)
}

microbenchmark::microbenchmark(f1(), f2(), times = 5)        
# Unit: seconds
#  expr        min         lq       mean     median         uq        max neval
#  f1()   3.450176   4.211997   4.493805   4.339818   5.154191   5.312844     5
#  f2() 119.120484 124.280423 132.637003 130.858727 131.148630 157.776749     5

Upvotes: 1

Roland
Roland

Reputation: 132706

Here is a solution that should be sufficiently fast for a not-too-big number of participants:

ctrl <- do.call(cbind, mget(ls(pattern = "ctrl_\\d+")))

dat <- mget(ls(pattern = "part_[[:upper:]+]"))

res <- vapply(dat, function(x)  colnames(ctrl)[which.min(sqrt(colSums(x - ctrl)^2))], 
                FUN.VALUE = character(1))

stack(res)
#  values    ind
#1 ctrl_1 part_A
#2 ctrl_3 part_B
#3 ctrl_5 part_C

If this is too slow I would quickly code it in Rcpp.

Upvotes: 3

Related Questions