Calculate n-dimensional euclidean distance from group centroids for each sample and select the lowest 3 for each group in R

This is two-part question and is pretty complex.

First. I want to calculate the 'n'-dimensional euclidean distance between each individual sample in dataframe "ind_scores" and it's respective group centroid in dataframe "centroids".

Then, I want to select the 3 individual samples are the closest to their respective group centroids. I want to save these to a new dataframe with the info for 'individual sample name', 'group' and 'distance to centroid'.

Here is an example of the data:

ind_scores <- data.frame(row.names = c("OP2413iiiaMOU","OP2413iiibMOU","OP2413iiicMOU","OP2645ii_aPOR","OP2645ii_bPOR","OP2645ii_cPOR","OP2645ii_dPOR","OP2645ii_ePOR","OP3088i__aPOR","OP5043___aWAT","OP5043___bWAT","OP5044___aMOU","OP5044___bMOU","OP5044___cMOU","OP5046___aWAT","OP5046___bWAT","OP5046___cWAT","OP5046___dWAT","OP5046___eWAT","OP5047___aPHA","OP5047___bPHA","OP5048___bPHA","OP5048___cPHA","OP5048___dPHA","OP5048___ePHA","OP5048___fPHA","OP5048___gPHA","OP5048___hPHA","OP5049___aWAT","OP5049___bWAT","OP5051DNAaCOM","OP5051DNAbCOM","OP5051DNAcCOM","OP5052DNAaWAT","OP5053DNAaPHA","OP5053DNAbPHA","OP5053DNAcPHA","OP5054DNAaMOU","OP5054DNAbMOU","OP5054DNAcMOU"),
                        group = c("4","4","4","1","1","1","1","1","3","3","3","5","5","5","2","5","2","2","5","3","3","3","3","1","3","3","3","3","2","2","4","5","4","2","3","3","3","5","5","5"),
                        CV.1 = c(-13.3864612433581,-12.8079930877268,-12.8078461023615,11.609290941109,10.5489837203281,10.8802079446603,11.7559827821839,10.769027609963,2.93788199576291,5.14343682437333,1.1768471575429,-3.59878541566711,-3.69656648297924,-3.13205394000296,-1.88190759998412,-3.50181277277038,0.563858206656491,-1.38629942623866,-3.73771209413208,3.40039117982473,2.86962877144321,3.80869463338469,4.15722705333298,9.08529455175736,3.15497802125988,2.42193314853044,0.600699372070624,4.14515087614032,-3.3599436881205,-1.8893406509868,-13.355031250023,-4.10118631444206,-11.4911993949333,-1.55841778422586,2.91834267480086,1.58762181687645,3.08125993208779,-3.84248479288043,-3.60800082570682,-3.47369634755007),
                        CV.2 = c(-5.98931418061097,-6.48685652483353,-6.48781938591041,-5.4121748521578,-4.56051914391762,-5.14772881585026,-4.3883054106957,-3.06298578319138,0.25688954313487,1.01459325674394,1.47381593062751,5.11285501685872,6.32219277017476,4.93757903863915,-1.98974199849122,6.8029453586845,-4.47482073821288,-2.89353901685366,6.19654462202962,1.44791941276988,2.01950206487354,3.29347544821835,1.70411388918498,-3.36842394773708,0.843537649290457,1.53904192617335,-0.0653393231022099,2.43481086719558,-2.28081054006986,-1.12101221091068,-5.74678650527647,2.81164429296665,-4.7739502651084,-0.836323550526183,1.21550795042252,1.3943021883996,1.4814166592311,5.83324212843683,5.74898742272061,5.20153475667944),
                        CV.3 = c(-1.98030009996666,-0.130982057250324,-0.13182806033636,4.66419380929057,5.76073945060135,4.68132496125842,4.76343610149589,4.14550671815003,-4.32639082067268,-4.24665489024982,-4.41960026466873,3.48306980151309,3.33978102573513,5.7630709271421,1.72213262278476,3.4138699327986,-0.214011687254588,-1.35717946591182,3.99742433050098,-4.11899265115508,-4.850265219848,-4.56241597162798,-5.1673124571133,3.88620294769555,-7.55945071289283,-5.18624310325486,-2.64740221288213,-3.34585676732483,-0.146912983782168,0.183282683148834,0.341803164827804,3.08878325423758,0.402559648490399,-0.589462854225432,-4.66295564242554,-4.70902036477095,-3.15037329091412,4.46721009678144,4.19323467451728,5.20598542755799),
                        CV.4 = c(-1.85773720384766,-3.29816018270707,-3.29805035723744,-1.0463680864694,-0.164642808251456,-1.88434766843655,-2.76184052196793,-1.69491772471098,0.0194432918943446,0.900426089523736,-0.581953934607345,-0.230042890025999,-1.79667524325622,-2.45893275735924,6.71016957191989,1.8888359729478,5.48587185602468,7.45260127587355,-0.447573770298677,-1.61748546155154,-2.01415972868345,-1.50135791552696,-0.439840157186184,-1.26569596255966,-1.04297110114946,-1.59978271452128,-0.471298592990895,-0.466524983137062,6.36590517153234,6.62852590954231,-3.04695209017556,-0.936146169909344,-2.4145719914164,5.10804058988218,-0.0744344020096521,-1.17738342385673,-1.67635978290671,-1.05954691377259,-0.0467102661118772,1.81264507750015))


centroids <- data.frame(group = c("1","2","3","4","5"),
                        CV.1 = c(10.7747979250003,-1.58534182381657,2.95743524695937,-12.7697062156805,-3.63247766512568),
                        CV.2 = c(-4.32335632559164,-2.26604134251075,1.43239910451168,-5.89694537234795,5.44083615635448),
                        CV.3 = c(4.65023399808197,-0.0670252808734024,-4.49663816927149,-0.299749480847027,4.1058254967538),
                        CV.4 = c(-1.469635462066,6.29185239579583,-0.838834486907799,-2.78309436507683,-0.363794106698444))

Many thanks in advance! Cheers. Deon.

Upvotes: 0

Answers (2)

JacobJacox

Reputation: 947

Bas's version is prettier and easier to understand, but if you were thinking to translate it to Rcpp (personally i work a lot with clustering and for big data R is a bit too slow), this might also help.

groups <- ind_scores[,1] %>% as.character() %>% as.numeric()
ind_scores[,1] <- NULL
centroids <- list()
j <- 1
for(i in unique(groups)){
  centroids[[j]] <- (ind_scores[groups==i,] %>% apply(.,2,mean))
  names(centroids)[j] <- i
  ind_scores <- rbind(ind_scores,centroids[[j]])
  j <- j +1
}
## the ast j of ind scores will be centroids


dist_mat <-  dist(ind_scores %>% as.matrix()) %>% as.matrix() ## get the distance matrix
# > dist_mat[1:5,1:5]
# OP2413iiiaMOU OP2413iiibMOU OP2413iiicMOU OP2645ii_aPOR OP2645ii_bPOR
# OP2413iiiaMOU      0.000000   2.465150984   2.464681274     25.882974     25.253460
# OP2413iiibMOU      2.465151   0.000000000   0.001294793     25.008458     24.367816
# OP2413iiicMOU      2.464681   0.001294793   0.000000000     25.008508     24.367942
# OP2645ii_aPOR     25.882974  25.008458155  25.008508380      0.000000      1.956891
# OP2645ii_bPOR     25.253460  24.367815962  24.367941651      1.956891      0.000000


## do not touch j
thresh <- 3
new_data_frame <- data.frame(sample_name=NA,group=NA,centr_distance=NA)
for(i in nrow(dist_mat):(nrow(dist_mat)-j+2)){
  distances_to_cluster <- dist_mat[i,-i]
  indexes <- order(distances_to_cluster,decreasing = F)[1:thresh]
  ## collect thresh minimum distances
  ##[1]  2  3 31
  for(z in indexes){
    ## get indexes name, which group and distance to centorid
    tmp <- c(rownames(dist_mat)[z],groups[z],distances_to_cluster[z])
    new_data_frame <- rbind(new_data_frame,tmp)
  }
}
new_data_frame[order(new_data_frame$group),] %>% na.omit()
# 11 OP2645ii_cPOR     1 0.929329939818952
# 12 OP2645ii_ePOR     1    1.376251766813
# 13 OP2645ii_aPOR     1   1.4357069775206
# 2  OP5049___bWAT     2  1.25678563440431
# 3  OP5049___aWAT     2  1.77800330839339
# 4  OP5046___dWAT     2  1.85612687904496
# 8  OP5053DNAaPHA     3 0.812735500386649
# 9  OP5047___aPHA     3 0.972298470684858
# 10 OP5048___fPHA     3  1.16307022957174
# 14 OP2413iiibMOU     4 0.802020132014482
# 15 OP2413iiicMOU     4 0.802473693143821
# 16 OP5051DNAaCOM     4 0.919980313623531
# 5  OP5054DNAbMOU     5 0.451374395540337
# 6  OP5044___aMOU     5 0.717231370935914
# 7  OP5046___eWAT     5 0.775202821859753

Upvotes: 1

Bas

Reputation: 4658

Personally, I like working with tidy tibbles (no more row-names, and in long-form), so I'll first convert your dataframes to that.

library(tidyverse)

ind_scores <- ind_scores %>% 
  as_tibble(rownames = "name") %>% 
  pivot_longer(cols = starts_with("CV"),
               names_to = "CV")

centroids <- centroids %>% 
  pivot_longer(cols = starts_with("CV"),
               names_to = "CV")

Now, it is easy to join the correct centroids to the individuals, group by individual, and calculate its euclidean distance. The resulting tibble contains columns name and distance. Sorting by distance gives the closest examples in the top.

ind_scores %>% 
  left_join(centroids, by = c("group", "CV"), suffix = c("", "_centroid")) %>% 
  group_by(group, name) %>% 
  summarise(distance = sqrt(sum((value - value_centroid)^2))) %>% # euclidean distance
  top_n(-3, distance) %>% # bottom 3 sorted by distance
  arrange(group, distance) # sort them

Upvotes: 2

Calculate n-dimensional euclidean distance from group centroids for each sample and select the lowest 3 for each group in R

Answers (2)

Related Questions