Reputation: 81
This is two-part question and is pretty complex.
First. I want to calculate the 'n'-dimensional euclidean distance between each individual sample in dataframe "ind_scores" and it's respective group centroid in dataframe "centroids".
Then, I want to select the 3 individual samples are the closest to their respective group centroids. I want to save these to a new dataframe with the info for 'individual sample name', 'group' and 'distance to centroid'.
Here is an example of the data:
ind_scores <- data.frame(row.names = c("OP2413iiiaMOU","OP2413iiibMOU","OP2413iiicMOU","OP2645ii_aPOR","OP2645ii_bPOR","OP2645ii_cPOR","OP2645ii_dPOR","OP2645ii_ePOR","OP3088i__aPOR","OP5043___aWAT","OP5043___bWAT","OP5044___aMOU","OP5044___bMOU","OP5044___cMOU","OP5046___aWAT","OP5046___bWAT","OP5046___cWAT","OP5046___dWAT","OP5046___eWAT","OP5047___aPHA","OP5047___bPHA","OP5048___bPHA","OP5048___cPHA","OP5048___dPHA","OP5048___ePHA","OP5048___fPHA","OP5048___gPHA","OP5048___hPHA","OP5049___aWAT","OP5049___bWAT","OP5051DNAaCOM","OP5051DNAbCOM","OP5051DNAcCOM","OP5052DNAaWAT","OP5053DNAaPHA","OP5053DNAbPHA","OP5053DNAcPHA","OP5054DNAaMOU","OP5054DNAbMOU","OP5054DNAcMOU"),
group = c("4","4","4","1","1","1","1","1","3","3","3","5","5","5","2","5","2","2","5","3","3","3","3","1","3","3","3","3","2","2","4","5","4","2","3","3","3","5","5","5"),
CV.1 = c(-13.3864612433581,-12.8079930877268,-12.8078461023615,11.609290941109,10.5489837203281,10.8802079446603,11.7559827821839,10.769027609963,2.93788199576291,5.14343682437333,1.1768471575429,-3.59878541566711,-3.69656648297924,-3.13205394000296,-1.88190759998412,-3.50181277277038,0.563858206656491,-1.38629942623866,-3.73771209413208,3.40039117982473,2.86962877144321,3.80869463338469,4.15722705333298,9.08529455175736,3.15497802125988,2.42193314853044,0.600699372070624,4.14515087614032,-3.3599436881205,-1.8893406509868,-13.355031250023,-4.10118631444206,-11.4911993949333,-1.55841778422586,2.91834267480086,1.58762181687645,3.08125993208779,-3.84248479288043,-3.60800082570682,-3.47369634755007),
CV.2 = c(-5.98931418061097,-6.48685652483353,-6.48781938591041,-5.4121748521578,-4.56051914391762,-5.14772881585026,-4.3883054106957,-3.06298578319138,0.25688954313487,1.01459325674394,1.47381593062751,5.11285501685872,6.32219277017476,4.93757903863915,-1.98974199849122,6.8029453586845,-4.47482073821288,-2.89353901685366,6.19654462202962,1.44791941276988,2.01950206487354,3.29347544821835,1.70411388918498,-3.36842394773708,0.843537649290457,1.53904192617335,-0.0653393231022099,2.43481086719558,-2.28081054006986,-1.12101221091068,-5.74678650527647,2.81164429296665,-4.7739502651084,-0.836323550526183,1.21550795042252,1.3943021883996,1.4814166592311,5.83324212843683,5.74898742272061,5.20153475667944),
CV.3 = c(-1.98030009996666,-0.130982057250324,-0.13182806033636,4.66419380929057,5.76073945060135,4.68132496125842,4.76343610149589,4.14550671815003,-4.32639082067268,-4.24665489024982,-4.41960026466873,3.48306980151309,3.33978102573513,5.7630709271421,1.72213262278476,3.4138699327986,-0.214011687254588,-1.35717946591182,3.99742433050098,-4.11899265115508,-4.850265219848,-4.56241597162798,-5.1673124571133,3.88620294769555,-7.55945071289283,-5.18624310325486,-2.64740221288213,-3.34585676732483,-0.146912983782168,0.183282683148834,0.341803164827804,3.08878325423758,0.402559648490399,-0.589462854225432,-4.66295564242554,-4.70902036477095,-3.15037329091412,4.46721009678144,4.19323467451728,5.20598542755799),
CV.4 = c(-1.85773720384766,-3.29816018270707,-3.29805035723744,-1.0463680864694,-0.164642808251456,-1.88434766843655,-2.76184052196793,-1.69491772471098,0.0194432918943446,0.900426089523736,-0.581953934607345,-0.230042890025999,-1.79667524325622,-2.45893275735924,6.71016957191989,1.8888359729478,5.48587185602468,7.45260127587355,-0.447573770298677,-1.61748546155154,-2.01415972868345,-1.50135791552696,-0.439840157186184,-1.26569596255966,-1.04297110114946,-1.59978271452128,-0.471298592990895,-0.466524983137062,6.36590517153234,6.62852590954231,-3.04695209017556,-0.936146169909344,-2.4145719914164,5.10804058988218,-0.0744344020096521,-1.17738342385673,-1.67635978290671,-1.05954691377259,-0.0467102661118772,1.81264507750015))
centroids <- data.frame(group = c("1","2","3","4","5"),
CV.1 = c(10.7747979250003,-1.58534182381657,2.95743524695937,-12.7697062156805,-3.63247766512568),
CV.2 = c(-4.32335632559164,-2.26604134251075,1.43239910451168,-5.89694537234795,5.44083615635448),
CV.3 = c(4.65023399808197,-0.0670252808734024,-4.49663816927149,-0.299749480847027,4.1058254967538),
CV.4 = c(-1.469635462066,6.29185239579583,-0.838834486907799,-2.78309436507683,-0.363794106698444))
Many thanks in advance! Cheers. Deon.
Upvotes: 0
Views: 450
Reputation: 947
Bas's version is prettier and easier to understand, but if you were thinking to translate it to Rcpp (personally i work a lot with clustering and for big data R is a bit too slow), this might also help.
groups <- ind_scores[,1] %>% as.character() %>% as.numeric()
ind_scores[,1] <- NULL
centroids <- list()
j <- 1
for(i in unique(groups)){
centroids[[j]] <- (ind_scores[groups==i,] %>% apply(.,2,mean))
names(centroids)[j] <- i
ind_scores <- rbind(ind_scores,centroids[[j]])
j <- j +1
}
## the ast j of ind scores will be centroids
dist_mat <- dist(ind_scores %>% as.matrix()) %>% as.matrix() ## get the distance matrix
# > dist_mat[1:5,1:5]
# OP2413iiiaMOU OP2413iiibMOU OP2413iiicMOU OP2645ii_aPOR OP2645ii_bPOR
# OP2413iiiaMOU 0.000000 2.465150984 2.464681274 25.882974 25.253460
# OP2413iiibMOU 2.465151 0.000000000 0.001294793 25.008458 24.367816
# OP2413iiicMOU 2.464681 0.001294793 0.000000000 25.008508 24.367942
# OP2645ii_aPOR 25.882974 25.008458155 25.008508380 0.000000 1.956891
# OP2645ii_bPOR 25.253460 24.367815962 24.367941651 1.956891 0.000000
## do not touch j
thresh <- 3
new_data_frame <- data.frame(sample_name=NA,group=NA,centr_distance=NA)
for(i in nrow(dist_mat):(nrow(dist_mat)-j+2)){
distances_to_cluster <- dist_mat[i,-i]
indexes <- order(distances_to_cluster,decreasing = F)[1:thresh]
## collect thresh minimum distances
##[1] 2 3 31
for(z in indexes){
## get indexes name, which group and distance to centorid
tmp <- c(rownames(dist_mat)[z],groups[z],distances_to_cluster[z])
new_data_frame <- rbind(new_data_frame,tmp)
}
}
new_data_frame[order(new_data_frame$group),] %>% na.omit()
# 11 OP2645ii_cPOR 1 0.929329939818952
# 12 OP2645ii_ePOR 1 1.376251766813
# 13 OP2645ii_aPOR 1 1.4357069775206
# 2 OP5049___bWAT 2 1.25678563440431
# 3 OP5049___aWAT 2 1.77800330839339
# 4 OP5046___dWAT 2 1.85612687904496
# 8 OP5053DNAaPHA 3 0.812735500386649
# 9 OP5047___aPHA 3 0.972298470684858
# 10 OP5048___fPHA 3 1.16307022957174
# 14 OP2413iiibMOU 4 0.802020132014482
# 15 OP2413iiicMOU 4 0.802473693143821
# 16 OP5051DNAaCOM 4 0.919980313623531
# 5 OP5054DNAbMOU 5 0.451374395540337
# 6 OP5044___aMOU 5 0.717231370935914
# 7 OP5046___eWAT 5 0.775202821859753
Upvotes: 1
Reputation: 4658
Personally, I like working with tidy tibbles (no more row-names, and in long-form), so I'll first convert your dataframes to that.
library(tidyverse)
ind_scores <- ind_scores %>%
as_tibble(rownames = "name") %>%
pivot_longer(cols = starts_with("CV"),
names_to = "CV")
centroids <- centroids %>%
pivot_longer(cols = starts_with("CV"),
names_to = "CV")
Now, it is easy to join the correct centroids to the individuals, group by individual, and calculate its euclidean distance. The resulting tibble contains columns name
and distance
. Sorting by distance
gives the closest examples in the top.
ind_scores %>%
left_join(centroids, by = c("group", "CV"), suffix = c("", "_centroid")) %>%
group_by(group, name) %>%
summarise(distance = sqrt(sum((value - value_centroid)^2))) %>% # euclidean distance
top_n(-3, distance) %>% # bottom 3 sorted by distance
arrange(group, distance) # sort them
Upvotes: 2