Reputation: 218
I have a plot I make using a website called Revigo that provides an R script (included below) to create a plot like this:
I'm looking to see if it's possible to perform and visualize a clustering on top of these points in the same graph? Since this plot looks like one big circle I'm trying to see if there are any smaller groupings within it that I can highlight. I have a biology background so I'm not sure where to start with trying to get this visualization in the same plot. I have explored using hclust()
but I don't know the steps to bring clusters to be shown on top of this graph.
The code and data that gives the plot above is:
library( ggplot2 );
library( scales );
revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","plot_size","log10_p_value","uniqueness","dispensability");
revigo.data <- rbind(c("GO:0002376","immune system process",16.463,-0.302,-3.807, 3.455,-8.3307,0.995,0.000),
c("GO:0006928","movement of cell or subcellular component",10.987, 1.052, 1.113, 3.280,-8.8153,0.965,0.000),
c("GO:0007610","behavior", 3.254,-1.620, 0.960, 2.752,-4.0048,0.994,0.000),
c("GO:0008150","biological_process",100.000,-6.029,-0.499, 4.239,-8.7447,1.000,0.000),
c("GO:0009987","cellular process",90.329,-5.288, 0.130, 4.195,-10.0701,0.999,0.000),
c("GO:0010243","response to organonitrogen compound", 4.697, 6.870,-2.756, 2.911,-12.7100,0.865,0.000),
c("GO:0023052","signaling",36.613, 2.976,-6.718, 3.803,-7.1931,0.996,0.000),
c("GO:0032501","multicellular organismal process",41.143, 3.761,-0.715, 3.853,-17.2741,0.997,0.000),
c("GO:0032502","developmental process",33.982,-2.865,-0.660, 3.770,-14.4202,0.996,0.000),
c("GO:0034762","regulation of transmembrane transport", 2.452,-3.982, 3.261, 2.629,-13.4123,0.788,0.000),
c("GO:0040007","growth", 5.447, 0.651,-0.889, 2.975,-4.2140,0.995,0.000),
c("GO:0040011","locomotion", 9.452, 2.305,-0.865, 3.215,-8.1068,0.995,0.000),
c("GO:0050896","response to stimulus",49.302, 3.556,-5.122, 3.932,-17.7852,0.997,0.000),
c("GO:0051179","localization",36.018,-0.564,-0.041, 3.795,-8.9245,0.996,0.000),
c("GO:0051704","multi-organism process", 9.873, 1.677,-5.787, 3.234,-4.6925,0.995,0.000),
c("GO:0065007","biological regulation",67.069,-4.157, 0.147, 4.065,-15.5918,0.998,0.000),
c("GO:0007154","cell communication",36.705,-0.841,-2.282, 3.804,-8.3595,0.992,0.006),
c("GO:0008283","cell proliferation",11.321, 2.644, 1.154, 3.293,-4.5376,0.982,0.024),
c("GO:0042391","regulation of membrane potential", 2.204,-0.568,-6.816, 2.583,-10.6073,0.895,0.035),
c("GO:0043086","negative regulation of catalytic activity", 4.934, 4.164,-2.157, 2.932,-4.9172,0.880,0.039),
c("GO:0019216","regulation of lipid metabolic process", 1.685, 3.142, 6.777, 2.467,-6.3215,0.880,0.043),
c("GO:0044057","regulation of system process", 2.862,-6.608, 1.584, 2.696,-13.1469,0.868,0.046),
c("GO:0051093","negative regulation of developmental process", 4.720,-4.950,-3.614, 2.913,-9.9031,0.830,0.050),
c("GO:0006793","phosphorus metabolic process",18.702,-4.363, 6.140, 3.511,-6.8729,0.961,0.053),
c("GO:0008284","positive regulation of cell proliferation", 4.859,-0.261, 4.128, 2.926,-9.2882,0.832,0.055),
c("GO:0065008","regulation of biological quality",20.202,-0.644, 5.306, 3.544,-20.3125,0.920,0.057),
c("GO:0050865","regulation of cell activation", 3.174, 2.497,-4.613, 2.741,-4.7878,0.917,0.062),
c("GO:0065009","regulation of molecular function",17.288,-1.972, 4.374, 3.477,-17.1959,0.923,0.078),
c("GO:0048518","positive regulation of biological process",30.969, 0.597, 8.033, 3.730,-15.2993,0.895,0.095),
c("GO:0051128","regulation of cellular component organization",13.335, 1.445, 4.799, 3.364,-4.5003,0.899,0.104),
c("GO:0007169","transmembrane receptor protein tyrosine kinase signaling pathway", 3.976, 6.886, 4.332, 2.839,-7.5986,0.845,0.114),
c("GO:0032101","regulation of response to external stimulus", 3.976, 6.545, 2.631, 2.839,-9.0405,0.838,0.114),
c("GO:0048519","negative regulation of biological process",26.855,-0.542, 7.989, 3.668,-12.9469,0.898,0.134),
c("GO:0044238","primary metabolic process",60.306,-3.339, 6.565, 4.019,-4.5229,0.961,0.140),
c("GO:0042221","response to chemical",23.993, 7.702,-0.115, 3.619,-24.7399,0.923,0.160),
c("GO:0009628","response to abiotic stimulus", 6.705, 8.008, 1.525, 3.066,-11.1726,0.938,0.174),
c("GO:0007187","G-protein coupled receptor signaling pathway, coupled to cyclic nucleotide second messenger", 1.033, 5.463, 5.279, 2.255,-5.0942,0.874,0.186),
c("GO:0031099","regeneration", 1.016,-3.828,-4.295, 2.248,-4.6216,0.955,0.186),
c("GO:0009719","response to endogenous stimulus", 9.175, 6.984, 0.467, 3.202,-15.8069,0.935,0.188),
c("GO:0098657","import into cell", 0.415,-3.347, 2.843, 1.863,-4.7077,0.928,0.190),
c("GO:0051239","regulation of multicellular organismal process",15.268,-6.085, 3.408, 3.423,-18.1433,0.885,0.192),
c("GO:0007268","chemical synaptic transmission", 3.474, 5.008, 6.277, 2.780,-4.7471,0.914,0.194),
c("GO:0050794","regulation of cellular process",60.427, 0.909, 7.170, 4.020,-15.1986,0.875,0.198),
c("GO:0009605","response to external stimulus",12.043, 7.990, 0.686, 3.320,-7.7520,0.932,0.202),
c("GO:0019222","regulation of metabolic process",35.730,-0.798, 6.924, 3.792,-8.7399,0.888,0.217),
c("GO:0048589","developmental growth", 3.272,-5.765,-3.288, 2.754,-4.6289,0.942,0.220),
c("GO:0048729","tissue morphogenesis", 3.572,-4.454,-4.112, 2.792,-4.2055,0.942,0.223),
c("GO:0048732","gland development", 2.418,-5.936,-2.151, 2.623,-3.9830,0.930,0.224),
c("GO:0032879","regulation of localization",14.409,-3.921, 4.876, 3.398,-19.9830,0.839,0.231),
c("GO:0006950","response to stress",21.310, 7.271,-0.252, 3.567,-11.7905,0.925,0.241),
c("GO:0048583","regulation of response to stimulus",21.610, 7.157, 2.071, 3.574,-14.0640,0.849,0.242),
c("GO:0031323","regulation of cellular metabolic process",33.837, 1.983, 7.193, 3.768,-7.3665,0.804,0.256),
c("GO:0003008","system process",11.575,-6.863, 1.039, 3.303,-14.5229,0.965,0.259),
c("GO:0050789","regulation of biological process",63.456, 0.625, 7.037, 4.041,-16.1244,0.896,0.274),
c("GO:0051899","membrane depolarization", 0.565,-1.372,-6.528, 1.996,-4.2411,0.910,0.284),
c("GO:0061024","membrane organization", 7.299,-5.167, 4.037, 3.102,-4.4237,0.987,0.286),
c("GO:0035150","regulation of tube size", 0.675,-0.489,-6.505, 2.072,-5.1129,0.901,0.290),
c("GO:0009636","response to toxic substance", 1.264, 6.039,-4.714, 2.342,-5.3072,0.915,0.292),
c("GO:0061041","regulation of wound healing", 0.750, 5.568, 3.354, 2.117,-5.7696,0.869,0.296),
c("GO:0007165","signal transduction",33.618, 5.808, 3.554, 3.765,-20.2403,0.784,0.296),
c("GO:0010038","response to metal ion", 1.829, 6.463,-4.185, 2.502,-5.0768,0.911,0.307),
c("GO:0001101","response to acid chemical", 1.858, 6.221,-3.913, 2.509,-5.3072,0.911,0.308),
c("GO:0043066","negative regulation of apoptotic process", 4.749,-4.234,-2.719, 2.916,-5.6778,0.845,0.315),
c("GO:0042493","response to drug", 2.366, 6.792,-3.628, 2.614,-12.6716,0.908,0.319),
c("GO:0010035","response to inorganic substance", 2.816, 6.714,-3.232, 2.689,-6.8125,0.906,0.327),
c("GO:0048856","anatomical structure development",31.558,-4.976,-4.080, 3.738,-12.4157,0.940,0.339),
c("GO:0051716","cellular response to stimulus",40.358, 7.607, 0.187, 3.845,-21.1158,0.911,0.359),
c("GO:0055082","cellular chemical homeostasis", 4.108,-1.327,-6.764, 2.853,-9.7520,0.840,0.364),
c("GO:0007267","cell-cell signaling", 9.025, 5.287, 5.987, 3.195,-5.1175,0.925,0.366),
c("GO:0043408","regulation of MAPK cascade", 3.883, 5.837, 4.526, 2.829,-6.8794,0.689,0.371),
c("GO:0048522","positive regulation of cellular process",27.732,-1.038, 7.337, 3.682,-15.2518,0.798,0.376),
c("GO:0010817","regulation of hormone levels", 2.764,-0.848,-6.705, 2.681,-6.8697,0.892,0.376),
c("GO:0050878","regulation of body fluid levels", 2.862,-1.665,-6.559, 2.696,-4.2541,0.891,0.378),
c("GO:1901654","response to ketone", 1.039, 5.708,-4.850, 2.258,-5.2644,0.898,0.394));
one.data <- data.frame(revigo.data);
names(one.data) <- revigo.names;
one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ];
one.data$plot_X <- as.numeric( as.character(one.data$plot_X) );
one.data$plot_Y <- as.numeric( as.character(one.data$plot_Y) );
one.data$plot_size <- as.numeric( as.character(one.data$plot_size) );
one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) );
one.data$frequency <- as.numeric( as.character(one.data$frequency) );
one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) );
one.data$dispensability <- as.numeric( as.character(one.data$dispensability) );
p1 <- ggplot( data = one.data );
p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = plot_size), alpha = I(0.6) ) + scale_size_area();
p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) );
p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = plot_size), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area();
p1 <- p1 + scale_size( range=c(5, 30)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) );
ex <- one.data [ one.data$dispensability < 0.15, ];
p1 <- p1 + geom_text( data = ex, aes(plot_X, plot_Y, label = description), colour = I(alpha("black", 0.85)), size = 3 );
p1 <- p1 + labs (y = "semantic space x", x = "semantic space y");
p1 <- p1 + theme(legend.key = element_blank()) ;
one.x_range = max(one.data$plot_X) - min(one.data$plot_X);
one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y);
p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10);
p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10);
p1
Sorry the code is quite busy, it's what the website provides me with so it's my starting point - any guidance on libraries or anything at all I can use to figure out how to highlight smaller groupings in this ggplot would be appreciated.
Edit: I've also looked at k-means clustering to give me some groupings to try to visualize using -
xy <- dplyr::select(one.data, plot_X, plot_Y)
res.km <- kmeans(scale(xy), 3, nstart = 25)
A badly made and not accurate example of what I expect overlaid groupings to look like, but if I used k-means to identify 3 groups, the style/type of plot I'm then trying to view would look like:
I can see res.km <- kmeans(scale(xy), 3, nstart = 25)
gives each row cluster between 1-3 is there a way to highlight groups like these?
Upvotes: 3
Views: 1226
Reputation: 66870
ggforce
offers some grouping geoms that fit the bill:
one.data$cluster = res.km$cluster
#...all your code above...
p1 + ggforce::geom_mark_ellipse(aes(plot_X, plot_Y, group = cluster, label = cluster))
[the "label = cluster" part can be left out to get the ellipses alone.]
For more detailed delineation of groups, in this case you might also try ggforce::geom_mark_hull
which lets you draw a convex hull that more closely follows the contours of your clusters:
p1 + ggforce::geom_mark_hull(aes(plot_X, plot_Y, group = cluster, label = cluster), expand = unit(10, "mm"), radius = unit(10, "mm"), concavity = 1.8)
Upvotes: 8
Reputation: 4497
A way to group them is using different shape.
# Take the cluster from kmean result
one.data$cluster <- factor(res.km$cluster)
# define shape_manual to provided to scale_shape_manual
shape_manual <- c(21, 22, 23)
names(shape_manual) <- levels(one.data$cluster)
ggplot( data = one.data ) +
# mapping shape with cluster
# As shape 21-23 have both fill & colour I am using fill here
geom_point( aes( plot_X, plot_Y, fill = log10_p_value,
size = plot_size, shape = cluster),
alpha = I(0.6) ) + scale_size_area() +
geom_point( aes(plot_X, plot_Y, size = plot_size, shape = cluster),
fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area() +
scale_fill_gradientn( colours = c("blue", "green", "yellow", "red"),
limits = c( min(one.data$log10_p_value), 0) ) +
# Define the manual scales for shapes with shape_manual define earlier
scale_shape_manual(values = shape_manual) +
scale_size( range=c(5, 30)) + theme_bw() +
geom_text( data = ex, aes(plot_X, plot_Y, label = description),
colour = I(alpha("black", 0.85)), size = 3 ) +
labs (y = "semantic space x", x = "semantic space y") +
theme(legend.key = element_blank()) +
xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10) +
ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10)
Upvotes: 3