Reputation: 101
I want to plot a dendrogram plot for hierarchical clustering using plotly and show a small subset of the plot as with the large number of samples the plot can be very dense at the bottom.
I have plotted the plot using the plotly wrapper function create_dendrogram with the below code:
from scipy.cluster.hierarchy import linkage
import plotly.figure_factory as ff
fig = ff.create_dendrogram(test_df, linkagefun=lambda x: linkage(test_df, 'average', metric='euclidean'))
fig.update_layout(autosize=True, hovermode='closest')
fig.update_xaxes(mirror=False, showgrid=True, showline=False, showticklabels=False)
fig.update_yaxes(mirror=False, showgrid=True, showline=True)
fig.show()
And below is the plot using matplotlib which is used by default by the scipy library truncated to 4 levels for ease of interpretation:
from scipy.cluster.hierarchy import dendrogram,linkage
x = linkage(test_df,method='average')
dendrogram(x,truncate_mode='level',p=4)
plt.show()
As you can see the truncation is very useful to interpret large number of samples, how can i acheive this in plotly ?
Upvotes: 4
Views: 2072
Reputation: 11
To make it more dynamically you can pass **kwargs
to create_dendogram()
function. If you check the source code, you need to pass **kwargs
in multiple other places, both in _Dendogram
class and get_dendrogram_traces()
function.
If you don't want to mess with _dendogram.py
which located in the default directory, I advise you to copy whole file and create a new file (Lets say modified_dendogram.py
) in your current directory.
Then simply import that local file using from modified_dendogram import create_dendrogram
.
Now you can use all the arguments that scipy.cluster.hierarchy.dendrogram
supports.
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from collections import OrderedDict
from plotly import exceptions, optional_imports
from plotly.graph_objs import graph_objs
# Optional imports, may be None for users that only use our core functionality.
np = optional_imports.get_module("numpy")
scp = optional_imports.get_module("scipy")
sch = optional_imports.get_module("scipy.cluster.hierarchy")
scs = optional_imports.get_module("scipy.spatial")
def create_dendrogram(
X,
orientation="bottom",
labels=None,
colorscale=None,
distfun=None,
linkagefun=lambda x: sch.linkage(x, "complete"),
hovertext=None,
color_threshold=None,
**kwargs
):
"""
Function that returns a dendrogram Plotly figure object. This is a thin
wrapper around scipy.cluster.hierarchy.dendrogram.
See also https://dash.plot.ly/dash-bio/clustergram.
:param (ndarray) X: Matrix of observations as array of arrays
:param (str) orientation: 'top', 'right', 'bottom', or 'left'
:param (list) labels: List of axis category labels(observation labels)
:param (list) colorscale: Optional colorscale for the dendrogram tree.
Requires 8 colors to be specified, the 7th of
which is ignored. With scipy>=1.5.0, the 2nd, 3rd
and 6th are used twice as often as the others.
Given a shorter list, the missing values are
replaced with defaults and with a longer list the
extra values are ignored.
:param (function) distfun: Function to compute the pairwise distance from
the observations
:param (function) linkagefun: Function to compute the linkage matrix from
the pairwise distances
:param (list[list]) hovertext: List of hovertext for constituent traces of dendrogram
clusters
:param (double) color_threshold: Value at which the separation of clusters will be made
Example 1: Simple bottom oriented dendrogram
>>> from plotly.figure_factory import create_dendrogram
>>> import numpy as np
>>> X = np.random.rand(10,10)
>>> fig = create_dendrogram(X)
>>> fig.show()
Example 2: Dendrogram to put on the left of the heatmap
>>> from plotly.figure_factory import create_dendrogram
>>> import numpy as np
>>> X = np.random.rand(5,5)
>>> names = ['Jack', 'Oxana', 'John', 'Chelsea', 'Mark']
>>> dendro = create_dendrogram(X, orientation='right', labels=names)
>>> dendro.update_layout({'width':700, 'height':500}) # doctest: +SKIP
>>> dendro.show()
Example 3: Dendrogram with Pandas
>>> from plotly.figure_factory import create_dendrogram
>>> import numpy as np
>>> import pandas as pd
>>> Index= ['A','B','C','D','E','F','G','H','I','J']
>>> df = pd.DataFrame(abs(np.random.randn(10, 10)), index=Index)
>>> fig = create_dendrogram(df, labels=Index)
>>> fig.show()
"""
if not scp or not scs or not sch:
raise ImportError(
"FigureFactory.create_dendrogram requires scipy, \
scipy.spatial and scipy.hierarchy"
)
s = X.shape
if len(s) != 2:
exceptions.PlotlyError("X should be 2-dimensional array.")
if distfun is None:
distfun = scs.distance.pdist
dendrogram = _Dendrogram(
X,
orientation,
labels,
colorscale,
distfun=distfun,
linkagefun=linkagefun,
hovertext=hovertext,
color_threshold=color_threshold,
kwargs=kwargs
)
return graph_objs.Figure(data=dendrogram.data, layout=dendrogram.layout)
class _Dendrogram(object):
"""Refer to FigureFactory.create_dendrogram() for docstring."""
def __init__(
self,
X,
orientation="bottom",
labels=None,
colorscale=None,
width=np.inf,
height=np.inf,
xaxis="xaxis",
yaxis="yaxis",
distfun=None,
linkagefun=lambda x: sch.linkage(x, "complete"),
hovertext=None,
color_threshold=None,
kwargs=None
):
self.orientation = orientation
self.labels = labels
self.xaxis = xaxis
self.yaxis = yaxis
self.data = []
self.leaves = []
self.sign = {self.xaxis: 1, self.yaxis: 1}
self.layout = {self.xaxis: {}, self.yaxis: {}}
if self.orientation in ["left", "bottom"]:
self.sign[self.xaxis] = 1
else:
self.sign[self.xaxis] = -1
if self.orientation in ["right", "bottom"]:
self.sign[self.yaxis] = 1
else:
self.sign[self.yaxis] = -1
if distfun is None:
distfun = scs.distance.pdist
(dd_traces, xvals, yvals, ordered_labels, leaves) = self.get_dendrogram_traces(
X, colorscale, distfun, linkagefun, hovertext, color_threshold, kwargs
)
self.labels = ordered_labels
self.leaves = leaves
yvals_flat = yvals.flatten()
xvals_flat = xvals.flatten()
self.zero_vals = []
for i in range(len(yvals_flat)):
if yvals_flat[i] == 0.0 and xvals_flat[i] not in self.zero_vals:
self.zero_vals.append(xvals_flat[i])
if len(self.zero_vals) > len(yvals) + 1:
# If the length of zero_vals is larger than the length of yvals,
# it means that there are wrong vals because of the identicial samples.
# Three and more identicial samples will make the yvals of spliting
# center into 0 and it will accidentally take it as leaves.
l_border = int(min(self.zero_vals))
r_border = int(max(self.zero_vals))
correct_leaves_pos = range(
l_border, r_border + 1, int((r_border - l_border) / len(yvals))
)
# Regenerating the leaves pos from the self.zero_vals with equally intervals.
self.zero_vals = [v for v in correct_leaves_pos]
self.zero_vals.sort()
self.layout = self.set_figure_layout(width, height)
self.data = dd_traces
def get_color_dict(self, colorscale):
"""
Returns colorscale used for dendrogram tree clusters.
:param (list) colorscale: Colors to use for the plot in rgb format.
:rtype (dict): A dict of default colors mapped to the user colorscale.
"""
# These are the color codes returned for dendrograms
# We're replacing them with nicer colors
# This list is the colors that can be used by dendrogram, which were
# determined as the combination of the default above_threshold_color and
# the default color palette (see scipy/cluster/hierarchy.py)
d = {
"r": "red",
"g": "green",
"b": "blue",
"c": "cyan",
"m": "magenta",
"y": "yellow",
"k": "black",
# TODO: 'w' doesn't seem to be in the default color
# palette in scipy/cluster/hierarchy.py
"w": "white",
}
default_colors = OrderedDict(sorted(d.items(), key=lambda t: t[0]))
if colorscale is None:
rgb_colorscale = [
"rgb(0,116,217)", # blue
"rgb(35,205,205)", # cyan
"rgb(61,153,112)", # green
"rgb(40,35,35)", # black
"rgb(133,20,75)", # magenta
"rgb(255,65,54)", # red
"rgb(255,255,255)", # white
"rgb(255,220,0)", # yellow
]
else:
rgb_colorscale = colorscale
for i in range(len(default_colors.keys())):
k = list(default_colors.keys())[i] # PY3 won't index keys
if i < len(rgb_colorscale):
default_colors[k] = rgb_colorscale[i]
# add support for cyclic format colors as introduced in scipy===1.5.0
# before this, the colors were named 'r', 'b', 'y' etc., now they are
# named 'C0', 'C1', etc. To keep the colors consistent regardless of the
# scipy version, we try as much as possible to map the new colors to the
# old colors
# this mapping was found by inpecting scipy/cluster/hierarchy.py (see
# comment above).
new_old_color_map = [
("C0", "b"),
("C1", "g"),
("C2", "r"),
("C3", "c"),
("C4", "m"),
("C5", "y"),
("C6", "k"),
("C7", "g"),
("C8", "r"),
("C9", "c"),
]
for nc, oc in new_old_color_map:
try:
default_colors[nc] = default_colors[oc]
except KeyError:
# it could happen that the old color isn't found (if a custom
# colorscale was specified), in this case we set it to an
# arbitrary default.
default_colors[nc] = "rgb(0,116,217)"
return default_colors
def set_axis_layout(self, axis_key):
"""
Sets and returns default axis object for dendrogram figure.
:param (str) axis_key: E.g., 'xaxis', 'xaxis1', 'yaxis', yaxis1', etc.
:rtype (dict): An axis_key dictionary with set parameters.
"""
axis_defaults = {
"type": "linear",
"ticks": "outside",
"mirror": "allticks",
"rangemode": "tozero",
"showticklabels": True,
"zeroline": False,
"showgrid": False,
"showline": True,
}
if len(self.labels) != 0:
axis_key_labels = self.xaxis
if self.orientation in ["left", "right"]:
axis_key_labels = self.yaxis
if axis_key_labels not in self.layout:
self.layout[axis_key_labels] = {}
self.layout[axis_key_labels]["tickvals"] = [
zv * self.sign[axis_key] for zv in self.zero_vals
]
self.layout[axis_key_labels]["ticktext"] = self.labels
self.layout[axis_key_labels]["tickmode"] = "array"
self.layout[axis_key].update(axis_defaults)
return self.layout[axis_key]
def set_figure_layout(self, width, height):
"""
Sets and returns default layout object for dendrogram figure.
"""
self.layout.update(
{
"showlegend": False,
"autosize": False,
"hovermode": "closest",
"width": width,
"height": height,
}
)
self.set_axis_layout(self.xaxis)
self.set_axis_layout(self.yaxis)
return self.layout
def get_dendrogram_traces(
self, X, colorscale, distfun, linkagefun, hovertext, color_threshold, kwargs={}
):
"""
Calculates all the elements needed for plotting a dendrogram.
:param (ndarray) X: Matrix of observations as array of arrays
:param (list) colorscale: Color scale for dendrogram tree clusters
:param (function) distfun: Function to compute the pairwise distance
from the observations
:param (function) linkagefun: Function to compute the linkage matrix
from the pairwise distances
:param (list) hovertext: List of hovertext for constituent traces of dendrogram
:rtype (tuple): Contains all the traces in the following order:
(a) trace_list: List of Plotly trace objects for dendrogram tree
(b) icoord: All X points of the dendrogram tree as array of arrays
with length 4
(c) dcoord: All Y points of the dendrogram tree as array of arrays
with length 4
(d) ordered_labels: leaf labels in the order they are going to
appear on the plot
(e) P['leaves']: left-to-right traversal of the leaves
"""
d = distfun(X)
Z = linkagefun(d)
P = sch.dendrogram(
Z,
orientation=self.orientation,
labels=self.labels,
no_plot=True,
color_threshold=color_threshold,
**kwargs
)
icoord = scp.array(P["icoord"])
dcoord = scp.array(P["dcoord"])
ordered_labels = scp.array(P["ivl"])
color_list = scp.array(P["color_list"])
colors = self.get_color_dict(colorscale)
trace_list = []
for i in range(len(icoord)):
# xs and ys are arrays of 4 points that make up the '∩' shapes
# of the dendrogram tree
if self.orientation in ["top", "bottom"]:
xs = icoord[i]
else:
xs = dcoord[i]
if self.orientation in ["top", "bottom"]:
ys = dcoord[i]
else:
ys = icoord[i]
color_key = color_list[i]
hovertext_label = None
if hovertext:
hovertext_label = hovertext[i]
trace = dict(
type="scatter",
x=np.multiply(self.sign[self.xaxis], xs),
y=np.multiply(self.sign[self.yaxis], ys),
mode="lines",
marker=dict(color=colors[color_key]),
text=hovertext_label,
hoverinfo="text",
)
try:
x_index = int(self.xaxis[-1])
except ValueError:
x_index = ""
try:
y_index = int(self.yaxis[-1])
except ValueError:
y_index = ""
trace["xaxis"] = "x" + x_index
trace["yaxis"] = "y" + y_index
trace_list.append(trace)
return trace_list, icoord, dcoord, ordered_labels, P["leaves"]
from modified_dendogram import create_dendrogram
import numpy as np
np.random.seed(1)
X = np.random.rand(15, 12) # 15 samples, with 12 dimensions each
fig = create_dendrogram(X)
fig.update_layout(width=800, height=500)
fig.show()
from utils.modified_dendogram import create_dendrogram
import numpy as np
np.random.seed(1)
X = np.random.rand(15, 12) # 15 samples, with 12 dimensions each
fig = create_dendrogram(X, truncate_mode="level", p=1)
fig.update_layout(width=800, height=500)
fig.show()
Upvotes: 0
Reputation: 61104
There does not seem to be a straight-forward way to do this with ff.create_dendrogram()
. This does not mean it's impossible though. But I would at least consider the brilliant functionalities that Dash Clustergram has to offer. If you insist on sticking to ff.create_dendrogram()
, this is going to get a bit more messy than Plotly users rightfully have grown accustomed to. You haven't provided a data sample, so let's use the Plotly Basic Dendrogram
example instead:
import plotly.figure_factory as ff
import numpy as np
np.random.seed(1)
X = np.random.rand(15, 12) # 15 samples, with 12 dimensions each
fig = ff.create_dendrogram(X)
fig.update_layout(width=800, height=500)
f = fig.full_figure_for_development(warn=False)
fig.show()
The good news is that the exact same snippet will produce the following truncated plot after we've taken a few steps that I'll explain in the details below.
If anyone who got this far in my answer knows a better way to do the following, then please share.
ff.create_dendrogram()
is a wrapper for scipy.cluster.hierarchy.dendrogram
You can call help(ff.create_dendrogram)
and learn that:
[...]This is a thin wrapper around scipy.cluster.hierarchy.dendrogram.
From the available arguments you can also see that none seem to handle anything related to truncating:
create_dendrogram(X, orientation='bottom', labels=None, colorscale=None, distfun=None, linkagefun=<function at 0x0000016F09D4CEE0>, hovertext=None, color_threshold=None)
scipy.cluster.hierarchy.dendrogram
Here we can see that some central elements have been left out after implementing the function in ff.create_dendrogram(X)
when we compare it to the source:
scipy.cluster.hierarchy.dendrogram(Z, p=30, truncate_mode=None, color_threshold=None, get_leaves=True, orientation='top', labels=None, count_sort=False, distance_sort=False, show_leaf_counts=True, no_plot=False, no_labels=False, leaf_font_size=None, leaf_rotation=None, leaf_label_func=None, show_contracted=False, link_color_func=None, ax=None, above_threshold_color='C0')
truncate_mode
should be exactly what we're looking for. So, now we know that scipy
probably has all we need to build the foundation for a truncated dendrogram, but what's next?
scipy.cluster.hierarchy.dendrogram
is hiding in ff.create_dendrogram(X)
ff.create_dendrogram.__code__
will reveal where the source code exists in your system. In my case this is:
"C:\Users\vestland\Miniconda3\envs\dashy\lib\site-packages\plotly\figure_factory\_dendrogram.py"
So if you would like you can take a closer look at the complete source in your corresponding folder. If you do, you'll see one particularly interesting section where some attributes that we have listed above are taken care of:
def get_dendrogram_traces(
self, X, colorscale, distfun, linkagefun, hovertext, color_threshold
):
"""
Calculates all the elements needed for plotting a dendrogram.
.
.
.
P = sch.dendrogram(
Z,
orientation=self.orientation,
labels=self.labels,
no_plot=True,
color_threshold=color_threshold,
)
Here we are at the very core of the problem. And the first step to a complete answer to your question is simply to include truncate_mode
and p
in P
like this:
P = sch.dendrogram(
Z,
orientation=self.orientation,
labels=self.labels,
no_plot=True,
color_threshold=color_threshold,
truncate_mode = 'level',
p = 2
)
And here's how you do that:
In Python, the term monkey patch only refers to dynamic modifications of a class or module at runtime, which means monkey patch is a piece of Python code that extends or modifies other code at runtime. And here's the essence of how you can do exactly that in our case:
import plotly.figure_factory._dendrogram as original_dendrogram
original_dendrogram._Dendrogram.get_dendrogram_traces = modified_dendrogram_traces
Where modified_dendrogram_traces
is the complete function definition of modified_dendrogram_traces()
with the amendments I've already mentioned. As well as a few imports that will be missing that otherwise are run when you call import plotly.figure_factory as ff
Enough details for now. Below is the whole thing. If this is something you can use, we could perhaps make the whole thing a bit more dynamical than hardcoding truncate_mode = 'level'
and p = 2
.
from scipy.cluster.hierarchy import linkage
import plotly.figure_factory as ff
import plotly.figure_factory._dendrogram as original_dendrogram
import numpy as np
def modified_dendrogram_traces(
self, X, colorscale, distfun, linkagefun, hovertext, color_threshold
):
"""
Calculates all the elements needed for plotting a dendrogram.
:param (ndarray) X: Matrix of observations as array of arrays
:param (list) colorscale: Color scale for dendrogram tree clusters
:param (function) distfun: Function to compute the pairwise distance
from the observations
:param (function) linkagefun: Function to compute the linkage matrix
from the pairwise distances
:param (list) hovertext: List of hovertext for constituent traces of dendrogram
:rtype (tuple): Contains all the traces in the following order:
(a) trace_list: List of Plotly trace objects for dendrogram tree
(b) icoord: All X points of the dendrogram tree as array of arrays
with length 4
(c) dcoord: All Y points of the dendrogram tree as array of arrays
with length 4
(d) ordered_labels: leaf labels in the order they are going to
appear on the plot
(e) P['leaves']: left-to-right traversal of the leaves
"""
import plotly
from plotly import exceptions, optional_imports
np = optional_imports.get_module("numpy")
scp = optional_imports.get_module("scipy")
sch = optional_imports.get_module("scipy.cluster.hierarchy")
scs = optional_imports.get_module("scipy.spatial")
sch = optional_imports.get_module("scipy.cluster.hierarchy")
d = distfun(X)
Z = linkagefun(d)
P = sch.dendrogram(
Z,
orientation=self.orientation,
labels=self.labels,
no_plot=True,
color_threshold=color_threshold,
truncate_mode = 'level',
p = 2
)
icoord = scp.array(P["icoord"])
dcoord = scp.array(P["dcoord"])
ordered_labels = scp.array(P["ivl"])
color_list = scp.array(P["color_list"])
colors = self.get_color_dict(colorscale)
trace_list = []
for i in range(len(icoord)):
# xs and ys are arrays of 4 points that make up the '∩' shapes
# of the dendrogram tree
if self.orientation in ["top", "bottom"]:
xs = icoord[i]
else:
xs = dcoord[i]
if self.orientation in ["top", "bottom"]:
ys = dcoord[i]
else:
ys = icoord[i]
color_key = color_list[i]
hovertext_label = None
if hovertext:
hovertext_label = hovertext[i]
trace = dict(
type="scatter",
x=np.multiply(self.sign[self.xaxis], xs),
y=np.multiply(self.sign[self.yaxis], ys),
mode="lines",
marker=dict(color=colors[color_key]),
text=hovertext_label,
hoverinfo="text",
)
try:
x_index = int(self.xaxis[-1])
except ValueError:
x_index = ""
try:
y_index = int(self.yaxis[-1])
except ValueError:
y_index = ""
trace["xaxis"] = "x" + x_index
trace["yaxis"] = "y" + y_index
trace_list.append(trace)
return trace_list, icoord, dcoord, ordered_labels, P["leaves"]
original_dendrogram._Dendrogram.get_dendrogram_traces = modified_dendrogram_traces
X = np.random.rand(15, 12) # 15 samples, with 12 dimensions each
fig = ff.create_dendrogram(X)
fig.update_layout(width=800, height=500)
f = fig.full_figure_for_development(warn=False)
fig.show()
Upvotes: 5