How to have multiple categorical markers on a scatterplot

I want to train a logistic regression model and then create a plot that shows the decision boundaries, but in a specific way.

My work so far

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from matplotlib.colors import ListedColormap

# Light colours fill the predicted regions; bold colours mark the samples.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target

# Create an instance of Logistic Regression Classifier and fit the data.
logreg = LogisticRegression(C=1e5)
logreg.fit(X, Y)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = .02  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))

plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points. A single scatter call draws every sample
# with the same marker; only the colour follows the class in Y.
plt.scatter(X[:, 0], X[:,1], c=Y, marker='x',edgecolors='k', cmap=cmap_bold)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

# Clamp the axes to the mesh extent and hide the tick marks.
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

However, I find it very hard to read. I want a different marker for each class and a legend in the upper-left corner, just like in the image below:

Do you have any idea how I can change that? I played with marker='s' and marker='x', but those change all points on the scatter plot instead of only the points of one specific class.

Upvotes: 3

Answers (3)

Trenton McKinney

Reputation: 62463

I find it easier to create a dataframe from X & Y, and then plot the data points with seaborn.scatterplot.
- seaborn is a high-level API for matplotlib
- As shown in How to extract the boundary values from k-nearest neighbors predict, the dataframe columns can be used to specify all data for fitting, and x and y min and max.

load and setup the data

import numpy as np
import matplotlib.pyplot as plt  # version 3.3.1
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from matplotlib.colors import ListedColormap
import seaborn as sns  # version 0.11.0
import pandas as pd  # version 1.1.3

# seaborn.scatterplot palette parameter takes a list
palette = ['#FF0000', '#00FF00', '#0000FF']

# Light colours fill the predicted regions; the bold colormap is built from
# the same list that is handed to seaborn, so the two cannot drift apart.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(palette)

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target

# add X & Y to dataframe
df = pd.DataFrame(X, columns=iris.feature_names[:2])
df['label'] = Y
# map the number values to the species name and add it to the dataframe
# (enumerate pairs each class index with its name, whatever the class count)
species_map = dict(enumerate(iris.target_names))
df['species'] = df['label'].map(species_map)

# Create an instance of Logistic Regression Classifier and fit the data.
logreg = LogisticRegression(C=1e5)
logreg.fit(X, Y)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = .02  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

plot the data

# Open figure 1 and paint the predicted class of every mesh point.
plt.figure(num=1, figsize=(8, 6))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')

# Overlay the training points with seaborn: both colour (hue) and marker
# shape (style) are keyed on the species column.
point_style = dict(
    hue='species',
    style='species',
    edgecolor='k',
    alpha=0.5,
    palette=palette,
    s=70,
)
sns.scatterplot(data=df, x='sepal length (cm)', y='sepal width (cm)',
                **point_style)

# Clamp the axes to the mesh extent; the tick marks are left visible here.
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

# loc=2 is matplotlib's numeric code for the upper-left corner.
plt.legend(title='Species', loc=2)

plt.show()

alpha=0.5 is used with sns.scatterplot, to show that some values of 'versicolor' and 'virginica' overlap.
If the species label is desired for the legend, instead of the name, change hue='species' to hue='label'.

Upvotes: 2

darcamo

Reputation: 3493

You need to replace the single call to plt.scatter with one call per marker type, since a single scatter call accepts only one marker style, whereas the color can vary per point.

The plot code becomes something like

# Put the result into a color plot
Z = Z.reshape(xx.shape)

plt.figure(1, figsize=(4, 3))

plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points, one scatter call per class so that each
# class can get its own marker.
X0 = X[Y == 0]
X1 = X[Y == 1]
X2 = X[Y == 2]

# The marker colours follow the class order of cmap_light (red, green, blue)
# so every point matches the colour of the region predicted for its class.
plt.scatter(X0[:, 0], X0[:, 1], marker='s', color="red",
            label=iris.target_names[0])
plt.scatter(X1[:, 0], X1[:, 1], marker='x', color="green",
            label=iris.target_names[1])
plt.scatter(X2[:, 0], X2[:, 1], marker='o', color="blue",
            label=iris.target_names[2])
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

# The labels above feed the legend, placed in the upper-left corner.
plt.legend(loc='upper left')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

where you individually set the marker type and color of each class. You can also create a list for the marker type and another for the color and use a loop.

Upvotes: 1

Quang Hoang

Reputation: 150785

Since you are plotting with categorical values, you can just plot each class separately:

# Replace this
# plt.scatter(X[:, 0], X[:,1], c=Y, marker='x',edgecolors='k', cmap=cmap_bold)
# with this

# One marker character per class, paired in order with the sorted labels.
markers = 'sxo'
for marker, cls in zip(markers, np.unique(Y)):
    # Select the rows of this class and draw them with their own marker and
    # the matching entry of the bold colormap.
    pts = X[Y == cls]
    plt.scatter(pts[:, 0], pts[:, 1],
                c=cmap_bold.colors[cls],
                marker=marker,
                edgecolors='k',
                label=cls)
plt.legend()

Output:

Upvotes: 3

How to have multiple categorical markers on a scatterplot

Answers (3)

load and setup the data

plot the data

Related Questions