Reputation: 31
I have drawn 20 samples from a 2D parameter space, but I now want another 10. How can I continue sampling with LHS?
If I rerun the code, the Xtrain array will be different, since LHS is quasi-random, and I can't just use a different array.
The following is the code that I used:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from smt.sampling_methods import LHS
import pandas as pd
limits = np.array([[870, 1125], [1200, 9000]])
sampling = LHS(xlimits=limits)
train_size = 20
Xtrain = sampling(train_size)
Thank you!
Upvotes: 3
Views: 1335
Reputation: 27
Most libraries do not offer the ability to continue sampling. Here is an extension I made to the Lhs class in scikit-optimize that lets you pass an array of existing samples to the generate method.
This implementation only works if the number of new samples is a multiple of the old number of samples. That guarantees that, once the strata are redefined for the new sample count, none of the previous samples fall in the same row or column as a new one.
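For intuition, here is a minimal sketch (with hypothetical counts: 5 existing samples extended to 10 in total) of why the stratum edges only line up when this multiple condition holds:
import numpy as np

m, n = 5, 10  # hypothetical: 5 existing samples, 10 samples in total
old_edges = np.linspace(0, 1, m + 1)  # stratum edges of the original design
new_edges = np.linspace(0, 1, n + 1)  # stratum edges after re-dividing for n samples

# Since n is a multiple of m, every old edge reappears among the new edges,
# so each existing sample still occupies exactly one of the refined strata.
print(all(np.isclose(e, new_edges).any() for e in old_edges))  # True
The extension itself: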
import numpy as np
from sklearn.utils import check_random_state
from scipy import spatial
from skopt.sampler.base import InitialPointGenerator
from skopt.space import Space, Categorical
from skopt.sampler import Lhs
def _random_permute_matrix(h, random_state=None):
    # Shuffle each column of h independently, so that the pairing of
    # strata across dimensions is random.
    rng = check_random_state(random_state)
    h_rand_perm = np.zeros_like(h)
    samples, n = h.shape
    for j in range(n):
        order = rng.permutation(range(samples))
        h_rand_perm[:, j] = h[order, j]
    return h_rand_perm
class LHS_extendable(Lhs):
    def __init__(self, lhs_type="classic", criterion="maximin", iterations=1000):
        self.lhs_type = lhs_type
        self.criterion = criterion
        self.iterations = iterations
    def generate(self, dimensions, n_samples, existing_samples=None, random_state=None):
        rng = check_random_state(random_state)
        space = Space(dimensions)
        transformer = space.get_transformer()
        n_dim = space.n_dims
        space.set_transformer("normalize")
        if existing_samples is not None:
            existing_samples = space.transform(existing_samples)
        if self.criterion is None or n_samples == 1:
            h = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
            h = space.inverse_transform(h)
            space.set_transformer(transformer)
            return h
        else:
            h_opt = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
            h_opt = space.inverse_transform(h_opt)
            if self.criterion == "correlation":
                mincorr = np.inf
                for i in range(self.iterations):
                    # Generate a random LHS
                    h = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
                    r = np.corrcoef(np.array(h).T)
                    if len(np.abs(r[r != 1])) > 0 and \
                            np.max(np.abs(r[r != 1])) < mincorr:
                        mincorr = np.max(np.abs(r - np.eye(r.shape[0])))
                        h_opt = h.copy()
                h_opt = space.inverse_transform(h_opt)
            elif self.criterion == "maximin":
                maxdist = 0
                # Maximize the minimum distance between points
                for i in range(self.iterations):
                    h = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
                    d = spatial.distance.pdist(np.array(h), 'euclidean')
                    if maxdist < np.min(d):
                        maxdist = np.min(d)
                        h_opt = h.copy()
                h_opt = space.inverse_transform(h_opt)
            elif self.criterion == "ratio":
                minratio = np.inf
                # Minimize the ratio of the largest to the smallest distance
                for i in range(self.iterations):
                    h = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
                    p = spatial.distance.pdist(np.array(h), 'euclidean')
                    if np.min(p) == 0:
                        ratio = np.max(p) / 1e-8
                    else:
                        ratio = np.max(p) / np.min(p)
                    if minratio > ratio:
                        minratio = ratio
                        h_opt = h.copy()
                h_opt = space.inverse_transform(h_opt)
            else:
                raise ValueError("Wrong criterion. Got {}".format(self.criterion))
            space.set_transformer(transformer)
            return h_opt
    def _lhs_normalized(self, n_dim, n_samples, existing_samples, random_state):
        rng = check_random_state(random_state)
        x = np.linspace(0, 1, n_samples + 1)
        if existing_samples is None:
            u = rng.rand(n_samples, n_dim)
        else:
            # Only draw as many new points as are still missing
            u = rng.rand(n_samples - len(existing_samples), n_dim)
        h = np.zeros_like(u)
        if self.lhs_type == "centered":
            for j in range(n_dim):
                x_dim = x
                if existing_samples is not None:
                    existing_samples = np.array(existing_samples)
                    for old_sample in existing_samples:
                        # Find the stratum holding the old sample and remove
                        # it from the candidates for new points
                        for i, position in enumerate(x_dim):
                            if 0 <= old_sample[j] - position < np.diff(x_dim)[i]:
                                x_dim = np.delete(x_dim, i, 0)
                                break
                h[:, j] = np.diff(x)[0] / 2.0 + x_dim[:-1]
        elif self.lhs_type == "classic":
            for j in range(n_dim):
                x_dim = x
                if existing_samples is not None:
                    existing_samples = np.array(existing_samples)
                    for old_sample in existing_samples:
                        # Find the stratum holding the old sample and remove
                        # it from the candidates for new points
                        for i, position in enumerate(x_dim):
                            if 0 <= old_sample[j] - position < np.diff(x_dim)[i]:
                                x_dim = np.delete(x_dim, i, 0)
                                break
                h[:, j] = u[:, j] * np.diff(x)[0] + x_dim[:-1]
        else:
            raise ValueError("Wrong lhs_type. Got {}".format(self.lhs_type))
        # Randomly pair the per-dimension strata, then append the old samples
        random_matrix = _random_permute_matrix(h, random_state=rng)
        if existing_samples is not None:
            random_matrix = np.concatenate((random_matrix, existing_samples), axis=0)
        return random_matrix
Some testing, inspired by the comparison of initial sampling methods page in the scikit-optimize documentation:
import numpy as np
np.random.seed(123)
import matplotlib.pyplot as plt
from skopt.space import Space
def plot_searchspace(x, title):
    fig, ax = plt.subplots()
    # Draw each sample with a small marker plus a large translucent halo
    plt.plot(np.array(x)[:, 0], np.array(x)[:, 1], 'bo', label='samples')
    plt.plot(np.array(x)[:, 0], np.array(x)[:, 1], 'bo', markersize=80, alpha=0.5)
    ax.set_xlabel("X1")
    ax.set_xlim([-5, 10])
    ax.set_ylabel("X2")
    ax.set_ylim([0, 15])
    plt.title(title)
space = Space([(-5., 10.), (0., 15.)])
lhs = LHS_extendable(criterion="maximin", iterations=10000)

# Initial design: 5 samples
n_samples = 5
x = lhs.generate(space.dimensions, n_samples)
plot_searchspace(x, 'maximin LHS')

# Extend to 10 samples in total, reusing the 5 existing points
n_samples = 10
x = lhs.generate(space.dimensions, n_samples, existing_samples=np.array(x))
plot_searchspace(x, 'maximin LHS, extended')
plt.show()
As you can see in the resulting plots, the original 5 samples are kept and 5 extra samples are generated according to the LHS criterion.
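To verify this numerically rather than visually: generate appends the existing samples at the end of the returned array (see the np.concatenate call above), so a check along these lines should pass, up to round-off from the normalize/inverse-transform round trip:
x5 = np.array(lhs.generate(space.dimensions, 5))
x10 = np.array(lhs.generate(space.dimensions, 10, existing_samples=x5))
print(np.allclose(x10[-5:], x5))  # the original 5 points come back unchanged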
Upvotes: 1
Reputation: 224
The solution is the random_state argument:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from smt.sampling_methods import LHS
import pandas as pd
limits = np.array([[870, 1125], [1200, 9000]])
sampling = LHS(xlimits=limits, random_state=42)
train_size = 20
Xtrain = sampling(train_size)
You can check that rerunning the code now reproduces the same Xtrain.
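For example, a minimal sketch: a fresh sampler constructed with the same seed should reproduce the draw exactly.
# Same xlimits and same seed, so the draw is deterministic
sampling_again = LHS(xlimits=limits, random_state=42)
assert np.array_equal(sampling_again(train_size), Xtrain)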
Upvotes: 0