mikepl9
mikepl9

Reputation: 31

How can I continue Latin Hypercube Sampling in Python?

I have drawn 20 samples from a 2d parameter space, but I now want another 10. How can I continue sampling with LHS?

If I rerun the code then the Xtrain array would be different, since LHS is quasi-random. And I can't use a different array.

The following is the code that I used:

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from smt.sampling_methods import LHS
import pandas as pd

limits = np.array([[870, 1125], [1200, 9000]])
sampling = LHS(xlimits=limits)

train_size = 20
Xtrain = sampling(train_size)

Thank you!

Upvotes: 3

Views: 1335

Answers (2)

Curious Capybara
Curious Capybara

Reputation: 27

Most libraries do not offer the ability to continue sampling. Here is an extension I made to the Lhs class in scikit-optimize that allows you to include an array with existing samples as an input to the generate method.

This implementation only works if the number of new samples is a multiple of the old number of samples. This ensures that none of the previous samples will fall on the same column or line after those are redefined for the new sample number.

import numpy as np
from sklearn.utils import check_random_state
from scipy import spatial
from skopt.sampler.base import InitialPointGenerator
from skopt.space import Space, Categorical
from skopt.sampler import Lhs

def _random_permute_matrix(h, random_state=None):
    rng = check_random_state(random_state)
    h_rand_perm = np.zeros_like(h)
    samples, n = h.shape
    for j in range(n):
        order = rng.permutation(range(samples))
        h_rand_perm[:, j] = h[order, j]
    return h_rand_perm

class LHS_extendable(Lhs):
    def __init__(self, lhs_type="classic", criterion="maximin", iterations=1000):
        self.lhs_type = lhs_type
        self.criterion = criterion
        self.iterations = iterations
        
    def generate(self, dimensions, n_samples, existing_samples=None, random_state=None):
        rng = check_random_state(random_state)
        space = Space(dimensions)
        transformer = space.get_transformer()
        n_dim = space.n_dims
        space.set_transformer("normalize")
        if existing_samples is not None:
            existing_samples = space.transform(existing_samples)
        if self.criterion is None or n_samples == 1:
            h = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
            h = space.inverse_transform(h)
            space.set_transformer(transformer)
            return h
        else:
            h_opt = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
            h_opt = space.inverse_transform(h_opt)
            if self.criterion == "correlation":
                mincorr = np.inf
                for i in range(self.iterations):
                    # Generate a random LHS
                    h = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
                    r = np.corrcoef(np.array(h).T)
                    if len(np.abs(r[r != 1])) > 0 and \
                            np.max(np.abs(r[r != 1])) < mincorr:
                        mincorr = np.max(np.abs(r - np.eye(r.shape[0])))
                        h_opt = h.copy()
                        h_opt = space.inverse_transform(h_opt)
            elif self.criterion == "maximin":
                maxdist = 0
                # Maximize the minimum distance between points
                for i in range(self.iterations):
                    h = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
                    d = spatial.distance.pdist(np.array(h), 'euclidean')
                    if maxdist < np.min(d):
                        maxdist = np.min(d)
                        h_opt = h.copy()
                        h_opt = space.inverse_transform(h_opt)
            elif self.criterion == "ratio":
                minratio = np.inf

                # Maximize the minimum distance between points
                for i in range(self.iterations):
                    h = self._lhs_normalized(n_dim, n_samples, existing_samples, rng)
                    p = spatial.distance.pdist(np.array(h), 'euclidean')
                    if np.min(p) == 0:
                        ratio = np.max(p) / 1e-8
                    else:
                        ratio = np.max(p) / np.min(p)
                    if minratio > ratio:
                        minratio = ratio
                        h_opt = h.copy()
                        h_opt = space.inverse_transform(h_opt)
            else:
                raise ValueError("Wrong criterion."
                                 "Got {}".format(self.criterion))
            space.set_transformer(transformer)
            return h_opt
        
    def _lhs_normalized(self, n_dim, n_samples, existing_samples, random_state):
        rng = check_random_state(random_state)
        x = np.linspace(0, 1, n_samples + 1)
        if existing_samples is None:
            u = rng.rand(n_samples, n_dim)
            h = np.zeros_like(u)
        else:
            u = rng.rand(n_samples-len(existing_samples), n_dim)
            h = np.zeros_like(u)
        
        if self.lhs_type == "centered":
            for j in range(n_dim):
                x_dim = x
                if existing_samples is not None:
                    existing_samples = np.array(existing_samples)
                    total_removed = 0
                    for old_sample in existing_samples:
                        #Find quadrant where old sample is located and remove that position
                        for i, position in enumerate(x_dim):
                            if old_sample[j]-position>=0 and old_sample[j]-position<np.diff(x_dim)[i]:
                                x_dim = np.delete(x_dim, i, 0)

                h[:, j] = np.diff(x)[0] / 2.0 + x_dim[:-1]
        elif self.lhs_type == "classic":
            for j in range(n_dim):
                x_dim = x
                if existing_samples is not None:
                    existing_samples = np.array(existing_samples)
                    total_removed = 0
                    for old_sample in existing_samples:
                        #Find quadrant where old sample is located and remove that position
                        for i, position in enumerate(x_dim):
                            if old_sample[j]-position>=0 and old_sample[j]-position<np.diff(x_dim)[i]:
                                x_dim = np.delete(x_dim, i, 0)
                h[:, j] = u[:, j] * np.diff(x)[0] + x_dim[:-1]
        else:
            raise ValueError("Wrong lhs_type. Got ".format(self.lhs_type))
            
        #Remove new samples in the same quadrant as old samples
        random_matrix = _random_permute_matrix(h, random_state=rng)
        
        if existing_samples is not None:
            random_matrix = np.concatenate((random_matrix, existing_samples), axis=0)
        return random_matrix

Some testing inspired by the comparison of initial sampling methods page from the scikit-optimize documentation.

import numpy as np
np.random.seed(123)
import matplotlib.pyplot as plt
from skopt.space import Space
from skopt.sampler import Sobol
from skopt.sampler import Lhs
from skopt.sampler import Halton
from skopt.sampler import Hammersly
from skopt.sampler import Grid
from scipy.spatial.distance import pdist

def plot_searchspace(x, title):
    fig, ax = plt.subplots()
    plt.plot(np.array(x)[:, 0], np.array(x)[:, 1], 'bo', label='samples')
    plt.plot(np.array(x)[:, 0], np.array(x)[:, 1], 'bo', markersize=80, alpha=0.5)
    # ax.legend(loc="best", numpoints=1)
    ax.set_xlabel("X1")
    ax.set_xlim([-5, 10])
    ax.set_ylabel("X2")
    ax.set_ylim([0, 15])
    plt.title(title)

n_samples = 5

space = Space([(-5., 10.), (0., 15.)])
# space.set_transformer("normalize")

lhs = LHS_extendable(criterion="maximin", iterations=10000)
x = lhs.generate(space.dimensions, n_samples)
pdist_data = []
x_label = []
plot_searchspace(x, 'maximin LHS')
pdist_data.append(pdist(x).flatten())
x_label.append("maximin")

n_samples = 10
x = lhs.generate(space.dimensions, n_samples, existing_samples = np.array(x))
pdist_data = []
x_label = []
plot_searchspace(x, 'maximin LHS')
pdist_data.append(pdist(x).flatten())
x_label.append("maximin")

As you can see in the resulting plots, the original 5 samples are maintained and extra 5 are generated according to the LHS criteria. Original 5 samples Extra 5 samples added

Upvotes: 1

Tae In Kim
Tae In Kim

Reputation: 224

The solution is random_state

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from smt.sampling_methods import LHS
import pandas as pd

limits = np.array([[870, 1125], [1200, 9000]])
sampling = LHS(xlimits=limits, random_state=42)

train_size = 20
Xtrain = sampling(train_size)

You can check the answer

Upvotes: 0

Related Questions