Nate Waltz
Nate Waltz

Reputation: 13

scikit-multiflow: "Cannot take a larger sample than population when 'replace=False'"

I was trying to run the following code, where x is a feature vector with shape (2381,) and y is a label with shape (1,) after being converted to a NumPy array.

from skmultiflow.meta import AdaptiveRandomForestClassifier
import numpy as np
import data

# Seed NumPy's global random number generator with a fixed value so that
# code drawing from it produces the same sequence on every run.
np.random.seed(1)


def main() -> None:
    """Predict-then-train an AdaptiveRandomForestClassifier one sample at a time.

    Loads the dataset and metadata through the ``data`` module, concatenates
    the feature vectors and labels of the initial training windows, and for
    every sample first asks the model for a prediction and then updates the
    model with that sample. The fraction of correct predictions is printed.
    """
    dataset = data.get_full_dataset()
    metadata = data.get_metadata()
    training_batch = data.get_windows(dataset, metadata, data.get_initial_training_groups())
    # NOTE: streaming_batch is not used anywhere below in this function.
    streaming_batch = data.get_windows(dataset, metadata, data.get_incremental_learning_groups())
    # Stack the per-window arrays into one array of features and one of labels.
    initial_features = np.concatenate([dataset.feature_vectors for group, dataset in training_batch])
    initial_labels = np.concatenate([dataset.labels for group, dataset in training_batch])
    model = AdaptiveRandomForestClassifier()
    correct_count = 0
    n_samples = 0
    for x, y in zip(initial_features, initial_labels):
        # Wrap the single label in a one-element array.
        y = np.asarray([y])
        # NOTE(review): x is passed exactly as yielded by iterating
        # initial_features, i.e. one row without a leading sample axis
        # (reported as shape (2381,)). The model appears to expect 2-D
        # input of shape (n_samples, n_features) -- confirm against the
        # scikit-multiflow documentation.
        y_prediction = model.predict(x)
        # y is a one-element array here, so this compares against that element.
        if y_prediction[0] == y:
            correct_count += 1
        # Train on the sample only after it has been used for prediction.
        model.partial_fit(x, y)
        n_samples += 1

    print(f"Accuracy: {correct_count / n_samples}")


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

However, I am getting the following error:

Traceback (most recent call last):
  File "/home/nathan/Documents/Research/BodmasOnline/main.py", line 31, in <module>
    main()
  File "/home/nathan/Documents/Research/BodmasOnline/main.py", line 24, in main
    model.partial_fit(x, y)
  File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/meta/adaptive_random_forests.py", line 313, in partial_fit
    self._partial_fit(X[i], y[i], self.classes, weight[i])
  File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/meta/adaptive_random_forests.py", line 328, in _partial_fit
    self.ensemble[i].partial_fit(np.asarray([X]), np.asarray([y]),
  File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/meta/adaptive_random_forests.py", line 569, in partial_fit
    self.classifier.partial_fit(X, y, classes=classes, sample_weight=sample_weight)
  File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/hoeffding_tree.py", line 394, in partial_fit
    self._partial_fit(X[i], y[i], sample_weight[i])
  File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/hoeffding_tree.py", line 424, in _partial_fit
    learning_node.learn_from_instance(X, y, sample_weight, self)
  File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/nodes/random_learning_node_nb_adaptive.py", line 54, in learn_from_instance
    super().learn_from_instance(X, y, weight, ht)
  File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/nodes/random_learning_node_classification.py", line 58, in learn_from_instance
    self.list_attributes = self._sample_features(get_dimensions(X)[1])
  File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/nodes/random_learning_node_classification.py", line 72, in _sample_features
    return self.random_state.choice(
  File "mtrand.pyx", line 965, in numpy.random.mtrand.RandomState.choice
ValueError: Cannot take a larger sample than population when 'replace=False'

Can anyone help me out?

Upvotes: 0

Views: 181

Answers (1)

Nate Waltz
Nate Waltz

Reputation: 13

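I'm going to answer my own question, since the scikit-multiflow documentation does not make this obvious. The feature vector x has to be two-dimensional with shape (1, n), which in this case is (1, 2381). The traceback shows partial_fit indexing its input row by row (X[i]), so a one-dimensional x of shape (2381,) appears to be handled as individual scalar values rather than as one sample with 2381 features, and the tree then tries to sample more features than it can see. Adding the leading axis can be done programmatically as follows: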

from skmultiflow.meta import AdaptiveRandomForestClassifier
import numpy as np
import data

# Fix the seed of NumPy's global random number generator so that anything
# drawing from it behaves the same way from run to run.
np.random.seed(1)


def main() -> None:
    """Evaluate an AdaptiveRandomForestClassifier with a test-then-train loop.

    The feature vectors and labels of the initial training windows are
    concatenated, and every sample is first predicted and then used to
    update the model with ``partial_fit``. The fraction of correct
    predictions is printed at the end; if there are no samples at all, a
    message is printed instead of dividing by zero.
    """
    dataset = data.get_full_dataset()
    metadata = data.get_metadata()
    # Materialise the windows: they are traversed twice below, which would
    # silently yield nothing the second time if get_windows returned a
    # one-shot iterator.
    training_batch = list(
        data.get_windows(dataset, metadata, data.get_initial_training_groups())
    )
    # `window` (rather than `dataset`) avoids shadowing the full dataset above;
    # the group key of each pair is not needed here.
    initial_features = np.concatenate([window.feature_vectors for _, window in training_batch])
    initial_labels = np.concatenate([window.labels for _, window in training_batch])

    model = AdaptiveRandomForestClassifier()
    correct_count = 0
    n_samples = 0
    for x, y in zip(initial_features, initial_labels):
        # The fix: add a leading sample axis so x has shape (1, n_features)
        # instead of (n_features,). The model expects 2-D input.
        x = np.expand_dims(x, axis=0)
        # Labels are likewise passed as a one-element array.
        y = np.asarray([y])
        y_prediction = model.predict(x)
        # Compare scalar to scalar rather than scalar to one-element array.
        if y_prediction[0] == y[0]:
            correct_count += 1
        # Train on the sample only after it has been used for prediction.
        model.partial_fit(x, y)
        n_samples += 1

    if n_samples == 0:
        # Nothing was evaluated; avoid ZeroDivisionError.
        print("Accuracy: undefined (no samples)")
        return

    print(f"Accuracy: {correct_count / n_samples}")


# Script entry point: call main() only when run directly, not on import.
if __name__ == "__main__":
    main()

Upvotes: 0

Related Questions