Reputation: 664

sample n zeros from a sparse.coo_matrix

How do I (efficiently) sample zero values from a scipy.sparse.coo_matrix?

>>> import numpy as np
>>> from scipy.sparse import coo_matrix

>>> # create sparse array
>>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
>>> X_sparse = coo_matrix(X)

>>> # randomly sample 0's from X_sparse, retrieving as [(row, col), (row, col)]
>>> def sample_zeros(sp_arr, n, replacement=False):
>>>     # ???
>>>     return negs

>>> zero_indices = sample_zeros(X_sparse, n=3, replacement=False)
>>> print(zero_indices)
[(0, 1), (2, 0), (2, 1)]

Efficiency is important here, since I will be doing this in an iterator that feeds a neural network.

Upvotes: 0

Answers (2)

unutbu

Reputation: 879591

Since you know the shape of X, you could use np.random.choice to generate random (row, col) locations in X:

# Draw n candidate (row, col) locations, sampling each axis independently.
# np.random.choice samples with replacement by default, so the same
# location can be drawn more than once.
h, w = X.shape
rows = np.random.choice(h, size=n)
cols = np.random.choice(w, size=n)

The main difficulty is how to check if a (row, col) is a non-zero location in X. Here's a way to do that: Make a new sparse X which equals 1 wherever X is nonzero. Next, create a new sparse matrix, Y, with non-zero values at the random locations generated above. Then subtract:

# X is 1 wherever the original matrix is nonzero, so X.multiply(Y) keeps
# exactly the sampled entries of Y that collide with a nonzero location;
# subtracting them leaves only samples that landed on zeros.
Y = Y - X.multiply(Y)

This sparse matrix Y will be zero wherever X is nonzero. So if we've managed to generate enough nonzero values in Y, then we can use their (row, col) locations as the return value for sample_negs:

import unittest
import sys
import numpy as np
import scipy.sparse as sparse

def sample_negs(X, n=3, replace=False):
    """Randomly sample ``n`` (row, col) locations at which ``X`` is zero.

    Parameters
    ----------
    X : sparse matrix or array-like
        Two-dimensional matrix to sample from.  Anything accepted by
        ``scipy.sparse.coo_matrix`` works (COO, CSR, CSC, dense array, ...).
        The argument itself is never modified.
    n : int
        Number of locations to return.
    replace : bool
        If False, the returned locations are all distinct.  If True, the
        same location may be returned more than once.

    Returns
    -------
    list of (row, col) tuples
        ``n`` locations where ``X`` is zero (an empty list when ``n == 0``).

    Raises
    ------
    ValueError
        If ``n`` is negative, if ``replace`` is False and ``X`` has fewer
        than ``n`` zeros, or if ``replace`` is True and ``X`` has no zeros.
    """
    # Work on a private, canonical COO copy.  Merging duplicate entries and
    # dropping explicitly stored zeros makes ``X.nnz`` the true number of
    # nonzero locations, so both branches below agree on what "zero" means.
    X = sparse.coo_matrix(X, copy=True)
    X.sum_duplicates()
    X.eliminate_zeros()

    h, w = X.shape
    m = int(h) * int(w) - X.nnz  # number of zero locations in X

    if n == 0:
        return []
    if (n < 0) or (not replace and m < n) or (replace and m == 0):
        raise ValueError("{n} samples from {m} locations do not exist"
                         .format(n=n, m=m))

    if n / m > 0.5:
        # More than half of the zeros are requested, so rejection sampling
        # (below) would build a fairly dense matrix anyway.  Enumerate the
        # zero locations directly instead, in row-major order.
        occupied = X.row.astype(np.int64) * w + X.col
        is_zero = np.ones(int(h) * int(w), dtype=bool)
        is_zero[occupied] = False
        zero_flat = np.flatnonzero(is_zero)
        picks = zero_flat[np.random.choice(zero_flat.size, size=n,
                                           replace=replace)]
        rows, cols = np.divmod(picks, w)
        return list(zip(rows, cols))

    # Rejection sampling: draw random locations, discard the ones that hit
    # a nonzero of X, and repeat until enough candidates have accumulated.
    # ``support`` is 1 wherever X is nonzero; it is loop-invariant, so it is
    # converted to CSR once up front.
    support = sparse.csr_matrix((np.ones(X.nnz), (X.row, X.col)),
                                shape=X.shape)
    Y = sparse.coo_matrix(X.shape)
    Y_size = 0
    while Y_size < n:
        k = n - Y.nnz  # always >= 1 here because Y.nnz <= Y_size < n
        Y_data = np.concatenate([Y.data, np.ones(k)])
        Y_row = np.concatenate([Y.row, np.random.choice(h, size=k)])
        Y_col = np.concatenate([Y.col, np.random.choice(w, size=k)])
        # Building a CSR matrix sums repeated (row, col) draws, so each
        # stored value is the number of times that location was drawn.
        Y = sparse.csr_matrix((Y_data, (Y_row, Y_col)), shape=X.shape)
        # Remove the draws that landed on a nonzero of X.
        Y = (Y - support.multiply(Y)).tocoo()
        # With replacement every draw counts; without it only distinct
        # locations do.
        Y_size = Y.data.sum() if replace else Y.nnz

    rows, cols = Y.row, Y.col
    if replace:
        # Expand each location by its multiplicity so repeated draws can
        # show up repeatedly in the result.
        counts = Y.data.astype(int)
        rows = np.repeat(rows, counts)
        cols = np.repeat(cols, counts)
    # The loop may have overshot; keep exactly n of the candidates.
    idx = np.random.choice(rows.size, size=n, replace=False)
    return list(zip(rows[idx], cols[idx]))


class Test(unittest.TestCase):
    """Randomised checks of ``sample_negs`` on a sparse 100x100 fixture."""

    def setUp(self):
        """Build a random 5%-dense COO matrix and the set of its zero locations."""
        import warnings

        self.nrows, self.ncols = 100, 100
        # sparse.random takes (rows, cols) in that order.
        self.X = sparse.random(self.nrows, self.ncols, density=0.05, format='coo')
        # Comparing a sparse matrix with 0 emits SparseEfficiencyWarning;
        # silence it for this statement only rather than process-wide.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=sparse.SparseEfficiencyWarning)
            Y = sparse.coo_matrix(self.X == 0)
        self.expected = set(zip(Y.row, Y.col))

    def test_n_too_large(self):
        """Requests that cannot be satisfied raise ValueError."""
        # One more sample than the matrix has cells, without replacement.
        too_many = self.nrows * self.ncols + 1
        self.assertRaises(ValueError, sample_negs, self.X, n=too_many, replace=False)

        # A matrix with no zeros has nothing to sample, even with replacement.
        X_dense = sparse.coo_matrix(np.ones((4, 2)))
        self.assertRaises(ValueError, sample_negs, X_dense, n=1, replace=True)

    def test_no_replacement(self):
        """Without replacement: n results, all at zero locations."""
        for m in range(100):
            with self.subTest(n=m):
                negative_list = sample_negs(self.X, n=m, replace=False)
                negative_set = set(negative_list)
                self.assertEqual(len(negative_list), m)
                self.assertLessEqual(negative_set, self.expected)

    def test_no_repeats_when_replace_is_false(self):
        """Without replacement no location is returned twice."""
        negative_list = sample_negs(self.X, n=10, replace=False)
        self.assertEqual(len(negative_list), len(set(negative_list)))

    def test_dense_replacement(self):
        """With replacement, requesting about as many samples as there are zeros."""
        N = self.ncols * self.nrows
        m = N - self.X.size
        for i in [-1, 0, 1]:
            with self.subTest(n=m + i):
                negative_list = sample_negs(self.X, n=m+i, replace=True)
                negative_set = set(negative_list)
                self.assertEqual(len(negative_list), m+i)
                self.assertLessEqual(negative_set, self.expected)

    def test_sparse_replacement(self):
        """With replacement, requesting few samples relative to the zeros."""
        for m in range(100):
            with self.subTest(n=m):
                negative_list = sample_negs(self.X, n=m, replace=True)
                negative_set = set(negative_list)
                self.assertEqual(len(negative_list), m)
                self.assertLessEqual(negative_set, self.expected)


if __name__ == '__main__':
    # Splice the verbosity flag in directly after the program name so that
    # unittest reports every test individually.
    sys.argv[1:1] = ['--verbose']
    unittest.main(argv=sys.argv)

Since sample_negs is rather complicated, I've included some unit tests to hopefully verify reasonable behavior.

Upvotes: 3

hpaulj

Reputation: 231395

I don't think there's an efficient way that takes advantage of the sparse matrix structure:

In [197]: >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
     ...: >>> X_sparse = sparse.coo_matrix(X)
In [198]: X_sparse
Out[198]: 
<3x2 sparse matrix of type '<class 'numpy.float64'>'
    with 3 stored elements in COOrdinate format>
In [199]: print(X_sparse)
  (0, 0)    1.0
  (1, 0)    2.0
  (1, 1)    1.0

With the dense array you could do something like:

In [204]: zeros = np.argwhere(X==0)
In [205]: zeros
Out[205]: 
array([[0, 1],
       [2, 0],
       [2, 1]])
In [206]: idx=np.random.choice(3,3, replace=False)
In [207]: idx
Out[207]: array([0, 2, 1])
In [208]: zeros[idx,:]
Out[208]: 
array([[0, 1],
       [2, 1],
       [2, 0]])

We could ask for all 0s of the sparse matrix:

In [209]: X_sparse==0
/usr/local/lib/python3.6/dist-packages/scipy/sparse/compressed.py:214: SparseEfficiencyWarning: Comparing a sparse matrix with 0 using == is inefficient, try using != instead.
  ", try using != instead.", SparseEfficiencyWarning)
Out[209]: 
<3x2 sparse matrix of type '<class 'numpy.bool_'>'
    with 3 stored elements in Compressed Sparse Row format>
In [210]: print(_)
  (0, 1)    True
  (2, 0)    True
  (2, 1)    True

Upvotes: 1

sample n zeros from a sparse.coo_matrix

Answers (2)

Related Questions