Reputation: 746
Suppose i have 2 numpy arrays which have elements like these:
arr1 = [[1, 2], [3, 5], [3, 4]]
arr2 = [[2, 3], [3, 4], [6, 6]]
I want the resultant array to have arr2
appended or horizontally stacked to
arr1
containing the elements which are NOT present in both the arrays:
arr3 = [[1, 2], [3, 5], [2, 3], [6, 6]]
As you can see, [3, 4]
is not there in the expected resultant array. What is the best numpy and pythonic implementation for this?
Upvotes: 3
Views: 217
Reputation: 53029
I'm a bit late to the party but here is an approach that leverages numpy
speed at only moderate algorithmic cost: O(n log n). It turns out that for many sizes of the input arrays runtime is dominated by the cost of type conversions. See benchmarks below:
from timeit import timeit
import numpy as np
def make_inputs(N, ncol):
global arr1, arr2, list1, list2, lot1, lot2
# create making sure there are no duplicates *within* arr1 or arr2
all_ = np.array(list(set(map(tuple, np.random.randint(0, 2 * N, (N + N//2, ncol))))))
# create input of various data types
arr1 = all_[np.random.choice(len(all_), N, False)]
arr2 = all_[np.random.choice(len(all_), N, False)]
list1 = arr1.tolist()
list2 = arr2.tolist()
lot1 = list(map(tuple, list1))
lot2 = list(map(tuple, list2))
def np_unique_preserve_order(a, b):
c = np.r_[a, b]
cr = c.view(np.dtype("|S" + str(c.shape[-1] * c.dtype.itemsize)))
uniq, inv, count = np.unique(cr.ravel(), return_inverse=True,
return_counts=True)
return c[(count==1)[inv], :]
def np_unique(a, b):
c = np.r_[a, b]
cr = c.view(np.dtype("|S" + str(c.shape[-1] * c.dtype.itemsize)))
uniq, count = np.unique(cr.ravel(), return_counts=True)
return uniq[count==1, None].view(c.dtype)
def np_sort(a, b):
c = np.r_[a, b]
cr = np.sort(c.view(np.dtype("|S" + str(c.shape[-1] * c.dtype.itemsize))).ravel())
m = np.empty(cr.shape, bool)
m[0] = True
m[1:] = cr[:-1] != cr[1:]
m[:-1] &= m[1:]
return cr[m, None].view(c.dtype)
# check
make_inputs(1000, 2)
assert set(map(tuple, lot1)).symmetric_difference(set(map(tuple, lot2))) == set(map(tuple, np_sort(arr1, arr2)))
assert set(map(tuple, lot1)).symmetric_difference(set(map(tuple, lot2))) == set(map(tuple, np_unique(arr1, arr2)))
assert set(map(tuple, lot1)).symmetric_difference(set(map(tuple, lot2))) == set(map(tuple, np_unique_preserve_order(arr1, arr2)))
for N, ncol in ((10, 2), (10000, 2), (100000, 20)):
make_inputs(N, ncol)
results = []
for inputs in 'lot', 'list', 'arr':
res = []
if inputs == 'lot':
res.append('{:11.5f} ms'.format(timeit(f'list(set({inputs}1).symmetric_difference(set({inputs}2)))',
f'from __main__ import {inputs}1, {inputs}2', number=10) * 100))
else:
res.append('{:11.5f} ms'.format(timeit(f'list(set(map(tuple, {inputs}1)).symmetric_difference(set(map(tuple, {inputs}2))))',
f'from __main__ import {inputs}1, {inputs}2', number=10) * 100))
res.append('{:11.5f} ms'.format(timeit(f'np_sort({inputs}1, {inputs}2)', f'from __main__ import {inputs}1, {inputs}2, np_sort',
number=10) * 100))
res.append('{:11.5f} ms'.format(timeit(f'np_unique({inputs}1, {inputs}2)', f'from __main__ import {inputs}1, {inputs}2, np_unique',
number=10) * 100))
res.append('{:11.5f} ms'.format(timeit(f'np_unique_preserve_order({inputs}1, {inputs}2)', f'from __main__ import {inputs}1, {inputs}2, np_unique_preserve_order',
number=10) * 100))
results.append(res)
results = zip(*results)
appmin = lambda l: l + (min(l),)
print(f'\nno rows {N}, no colums {ncol}')
print('input type lot list array best')
print(f'symm_diff ', *appmin(next(results)))
print(f'np_sort ', *appmin(next(results)))
print(f'np_unique ', *appmin(next(results)))
print(f'np_unique_preserve_order ', *appmin(next(results)))
Output:
no rows 10, no colums 2
input type lot list array best
symm_diff 0.00232 ms 0.00453 ms 0.02034 ms 0.00232 ms
np_sort 0.03890 ms 0.03269 ms 0.03060 ms 0.03060 ms
np_unique 0.04209 ms 0.04118 ms 0.04136 ms 0.04118 ms
np_unique_preserve_order 0.05289 ms 0.05253 ms 0.05253 ms 0.05253 ms
no rows 10000, no colums 2
input type lot list array best
symm_diff 3.71800 ms 8.30081 ms 21.92841 ms 3.71800 ms
np_sort 7.98128 ms 8.09973 ms 2.80091 ms 2.80091 ms
np_unique 8.49229 ms 8.25701 ms 3.00654 ms 3.00654 ms
np_unique_preserve_order 10.12377 ms 8.67133 ms 3.36172 ms 3.36172 ms
no rows 100000, no colums 20
input type lot list array best
symm_diff 97.83141 ms 211.20736 ms 590.15145 ms 97.83141 ms
np_sort 317.73268 ms 316.50081 ms 89.97820 ms 89.97820 ms
np_unique 324.68922 ms 326.89891 ms 98.06377 ms 98.06377 ms
np_unique_preserve_order 339.18597 ms 339.00286 ms 120.94041 ms 120.94041 ms
For very small arrays symm_diff
is fastest, but for larger ones np_sort
has a small edge when all methods are allowed to use the data type they are most comfortable with.
Upvotes: 3
Reputation: 1291
arr_3 = list(set(arr_1).symmetric_difference(set(arr_2)))
or:
arr_3 = list(set(map(tuple, arr_1))
.symmetric_difference(set(map(tuple, arr_2))))
A little research:
from random import randint
from timeit import timeit
import itertools
arr1 = [[randint(0, 5), randint(0, 5)] for _ in range(1000)]
arr2 = [[randint(0, 5), randint(0, 5)] for _ in range(1000)]
def symmetric_difference(a, b):
now_exists = set()
results = []
for e in itertools.chain(a, b):
rerp_of_e = repr(e)
if rerp_of_e not in now_exists:
now_exists.add(rerp_of_e)
results.append(e)
return results
def method2(a, b):
c = a + b
return [l for l in c if c.count(l) == 1]
print(timeit('[l for l in arr1 + arr2 if (arr1 + arr2).count(l) == 1]', 'from __main__ import arr1, arr2',
number=10) / 10)
print(timeit('method2(arr1, arr2)', 'from __main__ import arr1, arr2, method2',
number=10) / 10)
print(timeit('list(set(map(tuple, arr1)).symmetric_difference(set(map(tuple, arr2))))',
'from __main__ import arr1, arr2', number=10) / 10)
print(timeit('symmetric_difference(arr1, arr2)', 'from __main__ import arr1, arr2, symmetric_difference',
number=10) / 10)
Output:
0.16653713929933542
0.14676828165012284
0.00030277483018301685
0.0008909022958581315
Upvotes: 2
Reputation: 107287
If only items in same index can be similar you can use a simply numpythonic approach like following:
In [10]: mask = ~(arr1 == arr2).all(1)
In [11]: np.vstack((arr1[mask], arr2[mask]))
Out[11]:
array([[1, 2],
[3, 5],
[2, 3],
[6, 6]])
Otherwise you can find the intersections based on Jaime's answer then exclude them from combined array.
Or use following approach:
In [17]: arr1_view = arr1.view([('', arr1.dtype)] * arr1.shape[1])
In [18]: arr2_view = arr2.view([('', arr2.dtype)] * arr2.shape[1])
In [19]: diff_arr1 = np.setdiff1d(arr1_view, arr2_view).view(arr1.dtype).reshape(-1, arr1.shape[1])
In [20]: diff_arr2 = np.setdiff1d(arr2_view, arr1_view).view(arr2.dtype).reshape(-1, arr2.shape[1])
In [21]: np.vstack((diff_arr1, diff_arr2))
Out[21]:
array([[1, 2],
[3, 5],
[2, 3],
[6, 6]])
If you are dealing with python arrays and specially short arrays you can just use a list comprehension like following:
In [204]: [i for i in arr1 + arr2 if not (i in arr1 and i in arr2)]
Out[204]: [[1, 2], [3, 5], [2, 3], [6, 6]]
Note that its faster than converting the lists to tuple and using set for short arrays bit for larger arrays the set based approach takes the cake, which in that case it's still better to use Numpy:
In [209]: %timeit set(map(tuple, arr1)).symmetric_difference(map(tuple, arr2))
100000 loops, best of 3: 1.51 µs per loop
In [210]: %timeit [i for i in arr1 + arr2 if not (i in arr1 and i in arr2)]
1000000 loops, best of 3: 1.28 µs per loop
Upvotes: 2
Reputation: 3382
how about:
[l for l in arr1+arr2 if (arr1+arr2).count(l)==1]
output:
[[1, 2], [3, 5], [2, 3], [6, 6]]
or, if you want it a bit more efficient:
c=arr1+arr2
[l for l in c if c.count(l)==1]
Upvotes: 2