Reputation: 58821
I have a (large) integer array like
materials = [0, 0, 47, 0, 2, 2, 47] # ...
with few unique entries and I'd like to convert it into a dictionary of indices, i.e.,
d = {
0: [0, 1, 3],
2: [4, 5],
47: [2, 6],
}
What's the most efficient way of doing so? (NumPy welcome.)
Upvotes: 5
Views: 1198
Reputation: 58821
Another one-liner, this time with numpy.where
:
out = {v: np.where(v == a)[0] for v in numpy.unique(a)}
(For some applications, the Boolean array may be enough:
out = {v: v == a for v in numpy.unique(a)}
)
Note that numpy.unique
is faster than set()
for large arrays, and by a large margin if there are only a few unique entries.
Anyhow, for most array sizes, the above is the fastest method yet:
Code:
import numpy as np
from collections import defaultdict
import perfplot
def pp(a):
index = np.argsort(a, kind='mergesort')
as_ = a[index]
jumps = np.r_[0, 1 + np.where(np.diff(as_) != 0)[0]]
pp_out = {k: v for k, v in zip(as_[jumps], np.split(index, jumps[1:]))}
return pp_out
def pp2(a):
index = np.argsort(a)
as_ = a[index]
jumps = np.r_[0, 1 + np.where(np.diff(as_) != 0)[0]]
pp_out = {k: np.sort(v)
for k, v in zip(as_[jumps], np.split(index, jumps[1:]))}
return pp_out
def Denziloe_JFFabre(a):
df_out = {v: [i for i, x in enumerate(a) if x == v] for v in np.unique(a)}
return df_out
def FCouzo(a):
fc_out = defaultdict(list)
for i, elem in enumerate(a):
fc_out[elem].append(i)
return fc_out
def KKSingh(a):
kks_out = defaultdict(list)
list(map(lambda x: kks_out[x[0]].append(x[1]), zip(a, range(len(a)))))
return kks_out
def TMcDonaldJensen(a):
mdj_out = defaultdict(list)
for i, elem in enumerate(a):
mdj_out[elem].append(i)
return mdj_out
def RomanPerekhrest(a):
rp_out = {}
for k, m in enumerate(a):
rp_out.setdefault(m, []).append(k)
return rp_out
def SchloemerHist(a):
np.histogram(a, bins=np.arange(min(a), max(a)+2))
return
def SchloemerWhere(a):
out = {v: np.where(v == a)[0] for v in np.unique(a)}
return out
def SchloemerBooleanOnly(a):
out = {v: v == a for v in np.unique(a)}
return out
perfplot.show(
setup=lambda n: np.random.randint(0, 100, n),
kernels=[
pp, pp2, Denziloe_JFFabre, FCouzo, KKSingh,
TMcDonaldJensen, RomanPerekhrest, SchloemerHist, SchloemerWhere,
SchloemerBooleanOnly
],
n_range=[2**k for k in range(17)],
xlabel='len(a)',
logx=True,
logy=True,
)
Upvotes: 1
Reputation: 53069
Here a numpy solution:
import numpy as np
a = np.random.randint(0, 1000, 1000000)
index = np.argsort(a, kind='mergesort')
as_ = a[index]
jumps = np.r_[0, 1 + np.where(np.diff(as_) != 0)[0]]
result = {k: v for k, v in zip(as_[jumps], np.split(index, jumps[1:]))}
numpy
wins for not too large n
; since it uses an O(n log n) sort algorithm, the margins are slim (pp2 is a variant that replaces the slow but stable mergesort with quicksort at the cost of having to sort the individual index lists afterwards, pp3 replaces the full sort with argpartition
this gains some speed if the number of unique elements is small compared to the number of elements.):
10 different integer values in the original array:
100 different integer values in the original array:
Benchmark code for reference:
import numpy as np
from collections import defaultdict
import perfplot
def pp(a):
index = np.argsort(a, kind='mergesort')
as_ = a[index]
jumps = np.r_[0, 1 + np.where(np.diff(as_) != 0)[0]]
pp_out = {k: v for k, v in zip(as_[jumps], np.split(index, jumps[1:]))}
return pp_out
def pp2(a):
index = np.argsort(a)
as_ = a[index]
jumps = np.r_[0, 1 + np.where(np.diff(as_) != 0)[0]]
pp_out = {k: np.sort(v)
for k, v in zip(as_[jumps], np.split(index, jumps[1:]))}
return pp_out
def Denziloe_JFFabre(a):
df_out = {v: [i for i, x in enumerate(a) if x == v] for v in set(a)}
return df_out
def FCouzo(a):
fc_out = defaultdict(list)
for i, elem in enumerate(a):
fc_out[elem].append(i)
return fc_out
def KKSingh(a):
kks_out = defaultdict(list)
list(map(lambda x: kks_out[x[0]].append(x[1]), zip(a, range(len(a)))))
return kks_out
def TMcDonaldJensen(a):
mdj_out = defaultdict(list)
for i, elem in enumerate(a):
mdj_out[elem].append(i)
return mdj_out
def RomanPerekhrest(a):
rp_out = {}
for k, m in enumerate(a):
rp_out.setdefault(m, []).append(k)
return rp_out
def SchloemerHist(a):
np.histogram(a, bins=np.arange(min(a), max(a)+2))
return
def SchloemerWhere(a):
out = {v: np.where(v == a)[0] for v in set(a)}
return out
perfplot.show(
setup=lambda n: np.random.randint(0, 10, n),
kernels=[
pp, pp2, Denziloe_JFFabre, FCouzo, KKSingh,
TMcDonaldJensen, RomanPerekhrest, SchloemerHist, SchloemerWhere
],
n_range=[2**k for k in range(19)],
xlabel='len(a)',
logx=True,
logy=True,
)
Upvotes: 3
Reputation: 58821
For the fun of it, here's a solution using numpy.histogram
:
np.histogram(a, bins=np.arange(min(a), max(a)+2))
I thought it might perform well, but Paul's solution is still better:
Upvotes: 0
Reputation: 92854
Alternative solution using enumerate() and dict.setdefault() functions:
materials = [0, 0, 47, 0, 2, 2, 47]
d = {}
for k,m in enumerate(materials):
d.setdefault(m, []).append(k)
print(d)
The output:
{0: [0, 1, 3], 2: [4, 5], 47: [2, 6]}
Upvotes: 4
Reputation: 21453
you may find collections.defaultdict
to be of use here, when an element is found for first time it will create a new list for you.
from collections import defaultdict
indices = defaultdict(list)
for i, elem in enumerate(materials):
indices[elem].append(i)
Upvotes: 3
Reputation: 11496
I'd use defaultdict
, it is more efficient (O(n)
time, compared to Jean's answer, which is O(n^2)
):
from collections import defaultdict
materials = [0, 0, 47, 0, 2, 2, 47]
d = defaultdict(list)
for i, elem in enumerate(materials):
d[elem].append(i)
d
is now equal to:
defaultdict(<type 'list'>, {0: [0, 1, 3], 2: [4, 5], 47: [2, 6]})
Upvotes: 1
Reputation: 8132
Comprehensions can do this nicely:
d = {key:[i for i, v in enumerate(materials) if v == key] for key in set(materials)}
Upvotes: 1
Reputation: 140246
no need for numpy
, those are standard python structures, dict comprehension does that very well for your problem:
materials = [0, 0, 47, 0, 2, 2, 47]
d = {v : [i for i,x in enumerate(materials) if x==v] for v in set(materials)}
print(d)
result:
{0: [0, 1, 3], 2: [4, 5], 47: [2, 6]}
[i for i,x in enumerate(materials) if x==v]
finds all the indexes of the element in the list (index
only finds the first one)
In the first version of my answer I was iterating on the list itself, but that's a bit wasteful since it will overwrite the key several times when there are a lot of occurrences, and the inner list comprehension has n
complexity so the overall complexity is not so good.
While I was writing this final comment, someone suggested to iterate on unique elements, which is good, so turn that input list to a set
!
Upvotes: 4