Reputation: 2351
I have a dictionary of lists. Each list is a sequence of numbers. No two lists are equal but two or more lists may start with the same sequence of numbers (see my example input below). What I want to do is find these common sequences and make them a new element in the dictionary.
Sample input:
sequences = {
18: [1, 3, 5, 6, 8, 12, 15, 17, 18],
19: [1, 3, 5, 6, 9, 13, 14, 16, 19],
25: [1, 3, 5, 6, 9, 13, 14, 20, 25],
11: [0, 2, 4, 7, 11],
20: [0, 2, 4, 10, 20],
26: [21, 23, 26],
}
Sample output:
expected_output = {
6: [1, 3, 5, 6],
18: [8, 12, 15, 17, 18],
14: [9, 13, 14],
19: [16, 19],
25: [20, 25],
4: [0, 2, 4],
11: [7, 11],
20: [10, 20],
26: [21, 23, 26],
}
The key of each list is its last element. Order doesn't matter.
I have a working code. However, it's very messy. Could somebody suggest a simpler/cleaner solution?
from collections import Counter
def split_lists(sequences):
# get first elem from each sequence
firsts = list(map(lambda s: s[0], sequences))
# get non-duplicate first elements
not_duplicates = list(map(lambda c: c[0], filter(lambda c: c[1] == 1, Counter(firsts).items())))
# start the new_sequences with the non-duplicate lists
new_sequences = dict(map(lambda s: (s[-1], s), filter(lambda s: s[0] in not_duplicates, sequences)))
# get duplicate first elements
duplicates = list(map(lambda c: c[0], filter(lambda c: c[1] > 1, Counter(firsts).items())))
for duplicate in duplicates:
# get all lists that start with the duplicate element
duplicate_lists = list(filter(lambda s: s[0] == duplicate, sequences))
# get the common elements from the duplicate lists and make it a new
# list to add to our new_sequences dict
repeated_sequence = sorted(list(set.intersection(*list(map(set, duplicate_lists)))))
new_sequences[repeated_sequence[-1]] = repeated_sequence
# get lists from where I left of
i = len(repeated_sequence)
sub_lists = list(filter(lambda s: len(s) > 0, map(lambda s: s[i:], duplicate_lists)))
# recursively split them and store them in new_sequences
new_sequences.update(split_lists(sub_lists))
return new_sequences
Also, could you help me figure out the complexity of my algo? Recursion makes me dizzy. My best guess is O(n*m)
where n
is the number of lists and m
the length of the longest list.
Upvotes: 6
Views: 1222
Reputation: 31339
Using some functional tools this is what I came up with (assuming sequences are sorted). The gist is in find_longest_prefixes
:
#!/usr/bin/env python
from itertools import chain, takewhile
from collections import defaultdict
sequences = {
18: [1, 3, 5, 6, 8, 12, 15, 17, 18],
19: [1, 3, 5, 6, 9, 13, 14, 16, 19],
25: [1, 3, 5, 6, 9, 13, 14, 20, 25],
11: [0, 2, 4, 7, 11],
20: [0, 2, 4, 10, 20],
26: [21, 23, 26],
}
def flatmap(f, it):
return chain.from_iterable(map(f, it))
def all_items_equal(items):
return len(set(items)) == 1
def group_by_first_item(ls):
groups = defaultdict(list)
for l in ls:
groups[l[0]].append(l)
return groups.values()
def find_longest_prefixes(ls):
# takewhile gives us common prefix easily
longest_prefix = list(takewhile(all_items_equal, zip(*ls)))
if longest_prefix:
yield tuple(vs[0] for vs in longest_prefix)
# yield suffix per iterable
leftovers = filter(None, tuple(l[len(longest_prefix):] for l in ls))
leftover_groups = group_by_first_item(leftovers)
yield from flatmap(find_longest_prefixes, leftover_groups)
# apply the prefix finder to all groups
all_sequences = find_longest_prefixes(sequences.values())
# create the result structure expected
results = {v[-1]: v for v in all_sequences}
print(results)
The value for result is then:
{4: (0, 2, 4),
6: (1, 3, 5, 6),
11: (7, 11),
18: (8, 12, 15, 17, 18),
19: (9, 13, 14, 16, 19),
20: (10, 20),
25: (9, 13, 14, 20, 25),
26: (21, 23, 26)}
Note that these are tuples which I prefer for their immutability.
Upvotes: 2
Reputation: 7058
seperate this into logical functions:
Can be done easily with a defaultdict
from collections import defaultdict
def same_start(sequences):
same_start = defaultdict(list)
for seq in sequences:
same_start[seq[0]].append(seq)
return same_start.values()
list(same_start(sequences.values()))
[[[1, 3, 5, 6, 8, 12, 15, 17, 18],
[1, 3, 5, 6, 9, 13, 14, 16, 19],
[1, 3, 5, 6, 9, 13, 14, 20, 25]],
[[0, 2, 4, 7, 11], [0, 2, 4, 10, 20]],
[[21, 23, 26]]]
a simple generator that yields values as long as they are all the same
def get_beginning(sequences):
for values in zip(*sequences):
v0 = values[0]
if not all(i == v0 for i in values):
return
yield v0
def aggregate(same_start):
for seq in same_start:
if len(seq) < 2:
yield seq[0]
continue
start = list(get_beginning(seq))
yield start
yield from (i[len(start):] for i in seq)
list(aggregate(same_start(sequences.values())))
[[1, 3, 5, 6],
[8, 12, 15, 17, 18],
[9, 13, 14, 16, 19],
[9, 13, 14, 20, 25],
[0, 2, 4],
[7, 11],
[10, 20],
[21, 23, 26]]
If you want to combine sequences 18
and 25
, then you can do something like this
def combine(sequences):
while True:
s = same_start(sequences)
if all(len(i) == 1 for i in s):
return sequences
sequences = tuple(aggregate(s))
{i[-1]: i for i in combine(sequences.values())}
{4: [0, 2, 4],
6: [1, 3, 5, 6],
11: [7, 11],
14: [9, 13, 14],
18: [8, 12, 15, 17, 18],
19: [16, 19],
20: [10, 20],
25: [20, 25],
26: [21, 23, 26]}
Upvotes: 3