Reputation: 1707
input list:
['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']
The strings are variables concatenated with a dot. So in first string, variables are A B C D, but in last string variable are A B E GG (variables can be of varying length), but the strings will always have the same number of variables separated by the dot. I would like to group those strings together which have only one different variable. So above would produce 2 groups.
['A.B.C.D','A.A.C.D'] ['A.B.E.F','A.B.E.GG']
The the difference has to be in the same variable. not one difference across all variables. and each string should be included only once, not multiple times across groups.
Real data could have up to 20 variables (but all items in each list will always have the same number of variables), and lists could have several minion strings, so performance is also a concern.
I wrote an algorithm which does it but is too complicated. I also tried via itertools groupby but could not get it to produce the correct results:
import itertools
import difflib
class Grouper:
def __init__(self, diff):
self.last = None
self.diff = diff
def get_diffs(self, curr_key):
if self.last == None:
return []
dims = curr_key.split('.')
previous_dims = self.last.split('.')
diffs = list((dm for dm in difflib.ndiff(dims, previous_dims) if '-' not in dm and '?' not in dm))
return [n for n, d in enumerate(diffs) if '+' in d]
def __call__(self, item):
result = self.get_diffs(item)
print(item)
self.last = item
return result
grp = Grouper(data)
groups = []
uniquekeys = []
for k, g in itertools.groupby(data, grp):
groups.append(list(g)) # Store group iterator as a list
uniquekeys.append(k)
UPDATE:
More sample input:
['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']
Upvotes: 2
Views: 153
Reputation: 71451
You can use a generator function:
from collections import defaultdict
def m(a, b):
return max(map(len, [a - b, b - a])) < 2
def options(l, ind, a):
for i in a:
yield '.'.join(l[:ind]+[i]+l[ind+1:])
def groups(d, ch):
d1, v, last_ind, ind_set = defaultdict(list), [], None, set()
for i in d:
d1[i].append(i.split('.'))
d1 = dict(d1)
while d1:
if not v:
v.extend([(d[0], i) for i in d1.pop(d[0])])
elif not ind_set:
k1 = iter(d1)
while (n:=next(k1)) is not None and not m(set(v[0][-1]), set(n.split('.'))):
pass
if n is not None:
v.extend([(n, i) for i in d1.pop(n)])
(last_ind, ind_set), *_ = [(i, j) for i, a in enumerate(zip(*[j for _, j in v])) if len((j:=set(a))) != 1]
else:
yield [x for x, _ in v]
n = next(iter(d1))
v = [(n, i) for i in d1.pop(n)]
ind_set, last_ind = set(), None
else:
if not (o:=[i for i in options(v[0][-1], last_ind, ch[last_ind] - ind_set) if i in d1]):
yield [x for x, _ in v]
n = next(iter(d1))
v = [(n, i) for i in d1.pop(n)]
ind_set, last_ind = set(), None
else:
v.extend([(y, i) for y in o for i in d1.pop(y)])
yield from ([] if not v else [[x for x, _ in v]])
def get_groups(d):
ch = list(map(set, zip(*[i.split('.') for i in d])))
return list(groups(d, ch))
print(get_groups(['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']))
print(get_groups(["A.B.C", "A.B.D", "A.E.D"]))
print(get_groups(['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']))
Output:
[['A.A.C.D', 'A.B.C.D'], ['A.B.E.F', 'A.B.E.GG']]
[['A.B.C', 'A.B.D'], ['A.E.D']]
[['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C'], ['D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.8.6.A.1.B.C']]
Timings:
import string, time
import contextlib, itertools as it
r_s = map('.'.join, it.product(*[string.ascii_uppercase[:10] if not i%2 else string.digits for i in range(6)]))
@contextlib.contextmanager
def timeit(f):
t = time.time()
yield
print(f'{f} time: {time.time() - t}')
with timeit('groups'):
_ = [*get_groups([*r_s])]
Output:
groups time: 43.55898189544678
Upvotes: 1
Reputation: 15505
Solving the set cover problem using a greedy algorithm:
string_list = ['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG', 'A.B.C.F']
list_list = [s.split('.') for s in string_list]
# [['A', 'B', 'C', 'D'], ['A', 'A', 'C', 'D'], ...]
universe = set(range(len(string_list)))
# {0, 1, 2, 3, 4}
# represents string_list by index
# for instance, 3 represents 'A.B.E.GG'
subsets = {frozenset(j for j,l in enumerate(list_list) if l[:i]==l0[:i] and l[i+1:]==l0[i+1:]) for l0 in list_list for i in range(len(l0))}
# {{2}, {2, 3}, {0, 1}, {0, 4}, {3}, {2, 4}, {1}, {4}, {0}}
# represents possible groupings by index
# for instance, {2, 3} represents {'A.B.E.F', 'A.B.E.GG'}
solution = []
while universe:
max_subset = max(subsets, key=len)
solution.append(max_subset)
universe -= max_subset
subsets.remove(max_subset)
subsets = {subset & universe for subset in subsets}
print(solution)
# {{2, 3}, {0, 1}, {4}}
clusters = [[string_list[i] for i in subset] for subset in solution]
print(clusters)
# [['A.B.E.F', 'A.B.E.GG'], ['A.B.C.D', 'A.A.C.D'], ['A.B.C.F']]
Upvotes: 1
Reputation: 2910
I added another string "A.B.C.F", this string can be matched with "A.B.C.D" and "A.B.E.F":
import itertools
def main(l: list) -> list:
splits = [tuple(s.split(".")) for s in l]
groups = {}
# Dict of {(tuple, index of difference): list of tuples that match the key}
for split in splits:
matched = False
for (t, i) in groups.keys():
# We match two splits if they only have one difference
diffs = [i for i in range(len(split)) if split[i] != t[i]]
if len(diffs) == 1:
# The strings match but is the first match of 't'?
if i is None:
groups.pop((t, i))
groups[(t, diffs[0])] = [split]
# We found a match, stop the matching of this tuple
matched = True
break
elif diffs[0] == i:
groups[(t, i)].append(split)
matched = True
break
if not matched:
# Let's add this split as a candidate for future splits
groups[(split, None)] = []
return [[".".join(k)] + [".".join(s) for s in v] for (k, i), v in groups.items()]
print(main(["A.B.C.D", "A.A.C.D", "A.B.E.F", "A.B.E.GG", "A.B.C.F"]))
# [['A.B.C.D', 'A.A.C.D'], ['A.B.E.F', 'A.B.E.GG'], ['A.B.C.F']]
print(main(["A.B.C", "A.B.D", "A.E.D"]))
# [['A.B.C', 'A.B.D'], ['A.E.D']]
Upvotes: 3