Find differences in list of strings

input list:

['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']

The strings are variables concatenated with a dot. So in first string, variables are A B C D, but in last string variable are A B E GG (variables can be of varying length), but the strings will always have the same number of variables separated by the dot. I would like to group those strings together which have only one different variable. So above would produce 2 groups.

['A.B.C.D','A.A.C.D'] ['A.B.E.F','A.B.E.GG']

The the difference has to be in the same variable. not one difference across all variables. and each string should be included only once, not multiple times across groups.

Real data could have up to 20 variables (but all items in each list will always have the same number of variables), and lists could have several minion strings, so performance is also a concern.

I wrote an algorithm which does it but is too complicated. I also tried via itertools groupby but could not get it to produce the correct results:

import itertools
import difflib

class Grouper:
    def __init__(self, diff):
        self.last = None
        self.diff = diff
    def get_diffs(self, curr_key):
        if self.last == None:
            return []
        dims = curr_key.split('.')
        previous_dims = self.last.split('.')
        diffs = list((dm for dm in difflib.ndiff(dims, previous_dims) if '-' not in dm and '?' not in dm))
        return [n for n, d in enumerate(diffs) if '+' in d]
    
    def __call__(self, item):
        result = self.get_diffs(item)
        print(item)
        self.last = item
        return result


grp = Grouper(data)
groups = []
uniquekeys = []

for k, g in itertools.groupby(data, grp):
    groups.append(list(g))      # Store group iterator as a list
    uniquekeys.append(k)

UPDATE:

More sample input:

['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']

Upvotes: 2

Answers (3)

Ajax1234

Reputation: 71451

You can use a generator function:

from collections import defaultdict
def m(a, b):
   return max(map(len, [a - b, b - a])) < 2

def options(l, ind, a):
   for i in a:
      yield '.'.join(l[:ind]+[i]+l[ind+1:])

def groups(d, ch):
   d1, v, last_ind, ind_set = defaultdict(list), [], None, set()
   for i in d:
      d1[i].append(i.split('.'))
   d1 = dict(d1)
   while d1:
      if not v:
         v.extend([(d[0], i) for i in d1.pop(d[0])])
      elif not ind_set:
         k1 = iter(d1)
         while (n:=next(k1)) is not None and not m(set(v[0][-1]), set(n.split('.'))):
            pass
         if n is not None:
             v.extend([(n, i) for i in d1.pop(n)])
             (last_ind, ind_set), *_ = [(i, j) for i, a in enumerate(zip(*[j for _, j in v])) if len((j:=set(a))) != 1]
         else:
             yield [x for x, _ in v]
             n = next(iter(d1))
             v = [(n, i) for i in d1.pop(n)]
             ind_set, last_ind = set(), None
      else:
         if not (o:=[i for i in options(v[0][-1], last_ind, ch[last_ind] - ind_set) if i in d1]):
             yield [x for x, _ in v]
             n = next(iter(d1))
             v = [(n, i) for i in d1.pop(n)]
             ind_set, last_ind = set(), None
         else:
             v.extend([(y, i) for y in o for i in d1.pop(y)])
   yield from ([] if not v else [[x for x, _ in v]])

def get_groups(d):
   ch = list(map(set, zip(*[i.split('.') for i in d])))
   return list(groups(d, ch))

print(get_groups(['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']))
print(get_groups(["A.B.C", "A.B.D", "A.E.D"]))
print(get_groups(['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']))

Output:

[['A.A.C.D', 'A.B.C.D'], ['A.B.E.F', 'A.B.E.GG']]
[['A.B.C', 'A.B.D'], ['A.E.D']] 
[['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C'], ['D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.8.6.A.1.B.C']]

Timings:

import string, time
import contextlib, itertools as it
r_s = map('.'.join, it.product(*[string.ascii_uppercase[:10] if not i%2 else string.digits for i in range(6)]))
@contextlib.contextmanager
def timeit(f):
    t = time.time()
    yield 
    print(f'{f} time: {time.time() - t}')

with timeit('groups'):
   _ = [*get_groups([*r_s])]

Output:

groups time: 43.55898189544678

Upvotes: 1

Stef

Reputation: 15505

Solving the set cover problem using a greedy algorithm:

string_list = ['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG', 'A.B.C.F']
list_list = [s.split('.') for s in string_list]
# [['A', 'B', 'C', 'D'], ['A', 'A', 'C', 'D'], ...]

universe = set(range(len(string_list)))
# {0, 1, 2, 3, 4}
# represents string_list by index
# for instance, 3 represents 'A.B.E.GG'

subsets = {frozenset(j for j,l in enumerate(list_list) if l[:i]==l0[:i] and l[i+1:]==l0[i+1:]) for l0 in list_list for i in range(len(l0))}
# {{2}, {2, 3}, {0, 1}, {0, 4}, {3}, {2, 4}, {1}, {4}, {0}}
# represents possible groupings by index
# for instance, {2, 3} represents {'A.B.E.F', 'A.B.E.GG'}

solution = []
while universe:
  max_subset = max(subsets, key=len)
  solution.append(max_subset)
  universe -= max_subset
  subsets.remove(max_subset)
  subsets = {subset & universe for subset in subsets}

print(solution)
# {{2, 3}, {0, 1}, {4}}

clusters = [[string_list[i] for i in subset] for subset in solution]
print(clusters)
# [['A.B.E.F', 'A.B.E.GG'], ['A.B.C.D', 'A.A.C.D'], ['A.B.C.F']]

Upvotes: 1

Be Chiller Too

Reputation: 2910

I added another string "A.B.C.F", this string can be matched with "A.B.C.D" and "A.B.E.F":

import itertools


def main(l: list) -> list:
    splits = [tuple(s.split(".")) for s in l]

    groups = {}
    # Dict of {(tuple, index of difference): list of tuples that match the key}

    for split in splits:
        matched = False
        for (t, i) in groups.keys():
            # We match two splits if they only have one difference
            diffs = [i for i in range(len(split)) if split[i] != t[i]]
            if len(diffs) == 1:
                # The strings match but is the first match of 't'?
                if i is None:
                    groups.pop((t, i))
                    groups[(t, diffs[0])] = [split]
                    # We found a match, stop the matching of this tuple
                    matched = True
                    break
                elif diffs[0] == i:
                    groups[(t, i)].append(split)
                    matched = True
                    break
        if not matched:
            # Let's add this split as a candidate for future splits
            groups[(split, None)] = []

    return [[".".join(k)] + [".".join(s) for s in v] for (k, i), v in groups.items()]


print(main(["A.B.C.D", "A.A.C.D", "A.B.E.F", "A.B.E.GG", "A.B.C.F"]))
# [['A.B.C.D', 'A.A.C.D'], ['A.B.E.F', 'A.B.E.GG'], ['A.B.C.F']]
print(main(["A.B.C", "A.B.D", "A.E.D"]))
# [['A.B.C', 'A.B.D'], ['A.E.D']]

Upvotes: 3

Find differences in list of strings

Answers (3)

Related Questions