Michael Borne

Reputation: 37

Deleting duplicates in a list of lists using a criterion

I have a list of two-element lists, where the first element is a word and the second is the name of the file the word comes from. I need to merge entries that share the same word, appending each file name to that word. E.g. the input [['word1', 'F1.txt'], ['word1', 'F2.txt'], ['word2', 'F1.txt'], ['word2', 'F2.txt'], ['word3', 'F1.txt'], ['word3', 'F2.txt'], ['word4', 'F2.txt']] should produce [['word1', 'F1.txt', 'F2.txt'], ['word2', 'F1.txt', 'F2.txt'], ['word3', 'F1.txt', 'F2.txt'], ['word4', 'F2.txt']]. Can you give me some tips on how to do this?
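For reference, one way to sketch this with itertools.groupby, assuming pairs with the same word appear consecutively in the input (as they do in the example; otherwise sort the list by word first):

```python
from itertools import groupby
from operator import itemgetter

pairs = [['word1', 'F1.txt'], ['word1', 'F2.txt'],
         ['word2', 'F1.txt'], ['word2', 'F2.txt'],
         ['word3', 'F1.txt'], ['word3', 'F2.txt'],
         ['word4', 'F2.txt']]

# Group consecutive pairs that share the same word; dict.fromkeys
# de-duplicates the file names while keeping their first-seen order.
result = [[word] + list(dict.fromkeys(f for _, f in group))
          for word, group in groupby(pairs, key=itemgetter(0))]

print(result)
# [['word1', 'F1.txt', 'F2.txt'], ['word2', 'F1.txt', 'F2.txt'],
#  ['word3', 'F1.txt', 'F2.txt'], ['word4', 'F2.txt']]
```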

Upvotes: 1

Views: 143

Answers (4)

siria

Reputation: 309

You can use an OrderedDict to solve this: a dictionary that iterates over its keys in the order they were added.

import collections

def remove_dups_pairs(data):
    word_files = collections.OrderedDict()
    for word, file_name in data:
        if word not in word_files:
            word_files[word] = [file_name]
        elif file_name not in word_files[word]:
            word_files[word].append(file_name)
    return [[word] + files for word, files in word_files.items()]


print(remove_dups_pairs([["fire", "elem.txt"], ["fire", "things.txt"],
                         ["water", "elem.txt"], ["water", "elem.txt"],
                         ["water", "nature.txt"]]))
print(remove_dups_pairs([['word1', 'F1.txt'], ['word1', 'F2.txt'],
                         ['word2', 'F1.txt'], ['word2', 'F2.txt'],
                         ['word3', 'F1.txt'], ['word3', 'F2.txt'],
                         ['word4', 'F2.txt']]))

Output:

[['fire', 'elem.txt', 'things.txt'], ['water', 'elem.txt', 'nature.txt']]
[['word1', 'F1.txt', 'F2.txt'], ['word2', 'F1.txt', 'F2.txt'], ['word3', 'F1.txt', 'F2.txt'], ['word4', 'F2.txt']]
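On Python 3.7 and later, a plain dict also preserves insertion order, so OrderedDict is not strictly needed. A sketch of the same approach using dict.setdefault:

```python
def remove_dups_pairs(data):
    word_files = {}
    for word, file_name in data:
        # setdefault returns the existing list for word, creating it if absent
        files = word_files.setdefault(word, [])
        if file_name not in files:
            files.append(file_name)
    return [[word] + files for word, files in word_files.items()]

print(remove_dups_pairs([['word1', 'F1.txt'], ['word1', 'F2.txt'],
                         ['word2', 'F1.txt'], ['word2', 'F2.txt'],
                         ['word4', 'F2.txt']]))
# [['word1', 'F1.txt', 'F2.txt'], ['word2', 'F1.txt', 'F2.txt'], ['word4', 'F2.txt']]
```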

Upvotes: 0

Happy

Reputation: 99

Alternatively, you can do it as below if you prefer not to use defaultdict (note this assumes pairs with the same word appear consecutively in the input):

data = [['word1', 'F1.txt'], ['word1', 'F2.txt'],
        ['word2', 'F1.txt'], ['word2', 'F2.txt'],
        ['word3', 'F1.txt'], ['word3', 'F2.txt'],
        ['word4', 'F2.txt']]

inner = [[]]
count = 0

def lookup(data, i, count):
    # Collect the file of every later pair with the same word,
    # skipping files already recorded for the current group.
    for j in range(i + 1, len(data)):
        if data[i][0] == data[j][0] and data[j][1] not in inner[count]:
            inner[count].append(data[j][1])
    return inner

for i in range(len(data)):
    if data[i][0] in inner[count]:
        # The word already opened the current group; merge remaining files.
        inner = lookup(data, i, count)
    else:
        # New word: start a new group holding the word and its first file.
        if i != 0:
            count += 1
            inner.append([])
        inner[count].append(data[i][0])
        inner[count].append(data[i][1])
        lookup(data, i, count)

print(inner)

Upvotes: 2

Dani Mesejo

Reputation: 61910

You could use a set and a defaultdict:

from collections import defaultdict


def remove_dups_pairs(lst):
    s = set(map(tuple, lst))
    d = defaultdict(list)
    for word, file in s:
        d[word].append(file)
    return [[key] + values for key, values in d.items()]


print(remove_dups_pairs([["fire", "elem.txt"], ["fire", "things.txt"], ["water", "elem.txt"], ["water", "elem.txt"], ["water", "nature.txt"]]))

Output

[['fire', 'elem.txt', 'things.txt'], ['water', 'elem.txt', 'nature.txt']]

As @ShmulikA mentioned, a set does not preserve ordering. If you need to preserve ordering, you can do it like this:

def remove_dups_pairs(lst):
    d = defaultdict(list)
    seen = set()
    for word, file in lst:
        if (word, file) not in seen:
            d[word].append(file)
            seen.add((word, file))

    return [[key] + values for key, values in d.items()]


print(remove_dups_pairs([["fire", "elem.txt"], ["fire", "things.txt"], ["water", "elem.txt"], ["water", "elem.txt"],
                         ["water", "nature.txt"]]))

Output

[['water', 'elem.txt', 'nature.txt'], ['fire', 'elem.txt', 'things.txt']]

Upvotes: 4

ShmulikA

Reputation: 3744

Keeping insertion order using a set of seen items:

from collections import defaultdict

def remove_dups_pairs_ordered(lst):
    d = defaultdict(list)

    # stores word,file pairs we already seen
    seen = set()
    for item in lst:
        word, file = item
        key = (word, file)

        # skip adding word,file we already seen before
        if key in seen:
            continue
        seen.add(key)
        d[word].append(file)

    # convert the dict word -> [f1, f2..] into 
    # a list of lists [[word1, f1,f2, ...], [word2, f1, f2...], ...]
    return [[word] + files for word, files in d.items()]

lst = [
    ["fire", "elem.txt"], ["fire", "things.txt"],
    ["water", "elem.txt"], ["water", "elem.txt"],
    ["water", "nature.txt"]
]

print(remove_dups_pairs_ordered(lst))

outputs:

[['fire', 'elem.txt', 'things.txt'], ['water', 'elem.txt', 'nature.txt']]


Without keeping the order, using defaultdict & a set:

from collections import defaultdict

def remove_dups_pairs(lst):
    d = defaultdict(set)

    for item in lst:
        d[item[0]].add(item[1])
    return [[word] + list(files) for word, files in d.items()]

lst = [
    ["fire","elem.txt"], ["fire","things.txt"],
    ["water","elem.txt"], ["water","elem.txt"],
    ["water","nature.txt"]
]

print(remove_dups_pairs(lst))

outputs:

[['fire', 'things.txt', 'elem.txt'], ['water', 'nature.txt', 'elem.txt']]

Upvotes: 1
