Reputation: 37
I have a list of two-element lists: the first element is the word and the second is the file the word comes from. Now I need to merge the entries that share the same word, so that each word is followed by the names of all the files it appears in.
E.G. input([['word1', 'F1.txt'], ['word1', 'F2.txt'], ['word2', 'F1.txt'], ['word2', 'F2.txt'], ['word3', 'F1.txt'], ['word3', 'F2.txt'], ['word4', 'F2.txt']])
should output [['word1', 'F1.txt', 'F2.txt'], ['word2', 'F1.txt', 'F2.txt'], ['word3', 'F1.txt', 'F2.txt'], ['word4', 'F2.txt']]
Can you give me some tips on how to do this?
Upvotes: 1
Views: 143
Reputation: 309
It is possible to use an OrderedDict to solve this. It is a dictionary that allows iteration in the order in which keys were added.
import collections
def remove_dups_pairs(data):
    """Group file names by word, dropping repeated (word, file) pairs.

    Args:
        data: Iterable of two-item ``[word, file_name]`` pairs.

    Returns:
        A list of ``[word, file_1, file_2, ...]`` lists. Words appear in the
        order they are first seen in ``data``; each word's files appear in
        first-seen order without repeats.
    """
    word_files = collections.OrderedDict()
    for word, file_name in data:
        # setdefault creates the list the first time a word is seen and
        # returns the existing list afterwards, so one call covers both cases.
        files = word_files.setdefault(word, [])
        if file_name not in files:
            files.append(file_name)
    return [[word] + files for word, files in word_files.items()]
# Demo 1: input containing a repeated ("water", "elem.txt") pair.
elements_sample = [
    ["fire", "elem.txt"],
    ["fire", "things.txt"],
    ["water", "elem.txt"],
    ["water", "elem.txt"],
    ["water", "nature.txt"],
]
print(remove_dups_pairs(elements_sample))

# Demo 2: the example input from the question.
question_sample = [
    ['word1', 'F1.txt'],
    ['word1', 'F2.txt'],
    ['word2', 'F1.txt'],
    ['word2', 'F2.txt'],
    ['word3', 'F1.txt'],
    ['word3', 'F2.txt'],
    ['word4', 'F2.txt'],
]
print(remove_dups_pairs(question_sample))
Output:
[['fire', 'elem.txt', 'things.txt'], ['water', 'elem.txt', 'nature.txt']]
[['word1', 'F1.txt', 'F2.txt'], ['word2', 'F1.txt', 'F2.txt'], ['word3', 'F1.txt', 'F2.txt'], ['word4', 'F2.txt']]
Upvotes: 0
Reputation: 99
Alternatively, you can do it as below if you do not wish to use defaultdict (here `data` is the input list of [word, file] pairs):
# Shared result accumulator: one list per word, shaped
# [word, file_1, file_2, ...]. It starts with a single empty group.
inner = [[]]
# Index in `inner` of the group currently being filled.
count = 0


def loockup(data, i, count):
    """Add to ``inner[count]`` the files of later pairs sharing ``data[i]``'s word.

    Args:
        data: Sequence of ``[word, file_name]`` pairs.
        i: Index of the reference pair; only pairs after it are scanned.
        count: Index of the target group inside the module-level ``inner``.

    Returns:
        The module-level ``inner`` list, which is mutated in place.
    """
    word = data[i][0]  # loop-invariant, so read it once
    for j in range(i + 1, len(data)):
        # Only append a file that is not already present in the group.
        if data[j][0] == word and data[j][1] not in inner[count]:
            inner[count].append(data[j][1])
    return inner
# Build one [word, file_1, file_2, ...] group per distinct word in `data`.
# NOTE: `data` is not defined in this snippet; it must already exist as a
# list of [word, file_name] pairs (presumably the question's input).
for i, pair in enumerate(data):
    word = pair[0]
    # Compare against the leading word of every existing group. Testing
    # membership in the current group alone would confuse a word with a file
    # name and would open a duplicate group for a non-adjacent repeat.
    if any(group and group[0] == word for group in inner):
        # The first occurrence already gathered every later file for it.
        continue
    if inner[count]:
        # The current group is in use; open a fresh one for this new word.
        count += 1
        inner.append([])
    inner[count].append(word)
    inner[count].append(pair[1])
    loockup(data, i, count)
print(inner)
Upvotes: 2
Reputation: 61910
You could use a set and the defaultdict:
from collections import defaultdict
def remove_dups_pairs(lst):
    """Group file names by word, ignoring repeated (word, file) pairs.

    Duplicates are removed by passing the pairs through a ``set``, so the
    order of the result does not follow the order of ``lst``.

    Args:
        lst: Iterable of ``[word, file]`` pairs; each pair is converted to a
            tuple and must therefore contain hashable items.

    Returns:
        A list of ``[word, file_1, file_2, ...]`` lists in unspecified order.
    """
    unique_pairs = set(map(tuple, lst))
    d = defaultdict(list)
    for word, file in unique_pairs:
        d[word].append(file)
    return [[key] + values for key, values in d.items()]
# Demo: input containing a repeated ("water", "elem.txt") pair.
sample_pairs = [
    ["fire", "elem.txt"],
    ["fire", "things.txt"],
    ["water", "elem.txt"],
    ["water", "elem.txt"],
    ["water", "nature.txt"],
]
print(remove_dups_pairs(sample_pairs))
Output
[['fire', 'elem.txt', 'things.txt'], ['water', 'elem.txt', 'nature.txt']]
As @ShmulikA mentioned, a set does not preserve ordering. If you need to preserve the ordering, you can do it like this:
def remove_dups_pairs(lst):
    """Group file names by word, keeping the order of first appearance.

    Args:
        lst: Iterable of two-item ``[word, file]`` pairs with hashable items.

    Returns:
        A list of ``[word, file_1, file_2, ...]`` lists. Each word's files
        are in first-seen order; the words themselves follow first-seen order
        on Python 3.7+, where dicts keep insertion order.
    """
    d = defaultdict(list)
    seen = set()  # (word, file) pairs already recorded
    for word, file in lst:
        pair = (word, file)
        if pair in seen:
            continue
        seen.add(pair)
        d[word].append(file)
    return [[key] + values for key, values in d.items()]
# Demo: input containing a repeated ("water", "elem.txt") pair.
ordered_sample = [
    ["fire", "elem.txt"],
    ["fire", "things.txt"],
    ["water", "elem.txt"],
    ["water", "elem.txt"],
    ["water", "nature.txt"],
]
print(remove_dups_pairs(ordered_sample))
Output
[['fire', 'elem.txt', 'things.txt'], ['water', 'elem.txt', 'nature.txt']]
Upvotes: 4
Reputation: 3744
from collections import defaultdict
def remove_dups_pairs_ordered(lst):
    """Group file names by word, keeping the order of first appearance.

    Args:
        lst: Iterable of two-item ``[word, file]`` pairs with hashable items.

    Returns:
        A list of ``[word, file_1, file_2, ...]`` lists. Each word's files
        are in first-seen order; the words themselves follow first-seen order
        on Python 3.7+, where dicts keep insertion order.
    """
    d = defaultdict(list)
    # stores the (word, file) pairs we have already seen
    seen = set()
    for word, file in lst:
        key = (word, file)
        # skip a (word, file) pair we have already recorded
        if key in seen:
            continue
        seen.add(key)
        d[word].append(file)

    # convert the dict word -> [f1, f2, ...] into
    # a list of lists [[word1, f1, f2, ...], [word2, f1, f2, ...], ...]
    return [[word] + files for word, files in d.items()]
# Demo: `lst` is the sample input list defined elsewhere in this answer.
ordered_result = remove_dups_pairs_ordered(lst)
print(ordered_result)
outputs:
[['fire', 'elem.txt', 'things.txt'], ['water', 'elem.txt', 'nature.txt']]
from collections import defaultdict
def remove_dups_pairs(lst):
    """Group file names by word, using a set per word to drop repeats.

    Args:
        lst: Iterable of items whose first element is the word and whose
            second element is a hashable file name.

    Returns:
        A list of ``[word, file_1, file_2, ...]`` lists. Because the files
        are collected in sets, their order within each list is unspecified.
    """
    d = defaultdict(set)
    for item in lst:
        # Adding to a set silently ignores a file already recorded for the word.
        d[item[0]].add(item[1])
    return [[word] + list(files) for word, files in d.items()]
# Sample input: [word, file] pairs, including one repeated
# ("water", "elem.txt") entry.
lst = [
    ["fire", "elem.txt"],
    ["fire", "things.txt"],
    ["water", "elem.txt"],
    ["water", "elem.txt"],
    ["water", "nature.txt"],
]
# Demo: group the sample pairs and show the result.
grouped = remove_dups_pairs(lst)
print(grouped)
outputs:
[['fire', 'things.txt', 'elem.txt'], ['water', 'nature.txt', 'elem.txt']]
Upvotes: 1