Reputation: 165
In the example below, I need to remove only the third "animale" which is alone in the string. How can I do that?
a = 'animale animale eau toilette animale'
Second "animale": don't remove
Third "animale": remove
Upvotes: 1
Views: 644
Reputation: 4893
This one works for both:
'animale animale eau toilette animale'
and
'animale animale eau toilette animale eau eau'
Here's the code:
from collections import Counter
def cleanup(words):
    """Remove repeated words that have no identical neighbour.

    The text is split on whitespace. A word is dropped when it occurs more
    than once in the text and neither the word before it nor the word after
    it is the same word. Words that occur only once, and words that sit in
    a run of identical words, are kept.

    Note that a repeated word which is never adjacent to a copy of itself
    loses *every* occurrence: ``'a b a'`` becomes ``'b'``.

    Args:
        words: The text to clean.

    Returns:
        The surviving words joined with single spaces.
    """
    tokens = words.split()
    counts = Counter(tokens)
    last = len(tokens) - 1

    def is_orphan(index):
        # An orphan is a repeated word with a different word (or the edge
        # of the text) on both sides.
        token = tokens[index]
        if counts[token] < 2:
            return False
        if index > 0 and tokens[index - 1] == token:
            return False
        if index < last and tokens[index + 1] == token:
            return False
        return True

    return ' '.join(
        token for index, token in enumerate(tokens) if not is_orphan(index)
    )
# Demo calls; print(...) with a single argument behaves the same on
# Python 2 and Python 3.
print(cleanup('animale animale eau toilette animale'))
print(cleanup('animale animale eau toilette animale eau eau'))
Result:
animale animale eau toilette
animale animale toilette eau eau
Upvotes: 0
Reputation: 8510
how about this
from collections import defaultdict
def remove_no_adjacent_duplicates(string):
    """Drop duplicate words unless they appear side by side.

    The text is split on whitespace and the positions of every word are
    collected. For each word:

    * if any two of its positions are consecutive, only the positions that
      belong to such adjacent pairs are kept;
    * otherwise only its first position is kept.

    Args:
        string: The text to clean.

    Returns:
        The words at the kept positions, in their original order, joined
        with single spaces.
    """
    words = string.split()

    positions = defaultdict(list)
    for index, word in enumerate(words):
        positions[word].append(index)

    # Positions are unique across words, so one set of indexes to keep is
    # enough; nothing is reassigned on the dict while it is being iterated.
    keep = set()
    for indexes in positions.values():
        adjacent = set()
        for previous, current in zip(indexes, indexes[1:]):
            if previous + 1 == current:
                adjacent.update((previous, current))
        # No adjacent pair at all: fall back to the first occurrence only.
        keep.update(adjacent or indexes[:1])

    return " ".join(word for index, word in enumerate(words) if index in keep)
# Demo: run the function on each sample sentence and print the result.
for sample in (
    'animale animale eau toilette animale',
    'animale animale eau toilette animale eau eau',
    'animale eau toilette animale eau eau',
    'animale eau toilette animale eau de eau de toilette',
):
    print(remove_no_adjacent_duplicates(sample))
output
animale animale eau toilette
animale animale toilette eau eau
animale toilette eau eau
animale eau toilette de
explanation
First I record the positions of each word in the position dict. Then, for each word, I check whether any of its positions are adjacent; if they are, I save both positions of every adjacent pair in a set. Once that is done, if any adjacent positions were found I replace the word's list of positions with that set; otherwise I discard every saved position except the first. Finally, I rebuild the string from the words whose positions were kept.
Upvotes: 1
Reputation: 3224
If I understand your question correctly, you want to remove any occurrences of words that are duplicates but not adjacent. I think this solution works for that:
from collections import defaultdict
def remove_duplicates(s):
    """Remove occurrences of repeated words that are not adjacent.

    The text is split on whitespace and each word's frequency is counted.
    Walking the words from left to right:

    * a word that occurs exactly once is kept;
    * a repeated word immediately followed by the same word is kept as a
      pair, and every other occurrence of that word is dropped;
    * a repeated word with no identical follower is skipped while more than
      two occurrences are still outstanding, and kept once when two or
      fewer remain.

    NOTE(review): only the first adjacent pair of a word is emitted, so
    ``'a a a'`` yields ``'a a'``.

    Args:
        s: The text to clean.

    Returns:
        The kept words joined with single spaces; an empty string when
        ``s`` contains no words.
    """
    words = s.split()
    if not words:
        # Without this guard the final ``words[-1]`` lookup raises IndexError.
        return ''

    # Occurrences still outstanding per word; -1 marks "already emitted".
    remaining = defaultdict(int)
    for word in words:
        remaining[word] += 1

    result = []
    # Pair every word except the last with its follower.
    for curr_word, next_word in zip(words, words[1:]):
        count = remaining[curr_word]
        if count == 1:
            result.append(curr_word)
            remaining[curr_word] = -1
        elif count > 1:
            if next_word == curr_word:
                # Emit both halves of the adjacent pair now; the -1 marker
                # makes the loop ignore the second half on the next pass.
                result.extend((curr_word, curr_word))
                remaining[curr_word] = -1
            elif count < 3:
                # Two occurrences left and they are not adjacent: keep one.
                result.append(curr_word)
                remaining[curr_word] = -1
            else:
                # Not adjacent and more than two left: skip this occurrence.
                remaining[curr_word] -= 1

    # The loop never visits the last word, so handle a unique last word here.
    if remaining[words[-1]] == 1:
        result.append(words[-1])
    return ' '.join(result)
I think this works for any case where the repeated words aren't adjacent including @Dartmouth's example of 'animale animale eau toilette animale eau eau'.
Sample inputs and outputs:
Inputs Outputs
============================================= =========================================
'animale animale eau toilette animale' ----> 'animale animale eau toilette'
'animale animale eau toilette animale eau eau' ----> 'animale animale toilette eau eau'
'animale eau toilette animale eau eau' ----> 'animale toilette eau eau'
'animale eau toilette animale eau de eau de toilette' ----> 'animale toilette eau de'
'animale animale eau toilette animale eau eau compte' ----> 'animale animale toilette eau eau compte'
Upvotes: 0
Reputation: 1089
def clean_words(text):
    """Return the words of *text* with stray repeated words removed.

    The text is split on whitespace. A word is kept when it is directly
    next to an identical word (on either side), or when the same word has
    not been kept earlier. Any other occurrence is dropped.

    Args:
        text: The text to clean.

    Returns:
        A list of the kept words in their original order.
    """
    words = text.split()
    last = len(words) - 1
    kept = []
    seen = set()  # words already kept, for constant-time membership tests
    for i, word in enumerate(words):
        follows_copy = i > 0 and words[i - 1] == word
        precedes_copy = i < last and words[i + 1] == word
        if follows_copy or precedes_copy or word not in seen:
            kept.append(word)
            seen.add(word)
    return kept


a = "animale animale eau toilette animale"
words = a.split()
cleaned_words = clean_words(a)
print(cleaned_words)
Quite an ugly, rough solution, but it gets the job done.
Upvotes: 0