Reputation: 2439
I need to extract unique names with titles such as Lord|Baroness|Lady|Baron from a text and match them against another list. I am struggling to get the right result and hope the community can help me. Thanks!
import re
def get_names(text):
    """Return the distinct "Title Firstname Surname" strings found in *text*.

    Only a title (Lord, Baroness, Lady or Baron) followed by exactly two
    capitalised words is recognised.  When several matches share the same
    first name, a single one of them is kept.  The order of the returned
    list is not defined.
    """
    # a noble title followed by two capitalised words
    title_pattern = re.compile(r'(Lord|Baroness|Lady|Baron) ([A-Z][a-z]+) ([A-Z][a-z]+)')
    unique_matches = list(set(title_pattern.findall(text)))

    # Collapse matches that share a first name (tuple index 1).  Matches are
    # visited in title order, so a later title overwrites an earlier one.
    by_first_name = {}
    for found in sorted(unique_matches, key=lambda m: m[0]):
        by_first_name[found[1]] = found

    joined = [' '.join(map(str, parts)) for parts in by_first_name.values()]
    return list(set(joined))
# Sample sentence: one "title first-name surname" mention, followed by a bare
# title and several "title single-name" mentions.
text = 'Baroness Firstname Surname and Baroness who is also known as Lady Anothername and Lady Surname or Lady Firstname.'
# Run the extraction on the sample and show the result.
names_lst = get_names(text)
print(names_lst)
This currently yields: ['Baroness Firstname Surname']
Desired output: ['Baroness Firstname Surname', 'Lady Anothername']
but NOT Lady Surname
or Lady Firstname
Then I need to match the result with this list:
other_names = ['Firstname Surname', 'James', 'Simon Smith']
and drop the element 'Firstname Surname'
from it because it matches the first name and surname of the Baroness in 'the desired output'.
Upvotes: 0
Views: 40
Reputation: 6526
I suggest the following solution:
import re
# A single name word: a capitalised word that may carry inner capitals
# (e.g. "McDonald"), so such names are not cut in two.
_NAME_TOKEN = r'[A-Z][a-z]+(?:[A-Z][a-z]+)*'

# A noble title, a first name and an optional surname.  The separating space
# sits inside the optional surname group, so a surname is only captured when
# it really is a separate word following the first name.
_TITLED_NAME_RE = re.compile(
    r'(Lord|Baroness|Lady|Baron) (' + _NAME_TOKEN + r')(?: (' + _NAME_TOKEN + r'))?'
)


def get_names(text):
    """Map each noble title in *text* to the first name that follows it.

    A name is one capitalised word, optionally followed by a second
    capitalised word separated from it by a single space.  Only the first
    occurrence of each title is kept; later occurrences of the same title
    are ignored.

    Args:
        text: The string to search.

    Returns:
        A dict mapping title (``'Lord'``, ``'Baroness'``, ``'Lady'`` or
        ``'Baron'``) to ``'Firstname'`` or ``'Firstname Surname'``.  Empty
        when no titled name is found.
    """
    first_by_title = {}
    for title, first_name, surname in _TITLED_NAME_RE.findall(text):
        # An unmatched surname group comes back as '', so drop empty parts
        # before joining.  setdefault keeps the earliest match per title.
        full_name = ' '.join(part for part in (first_name, surname) if part)
        first_by_title.setdefault(title, full_name)
    return first_by_title
text = 'Baroness Firstname Surname and Baroness who is also known as Lady Anothername and Lady Surname or Lady Firstname.'
other_names = ['Firstname Surname', 'James', 'Simon Smith']

# Extract the titled names from the sample sentence.
names_dict = get_names(text)
print(names_dict)
# {'Baroness': 'Firstname Surname', 'Lady': 'Anothername'}

# Rebuild the "Title Name" strings from the mapping.
print([f'{title} {person}' for title, person in names_dict.items()])
# ['Baroness Firstname Surname', 'Lady Anothername']

# Keep only the entries of other_names that are not one of the extracted names.
other_names_dropped = [
    candidate
    for candidate in other_names
    if candidate not in names_dict.values()
]
print(other_names_dropped)
# ['James', 'Simon Smith']
Upvotes: 1