Reputation:
Can someone please explain to me, why I am getting a different result when I remove the lines containing '# duplicate'?
import re
def nysiis(term: str) -> str:
"""
returns New York State Identification and Intelligence Algorithm (NYSIIS) code for given term
"""
if not len(term):
return ''
else:
term = term.upper()
table = {
r'\W+': '', # remove whitespace and non-word characters
r'^MAC': 'MCC',
r'^KN': 'NN',
r'K': 'C',
r'PH|PF': 'FF',
r'SCH': 'SSS',
r'(EE|IE)$': 'Y',
r'(DT|ND|NT|RD|RT)$': 'D',
# From now on first letter must no longer change.
r'(?<!^)EV': 'AF',
r'(?<!^)[AEIOU]': 'A',
r'(?<!^)Q': 'G',
r'(?<!^)Z': 'S',
r'(?<!^)(?:M|KN)': 'N',
r'(?<!^)([^AEIOUY])H': r'\1',
r'(?<!^)(.)H[^AEIOUY]': r'\1',
r'(?<!^)([AEIOUY])W': r'\1',
r'AY$': r'Y',
r'S$': r'',
r'(\w)\1+': r'\1', # original
r'A+$': r'' # original
}
for k, v in table.items():
term = re.sub(k, v, term)
table = { # duplicate
r'(\w)\1+': r'\1', # duplicate
r'A+$': r'' # duplicate
} # duplicate
for k, v in table.items(): # duplicate
term = re.sub(k, v, term) # duplicate
return term
if __name__ == '__main__':
names = [
'Bishop', 'Carlson', 'Carr', 'Chapman', 'Franklin',
'Greene', 'Harper', 'Jacobs', 'Larson', 'Lawrence',
'Lawson', 'Louis, XVI', 'Lynch', 'Mackenzie', 'Matthews',
'McCormack', 'McDaniel', 'McDonald', 'Mclaughlin', 'Morrison',
"O'Banion", "O'Brien", 'Richards', 'Silva', 'Watkins',
'Wheeler', 'Willis', 'brown, sr', 'browne, III', 'browne, IV',
'knight', 'mitchell', "o'daniel",
]
for name in names:
print('%15s: %s' % (name, nysiis(name)))
Upvotes: 1
Views: 67
Reputation: 375574
You don't want to use a dict for your substitutions: the order of iteration is not the same as the order you listed them in. If you change your dicts to a list of pairs instead, then it works as you expect.
table = [
(r'\W+', ''),
#...
]
for k, v in table:
...
Upvotes: 1