devblack.exe
devblack.exe

Reputation: 584

Find character occurrence percentage from a list of words

I would like to create a function that loops over a list of words (strings) and returns the occurrence percentage of each character (in alphabetical order) that appears in the list.

List with words as strings:

# Input data: upper-case words (appears to be the text of "The Zen of Python"
# with punctuation stripped). Note the single empty string '' between 'IDEA'
# and 'LETS'; it contains no characters.
word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

So far I have tried:

# Attempt: report how often each character occurs across all words.
# character_list collects every character of every word (duplicates kept),
# printable accumulates the "<char>: <count>, " report text, and
# character_count holds the tally for the character currently being examined.
character_list = []
printable =''
character_count = 0

# First pass: flatten the words into one list of single characters.
for word in word_list:
    for character in word:
        character_list.append(character)
        
# Second pass: for every character *position* (not every distinct character),
# rescan the whole flattened list and count the matches. Because this runs
# once per position, a letter that occurs k times is reported k times, and
# the reported value is a raw count rather than a percentage.
for word in word_list:
    for character in word:
        printable += "{}: ".format(character)
        for character_match in character_list:
            if character == character_match:
                character_count += 1
        printable += "{}, ".format(character_count)
        # Reset the tally before examining the next character.
        character_count = 0

print(printable)

This outputs each letter followed by its number of occurrences:

T: 79, H: 31, E: 92, Z: 1, E: 92, N: 42, O: 43, F: 12, P: 22, Y: 17, T: 79, H: 31, O: 43, N: 42, B: 21, Y: 17, T: 79, I: 53, M: 16, P: 22, E: 92, T: 79, E: 92, R: 33, S: 46, B: 21, E: 92, A: 53, U: 21, T: 79, I: 53, F: 12, U: 21, L: 33, I: 53, S: 46, B: 21, E: 92, T: 79, T: 79, E: 92, R: 33, T: 79, H: 31, A: 53, N: 42, U: 21, G: 11, L: 33, Y: 17, E: 92, X: 6, P: 22, L: 33, I: 53, C: 17, I: 53, T: 79, I: 53, S: 46, B: 21, E: 92, T: 79, T: 79, E: 92, R: 33, T: 79, H: 31, A: 53, N: 42, I: 53, M: 16, P: 22, L: 33, I: 53, C: 17, I: 53, T: 79, S: 46, I: 53, M: 16, P: 22, L: 33, E: 92, I: 53, S: 46, B: 21, E: 92, T: 79, T: 79, E: 92, R: 33, T: 79, H: 31, A: 53, N: 42, C: 17, O: 43, M: 16, P: 22, L: 33, E: 92, X: 6, C: 17, O: 43, M: 16, P: 22, L: 33, E: 92, X: 6, I: 53, S: 46, B: 21, E: 92, T: 79, T: 79, E: 92, R: 33, T: 79, H: 31, A: 53, N: 42, C: 17, O: 43, M: 16, P: 22, L: 33, I: 53, C: 17, A: 53, T: 79, E: 92, D: 17, F: 12, L: 33, A: 53, T: 79, I: 53, S: 46, B: 21, E: 92, T: 79, T: 79, E: 92, R: 33, T: 79, H: 31, A: 53, N: 42, N: 42, E: 92, S: 46, T: 79, E: 92, D: 17, S: 46, P: 22, A: 53, R: 33, S: 46, E: 92, I: 53, S: 46, B: 21, E: 92, T: 79, T: 79, E: 92, R: 33, T: 79, H: 31, A: 53, N: 42, D: 17, E: 92, N: 42, S: 46, E: 92, R: 33, E: 92, A: 53, D: 17, A: 53, B: 21, I: 53, L: 33, I: 53, T: 79, Y: 17, C: 17, O: 43, U: 21, N: 42, T: 79, S: 46, S: 46, P: 22, E: 92, C: 17, I: 53, A: 53, L: 33, C: 17, A: 53, S: 46, E: 92, S: 46, A: 53, R: 33, E: 92, N: 42, T: 79, S: 46, P: 22, E: 92, C: 17, I: 53, A: 53, L: 33, E: 92, N: 42, O: 43, U: 21, G: 11, H: 31, T: 79, O: 43, B: 21, R: 33, E: 92, A: 53, K: 2, T: 79, H: 31, E: 92, R: 33, U: 21, L: 33, E: 92, S: 46, A: 53, L: 33, T: 79, H: 31, O: 43, U: 21, G: 11, H: 31, P: 22, R: 33, A: 53, C: 17, T: 79, I: 53, C: 17, A: 53, L: 33, I: 53, T: 79, Y: 17, B: 21, E: 92, A: 53, T: 79, S: 46, P: 22, U: 21, R: 33, I: 53, T: 79, Y: 17, E: 92, R: 33, R: 33, O: 43, R: 33, S: 46, S: 46, H: 31, O: 43, U: 21, L: 33, D: 17, N: 42, E: 92, V: 5, E: 
92, R: 33, P: 22, A: 53, S: 46, S: 46, S: 46, I: 53, L: 33, E: 92, N: 42, T: 79, L: 33, Y: 17, U: 21, N: 42, L: 33, E: 92, S: 46, S: 46, E: 92, X: 6, P: 22, L: 33, I: 53, C: 17, I: 53, T: 79, L: 33, Y: 17, S: 46, I: 53, L: 33, E: 92, N: 42, C: 17, E: 92, D: 17, I: 53, N: 42, T: 79, H: 31, E: 92, F: 12, A: 53, C: 17, E: 92, O: 43, F: 12, A: 53, M: 16, B: 21, I: 53, G: 11, U: 21, I: 53, T: 79, Y: 17, R: 33, E: 92, F: 12, U: 21, S: 46, E: 92, T: 79, H: 31, E: 92, T: 79, E: 92, M: 16, P: 22, T: 79, A: 53, T: 79, I: 53, O: 43, N: 42, T: 79, O: 43, G: 11, U: 21, E: 92, S: 46, S: 46, T: 79, H: 31, E: 92, R: 33, E: 92, S: 46, H: 31, O: 43, U: 21, L: 33, D: 17, B: 21, E: 92, O: 43, N: 42, E: 92, A: 53, N: 42, D: 17, P: 22, R: 33, E: 92, F: 12, E: 92, R: 33, A: 53, B: 21, L: 33, Y: 17, O: 43, N: 42, L: 33, Y: 17, O: 43, N: 42, E: 92, O: 43, B: 21, V: 5, I: 53, O: 43, U: 21, S: 46, W: 4, A: 53, Y: 17, T: 79, O: 43, D: 17, O: 43, I: 53, T: 79, A: 53, L: 33, T: 79, H: 31, O: 43, U: 21, G: 11, H: 31, T: 79, H: 31, A: 53, T: 79, W: 4, A: 53, Y: 17, M: 16, A: 53, Y: 17, N: 42, O: 43, T: 79, B: 21, E: 92, O: 43, B: 21, V: 5, I: 53, O: 43, U: 21, S: 46, A: 53, T: 79, F: 12, I: 53, R: 33, S: 46, T: 79, U: 21, N: 42, L: 33, E: 92, S: 46, S: 46, Y: 17, O: 43, U: 21, R: 33, E: 92, D: 17, U: 21, T: 79, C: 17, H: 31, N: 42, O: 43, W: 4, I: 53, S: 46, B: 21, E: 92, T: 79, T: 79, E: 92, R: 33, T: 79, H: 31, A: 53, N: 42, N: 42, E: 92, V: 5, E: 92, R: 33, A: 53, L: 33, T: 79, H: 31, O: 43, U: 21, G: 11, H: 31, N: 42, E: 92, V: 5, E: 92, R: 33, I: 53, S: 46, O: 43, F: 12, T: 79, E: 92, N: 42, B: 21, E: 92, T: 79, T: 79, E: 92, R: 33, T: 79, H: 31, A: 53, N: 42, R: 33, I: 53, G: 11, H: 31, T: 79, N: 42, O: 43, W: 4, I: 53, F: 12, T: 79, H: 31, E: 92, I: 53, M: 16, P: 22, L: 33, E: 92, M: 16, E: 92, N: 42, T: 79, A: 53, T: 79, I: 53, O: 43, N: 42, I: 53, S: 46, H: 31, A: 53, R: 33, D: 17, T: 79, O: 43, E: 92, X: 6, P: 22, L: 33, A: 53, I: 53, N: 42, I: 53, T: 79, S: 46, A: 53, B: 21, A: 53, D: 
17, I: 53, D: 17, E: 92, A: 53, I: 53, F: 12, T: 79, H: 31, E: 92, I: 53, M: 16, P: 22, L: 33, E: 92, M: 16, E: 92, N: 42, T: 79, A: 53, T: 79, I: 53, O: 43, N: 42, I: 53, S: 46, E: 92, A: 53, S: 46, Y: 17, T: 79, O: 43, E: 92, X: 6, P: 22, L: 33, A: 53, I: 53, N: 42, I: 53, T: 79, M: 16, A: 53, Y: 17, B: 21, E: 92, A: 53, G: 11, O: 43, O: 43, D: 17, I: 53, D: 17, E: 92, A: 53, N: 42, A: 53, M: 16, E: 92, S: 46, P: 22, A: 53, C: 17, E: 92, S: 46, A: 53, R: 33, E: 92, O: 43, N: 42, E: 92, H: 31, O: 43, N: 42, K: 2, I: 53, N: 42, G: 11, G: 11, R: 33, E: 92, A: 53, T: 79, I: 53, D: 17, E: 92, A: 53, L: 33, E: 92, T: 79, S: 46, D: 17, O: 43, M: 16, O: 43, R: 33, E: 92, O: 43, F: 12, T: 79, H: 31, O: 43, S: 46, E: 92, 

Desired output:

A: 7.83%
B: 3.10%
C: 2.51% 
.
.
.
Z: 0.15%

Upvotes: 2

Views: 1770

Answers (3)

Awais Latif
Awais Latif

Reputation: 89

You can use a dictionary to count each character across all words while also keeping a running total of all characters. Then go through the whole dict and calculate the percentage for each character.

word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

# Count every character across all words and keep a running total of the
# characters seen, so the percentages can be derived afterwards.
character_count = 0
char_dict = {}
for word in word_list:
    for character in word:
        character_count += 1
        # dict.get supplies 0 the first time a character is encountered.
        char_dict[character] = char_dict.get(character, 0) + 1

# Map each character to its share of all characters, in percent rounded to
# two decimals. Iterating over the sorted keys inserts them alphabetically,
# and dicts keep insertion order, so the printed result is alphabetical as
# the question asks. If the input has no characters, char_dict is empty and
# no division is ever performed.
per_dict = {
    character: round(char_dict[character] / character_count * 100, 2)
    for character in sorted(char_dict)
}

print(per_dict)

Upvotes: 1

Tobi208
Tobi208

Reputation: 1376

word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

# Glue all words into one long string; only the characters matter here
word_str = ''.join(word_list)
# Denominator for the percentages: the total number of characters
word_length = len(word_str)

character_count = {}
for character in word_str:
    if character not in character_count:
        # first sighting of this character: start its counter at 1
        character_count[character] = 1
    else:
        # seen before: bump the existing counter
        character_count[character] += 1

# sorted() puts the (character, count) pairs in lexicographic order
for letter, occurrences in sorted(character_count.items()):
    print(f'{letter}: {round(occurrences / word_length * 100, 2)}%')

This is a fairly straightforward solution. First, I make a single string out of your list of strings to simplify things a bit, since the individual strings do not appear to be necessary for this. ''.join(word_list) concatenates the strings with an empty string between them.

word_str -> 'THEZENOFPYTHON...'

The total number of characters len(word_str) is necessary to calculate the percentage occurrence of a character.

There are probably more elegant ways to count the characters, but I find a dictionary (character_count) easier to read/explain.

In the for-loop, I check if a counter for a character already exists in the dictionary. If it does, increment the counter. If it doesn't, initiate the counter with the value 1.

Now, the dictionary contains a count of every occurring character:

character_count -> {
    'T': 10,
    'H': 5,
    ...
}

Next, to print the values the way you want, we can iterate over the keys and values of the dictionary with character_count.items(). However, use sorted() to make them appear in lexicographic order.

I calculate the percentage directly in the formatted string. count / word_length would be something like 0.018912347, so I multiply by 100 and then use round( ... , 2) to only display up to two decimal digits.

Upvotes: 2

Tom Aarsen
Tom Aarsen

Reputation: 1200

You can use collections' Counter for this, and then divide by the total number of characters:

from collections import Counter

word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

# Flatten the list into one string so every character is tallied in one go
chars = "".join(word_list)
# Total character count: the denominator for the percentages
n_chars = len(chars)
# Tally how many times each character appears
counter = Counter(chars)
# most_common() yields (character, count) pairs from most to least frequent;
# turn each count into a percentage of all characters
ranked = counter.most_common()
occ_pct = [(symbol, hits / n_chars * 100) for symbol, hits in ranked]
print(occ_pct)
[('E', 13.58936484490399), ('T', 11.669128508124077), ('I', 7.828655834564254), ('A', 7.828655834564254), ('S', 6.794682422451995), ('O', 6.3515509601181686), ('N', 6.20384047267356), ('R', 4.874446085672083), ('L', 4.874446085672083), ('H', 4.579025110782865), ('P', 3.2496307237813884), ('B', 3.10192023633678), ('U', 3.10192023633678), ('Y', 2.511078286558346), ('C', 2.511078286558346), ('D', 2.511078286558346), ('M', 2.363367799113737), ('F', 1.7725258493353029), ('G', 1.6248153618906942), ('X', 0.8862629246676514), ('V', 0.7385524372230428), ('W', 0.5908419497784343), ('K', 0.29542097488921715), ('Z', 0.14771048744460857)]

This can easily be printed out nicely:

# Each entry is a (character, percentage) pair; show two decimal places
for char, pct in occ_pct:
    print(char, f"{pct:.2f}%")
E 13.59%
T 11.67%
I 7.83%
A 7.83%
S 6.79%
O 6.35%
N 6.20%
R 4.87%
L 4.87%
H 4.58%
P 3.25%
B 3.10%
U 3.10%
Y 2.51%
C 2.51%
D 2.51%
M 2.36%
F 1.77%
G 1.62%
X 0.89%
V 0.74%
W 0.59%
K 0.30%
Z 0.15%

Or sorted alphabetically by character instead:

# Order the (character, percentage) pairs by character before printing
alphabetical = sorted(occ_pct, key=lambda pair: pair[0])
for char, pct in alphabetical:
    print(char, f"{pct:.2f}%")
A 7.83%
B 3.10%
C 2.51%
D 2.51%
E 13.59%
F 1.77%
G 1.62%
H 4.58%
I 7.83%
K 0.30%
L 4.87%
M 2.36%
N 6.20%
O 6.35%
P 3.25%
R 4.87%
S 6.79%
T 11.67%
U 3.10%
V 0.74%
W 0.59%
X 0.89%
Y 2.51%
Z 0.15%

Edit: As requested, without collections:

# Hand-rolled tally: character -> number of times it has been seen
usage = {}
# Running count of all characters across every word
total = 0
for word in word_list:
    for char in word:
        total += 1
        # Start from 0 when `char` has not been seen yet, then add 1
        usage[char] = usage.get(char, 0) + 1
# usage.items() yields (character, count) pairs; divide each count by
# `total` to express it as a percentage of all characters.
occ_pct = [(symbol, hits / total * 100) for symbol, hits in usage.items()]

Upvotes: 3

Related Questions