Reputation: 47
Consider this list:
# Sample table: one record per row, every cell stored as a string.
# The literal 'unknown' marks a missing value.
my_data = [
    ['58', 'management', 'unknown', 'no'],
    ['44', 'technician', 'single', 'no'],
    ['33', 'entrepreneur', 'married', 'no'],
    ['47', 'blue-collar', 'married', 'no'],
    ['33', 'unknown', 'single', 'no'],
    ['35', 'management', 'unknown', 'no'],
    ['28', 'management', 'single', 'no'],
    ['42', 'entrepreneur', 'divorced', 'no'],
    ['58', 'retired', 'married', 'no'],
    ['43', 'technician', 'unknown', 'no'],
]
I want to replace those 'unknown' values with the most common element in the list, but something is wrong with my code. Can anyone please correct it? It seems I need to call remove_unknowns() twice to make it work.
def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    When several elements share the highest count, the one that appears
    first in ``lst`` wins, so the result does not depend on set/hash
    iteration order.

    Args:
        lst: A non-empty sequence of hashable elements.

    Returns:
        The most frequent element of ``lst``.

    Raises:
        ValueError: If ``lst`` is empty.
    """
    from collections import Counter

    # Count once (linear) instead of calling lst.count() for every
    # distinct element (quadratic).
    counts = Counter(lst)
    # max() keeps the first maximal item it meets; iterating lst itself
    # therefore resolves ties in favour of the earliest element.
    return max(lst, key=counts.__getitem__)
def remove_unknowns(ls):
    """Replace each 'unknown' cell with the most common known value of its column.

    The table is modified in place and also returned. Only string cells take
    part: non-string cells are neither counted nor replaced. The placeholder
    'unknown' is never counted as a candidate, and every column is counted
    on its own, so a single call is sufficient.

    When several values tie for the highest count in a column, the one that
    appears first (top to bottom) is used. A column without any known string
    value is left unchanged.

    Args:
        ls: A list of rows, each row being a list of cells. Rows may have
            different lengths.

    Returns:
        The same list object ``ls``, after the in-place replacement.
    """
    from collections import Counter

    if not ls:
        return ls

    # Derive the column count from the data rather than assuming 4 columns.
    n_cols = max(len(row) for row in ls)

    for col in range(n_cols):
        # Fresh candidate list per column: counts must not leak from the
        # previous columns, and the placeholder must not vote for itself.
        known = [
            row[col]
            for row in ls
            if col < len(row)
            and isinstance(row[col], str)
            and row[col] != 'unknown'
        ]
        if not known:
            continue  # nothing to impute from in this column

        counts = Counter(known)
        # Iterating `known` makes max() pick the earliest value on ties.
        fill = max(known, key=counts.__getitem__)

        for row in ls:
            if (
                col < len(row)
                and isinstance(row[col], str)
                and row[col] == 'unknown'
            ):
                row[col] = fill
    return ls
# The asker reports that one call does not replace every 'unknown' value,
# hence the repeated call.
remove_unknowns(my_data)
remove_unknowns(my_data)
My desired output is something like this:
# Target result: each 'unknown' cell is replaced by a most frequent value
# taken from its own column.
my_data= [
['58', 'management', 'married', 'no'],
['44', 'technician', 'single', 'no'],
['33', 'entrepreneur', 'married', 'no'],
['47', 'blue-collar', 'married', 'no'],
['33', 'management', 'single', 'no'],
['35', 'management', 'married', 'no'],
['28', 'management', 'single', 'no'],
['42', 'entrepreneur', 'divorced', 'no'],
['58', 'retired', 'married', 'no'],
['43', 'technician', 'married', 'no']
]
Upvotes: 0
Views: 409
Reputation: 180391
If by "most common" you mean the most common value across all columns combined, first find that value and then iterate over the sublists, replacing every "unknown" entry with it:
from collections import Counter
from itertools import chain

my_data = [
    ['58', 'management', 'unknown', 'no'],
    ['44', 'technician', 'single', 'no'],
    ['33', 'entrepreneur', 'married', 'no'],
    ['47', 'blue-collar', 'married', 'no'],
    ['33', 'unknown', 'single', 'no'],
    ['35', 'management', 'unknown', 'no'],
    ['28', 'management', 'single', 'no'],
    ['42', 'entrepreneur', 'divorced', 'no'],
    ['58', 'retired', 'married', 'no'],
    ['43', 'technician', 'unknown', 'no'],
]

# Count every known cell of the whole table. The placeholder is excluded so
# that it can never be chosen as its own replacement.
known_counts = Counter(
    s for s in chain.from_iterable(my_data) if s != "unknown"
)

# Most common known value overall; if the table holds no known value at all,
# fall back to the placeholder, which turns the replacement into a no-op.
cn = known_counts.most_common(1)[0][0] if known_counts else "unknown"

for sub in my_data:
    # Slice assignment updates each row in place.
    sub[:] = [cn if s == "unknown" else s for s in sub]
If you actually want the most common value column-wise, it is a bit more involved: you apply the same Counter logic, but use one Counter for each column:
from collections import Counter
from pprint import pprint as pp

# One Counter per column; the column count is taken from the data.
n_cols = max(len(sub) for sub in my_data) if my_data else 0
counters = [Counter() for _ in range(n_cols)]
for sub in my_data:
    for ind, ele in enumerate(sub):
        # Do not count the placeholder: where it ties with (or beats) the
        # real values it would otherwise be selected as the "replacement"
        # and the column would stay unchanged.
        if ele != "unknown":
            counters[ind][ele] += 1

# Most common known word of each column. A column without any known word
# keeps the placeholder, which makes the replacement below a no-op for it.
# On ties, Counter.most_common returns the value that was counted first.
l = [c.most_common(1)[0][0] if c else "unknown" for c in counters]

for sub in my_data:
    # If the word is unknown, replace it using the appropriate column word.
    sub[:] = [l[ind] if ele == "unknown" else ele for ind, ele in enumerate(sub)]

pp(my_data)
Which will give you:
[['58', 'management', 'married', 'no'],
['44', 'technician', 'single', 'no'],
['33', 'entrepreneur', 'married', 'no'],
['47', 'blue-collar', 'married', 'no'],
['33', 'management', 'single', 'no'],
['35', 'management', 'married', 'no'],
['28', 'management', 'single', 'no'],
['42', 'entrepreneur', 'divorced', 'no'],
['58', 'retired', 'married', 'no'],
['43', 'technician', 'married', 'no']]
In the third column, either 'single' or 'married' is a possible replacement value, as both appear the same number of times.
Upvotes: 1