colorless
colorless

Reputation: 47

Replacing elements in a 2D list

consider this list

 # Sample table: a list of rows, every cell a string. 'unknown' marks the
 # cells that should be filled in. The columns look like age, job, marital
 # status and a yes/no flag -- TODO confirm against the data's origin.
 my_data= [
     ['58', 'management', 'unknown', 'no'],
     ['44', 'technician', 'single', 'no'],
     ['33', 'entrepreneur', 'married', 'no'],
     ['47', 'blue-collar', 'married', 'no'],
     ['33', 'unknown', 'single', 'no'],
     ['35', 'management', 'unknown', 'no'],
     ['28', 'management', 'single', 'no'],
     ['42', 'entrepreneur', 'divorced', 'no'],
     ['58', 'retired', 'married', 'no'],
     ['43', 'technician', 'unknown', 'no']
]

I want to replace those unknown values with the most common element in the list, but something is wrong with my code. Can anyone please correct it? It seems I need to call remove_unknowns() twice to make it work.

def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Ties go to the tied element that appears first in ``lst``, so the result
    does not depend on set or dict ordering.

    Raises:
        ValueError: if ``lst`` is empty.
    """
    from collections import Counter

    # Count everything in one pass instead of calling lst.count() once per
    # distinct value.
    counts = Counter(lst)
    # max() returns the first maximal item it meets, which gives the
    # first-seen tie-break; on an empty list it raises ValueError.
    return max(lst, key=counts.__getitem__)


def remove_unknowns(ls):
    """Replace each 'unknown' cell with the most common known value of its column.

    The rows of ``ls`` are modified in place and ``ls`` itself is returned.
    Every column present in the data is processed. Only string cells take
    part in the vote, and the 'unknown' placeholder is never counted, so it
    cannot be picked as its own replacement. A column that holds no known
    string value is left untouched. One call is sufficient.
    """
    if not ls:
        return ls

    # Use the widest row so that no column is missed; shorter rows are simply
    # skipped for the columns they do not have.
    width = max(len(row) for row in ls)
    for col in range(width):
        # Collect candidates afresh for every column: sharing one list across
        # columns would let values from earlier columns decide later ones.
        known = [
            row[col]
            for row in ls
            if col < len(row)
            and isinstance(row[col], str)
            and row[col] != 'unknown'
        ]
        if not known:
            continue

        # Determined once per column rather than once per replaced cell.
        replacement = most_common(known)
        for row in ls:
            if col < len(row) and row[col] == 'unknown':
                row[col] = replacement
    return ls
# Run the fill-in pass over the sample data. The call is repeated because the
# question reports that a single pass was not enough; a correct
# implementation needs only one call.
remove_unknowns(my_data)
remove_unknowns(my_data)

my desired output is something like this:

# Target state of my_data: each 'unknown' cell has been replaced by a most
# common value of its own column ('management' in the second column,
# 'married' in the third, where it ties with 'single').
my_data= [
 ['58', 'management', 'married', 'no'],
 ['44', 'technician', 'single', 'no'],
 ['33', 'entrepreneur', 'married', 'no'],
 ['47', 'blue-collar', 'married', 'no'],
 ['33', 'management', 'single', 'no'],
 ['35', 'management', 'married', 'no'],
 ['28', 'management', 'single', 'no'],
 ['42', 'entrepreneur', 'divorced', 'no'],
 ['58', 'retired', 'married', 'no'],
 ['43', 'technician', 'married', 'no']

]

Upvotes: 0

Views: 409

Answers (1)

Padraic Cunningham
Padraic Cunningham

Reputation: 180391

If by "most common" you mean the most common across all the values combined, first find the most common element and then iterate over the sublists, replacing any "unknown" values with it:

from collections import Counter
from itertools import chain

# Sample table: a list of rows, every cell a string; "unknown" marks the
# cells to be filled in.
my_data = [
    ['58', 'management', 'unknown', 'no'],
    ['44', 'technician', 'single', 'no'],
    ['33', 'entrepreneur', 'married', 'no'],
    ['47', 'blue-collar', 'married', 'no'],
    ['33', 'unknown', 'single', 'no'],
    ['35', 'management', 'unknown', 'no'],
    ['28', 'management', 'single', 'no'],
    ['42', 'entrepreneur', 'divorced', 'no'],
    ['58', 'retired', 'married', 'no'],
    ['43', 'technician', 'unknown', 'no'],
]

# Count every cell of the table in one pass. The "unknown" placeholder is
# left out of the count so it can never be chosen as its own replacement.
counts = Counter(s for s in chain.from_iterable(my_data) if s != "unknown")

# Most frequent known value overall; if the table holds nothing but
# placeholders there is nothing to substitute, so keep "unknown".
cn = counts.most_common(1)[0][0] if counts else "unknown"

for sub in my_data:
    # Slice assignment rewrites the existing row object, so any other
    # reference to the row sees the update as well.
    sub[:] = [cn if s == "unknown" else s for s in sub]

If you actually want the most common value column-wise, it is a bit more involved: you need to apply the same Counter logic, but use one Counter for each column:

from collections import Counter
from pprint import pprint as pp

# One Counter per column, sized from the data instead of a fixed column count.
width = max(len(sub) for sub in my_data) if my_data else 0
counters = [Counter() for _ in range(width)]
for sub in my_data:
    for ind, ele in enumerate(sub):
        # Leave the placeholder out of the vote: if it were counted it could
        # tie with (or beat) the real values and be picked as its own
        # replacement, leaving the column unchanged.
        if ele != "unknown":
            counters[ind][ele] += 1

# Most common known word of each column. A column made up solely of
# placeholders has nothing to substitute, so it keeps "unknown".
l = [c.most_common(1)[0][0] if c else "unknown" for c in counters]

for sub in my_data:
    # If a word is unknown, replace it with the word chosen for its column;
    # slice assignment updates the existing row object in place.
    sub[:] = [l[ind] if ele == "unknown" else ele for ind, ele in enumerate(sub)]

pp(my_data)

Which will give you:

[['58', 'management', 'married', 'no'],
 ['44', 'technician', 'single', 'no'],
 ['33', 'entrepreneur', 'married', 'no'],
 ['47', 'blue-collar', 'married', 'no'],
 ['33', 'management', 'single', 'no'],
 ['35', 'management', 'married', 'no'],
 ['28', 'management', 'single', 'no'],
 ['42', 'entrepreneur', 'divorced', 'no'],
 ['58', 'retired', 'married', 'no'],
 ['43', 'technician', 'married', 'no']]

In the third column, either "single" or "married" is a valid replacement value, as they appear an equal number of times; which one you get depends on how the tie is broken.

Upvotes: 1

Related Questions