Shankar Pandala
Shankar Pandala

Reputation: 1008

Getting counts in MultiLabelBinarizer

How can I get counts of items in MultiLabelBinarizer?

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
# Default binarizer: it records only whether a label is present in a row.
mlb = MultiLabelBinarizer()

# fit_transform yields a 0/1 indicator matrix (see Out[0] below), so a label
# repeated within a row still shows up as 1 rather than as its count.
pd.DataFrame(mlb.fit_transform([(1,1,2), (3,3,2,5)]),columns=mlb.classes_)

Out[0]: 
   1  2  3  5
0  1  1  0  0
1  0  1  1  1

Instead of this, I want to get

Out[0]: 
   1  2  3  5
0  2  1  0  0
1  0  1  2  1

because 1 appears twice in the first row and 3 appears twice in the second row.

Upvotes: 2

Views: 488

Answers (2)

Soudipta Dutta
Soudipta Dutta

Reputation: 2162

import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer


from collections import Counter

data = [(1, 1, 2), (3, 3, 2, 5)]

# Only the sorted set of distinct labels is needed from MultiLabelBinarizer,
# so fit() is enough; the 0/1 indicator matrix is never built.
mlb = MultiLabelBinarizer()
mlb.fit(data)
labels = mlb.classes_
print(labels)  # [1 2 3 5]

# Tally each row once. A Counter returns 0 for labels missing from a row,
# which avoids rescanning the row once per label.
row_counts = [Counter(row) for row in data]
count_matrix = np.array(
    [[counts[label] for label in labels] for counts in row_counts]
)

# One column per label, one row per input sequence.
count_df = pd.DataFrame(count_matrix, columns=labels)
print(count_df)
'''
   1  2  3  5
0  2  1  0  0
1  0  1  2  1
'''

Method 2 (most efficient):

import numpy as np
import pandas as pd

data = [(1, 1, 2), (3, 3, 2, 5)]

# Row number of every label once the rows are laid end to end:
# row i contributes len(data[i]) copies of i.
#   row_indices -> [0 0 0 1 1 1 1]
row_lengths = [len(row) for row in data]
row_indices = np.repeat(np.arange(len(data)), row_lengths)

# Distinct labels (sorted) and, for each flattened label, its column position.
#   unique_data         -> [1 2 3 5]
#   unique_data_indices -> [0 0 1 2 2 1 3]
flat_labels = np.concatenate(data)
unique_data, unique_data_indices = np.unique(flat_labels, return_inverse=True)

# Start from an all-zero (rows x labels) table ...
n_rows, n_labels = len(data), len(unique_data)
count_matrix = np.zeros((n_rows, n_labels), dtype=int)

# ... and add 1 at (row, label column) for every occurrence. np.add.at
# accumulates repeated index pairs instead of writing them only once.
np.add.at(count_matrix, (row_indices, unique_data_indices), 1)

count_df = pd.DataFrame(count_matrix, columns=unique_data)
print(count_df)
'''
   1  2  3  5
0  2  1  0  0
1  0  1  2  1
'''

Another example (rows of equal length, stored as a 2-D array):

import numpy as np
import  pandas as pd

data = np.array([[2, 3, 4, 2, 0], [3, 1, 2, 4, 2], [0, 1, 3, 0, 1], [0, 1, 2, 3, 4]])

# Lay the labels out in one dimension and remember which row each came from.
flattened_data = data.ravel()
row_indices = np.indices(data.shape)[0].ravel()
print(row_indices)#[0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3]

# Sorted distinct labels, plus the column position of every flattened label.
unique_data, unique_data_indices = np.unique(flattened_data, return_inverse=True)

# Accumulate one count per (row, label column) occurrence into a zero table.
table_shape = (data.shape[0], unique_data.size)
count_matrix = np.zeros(table_shape, dtype=int)
np.add.at(count_matrix, (row_indices, unique_data_indices), 1)

count_df = pd.DataFrame(count_matrix, columns=unique_data)

# Show the input next to both forms of the result.
for title, value in (
    ("Original data:", data),
    ("Count Matrix:", count_matrix),
    ("Count DataFrame:", count_df),
):
    print(title)
    print(value)
'''
Original data:
[[2 3 4 2 0]
 [3 1 2 4 2]
 [0 1 3 0 1]
 [0 1 2 3 4]]
Count Matrix:
[[1 0 2 1 1]
 [0 1 2 1 1]
 [2 2 0 1 0]
 [1 1 1 1 1]]
Count DataFrame:
   0  1  2  3  4
0  1  0  2  1  1
1  0  1  2  1  1
2  2  2  0  1  0
3  1  1  1  1  1
'''

Another example, for a requirement that also comes up often in practice: rows of unequal length, with one indicator column per label occurrence.

import numpy as np
import pandas as pd

data = [[1, 1, 3], [1, 2, 2, 4]]

# Pad the ragged rows to a common width so they fit a rectangular array.
#   padded_data -> [[ 1  1  3 -1]
#                   [ 1  2  2  4]]
lengths = [len(seq) for seq in data]
max_len = max(lengths)
pad_value = -1
padded_data = np.array(
    [seq + [pad_value] * (max_len - n) for seq, n in zip(data, lengths)]
)

# Row number of every cell of the padded array, in flattened order.
#   row_indices -> [0 0 0 0 1 1 1 1]
row_indices = np.repeat(np.arange(len(data)), max_len)

# Identify real cells by position (column < row length), not by comparing
# with the pad value, so a genuine -1 label is never mistaken for padding.
mask = (np.arange(max_len) < np.array(lengths)[:, None]).ravel()
flattened_data = padded_data.flatten()[mask]
row_indices = row_indices[mask]

# Take the unique labels from the already-masked 1-D data. This keeps
# unique_data[unique_data_indices] equal to flattened_data, and avoids calling
# np.unique on the 2-D padded array, whose inverse is shaped like the input on
# NumPy 2.x and therefore cannot be indexed with the 1-D mask.
unique_data, unique_data_indices = np.unique(flattened_data, return_inverse=True)

# One column per label occurrence; a 1 marks the row that occurrence came from.
#   binary_matrix -> [[1 1 1 0 0 0 0]
#                     [0 0 0 1 1 1 1]]
binary_matrix = np.zeros((len(data), len(flattened_data)), dtype=int)
binary_matrix[row_indices, np.arange(len(flattened_data))] = 1
print(binary_matrix)

# Column headers are the labels themselves, so repeated labels repeat as headers.
binary_df = pd.DataFrame(binary_matrix, columns=flattened_data)
print(binary_df)
'''
   1  1  3  1  2  2  4
0  1  1  1  0  0  0  0
1  0  0  0  1  1  1  1
'''

Upvotes: 0

mujjiga
mujjiga

Reputation: 16916

from collections import Counter

data = [(1,1,2), (3,3,2,5)]

# One Counter per row becomes one DataFrame row. A label missing from a row
# comes out as NaN, which turns that column into floats; fill the gaps with 0
# and cast back so every count is an integer.
count_df = pd.DataFrame([Counter(x) for x in data]).fillna(0).astype(int)
count_df

# Output:
#
#    1  2  3  5
# 0  2  1  0  0
# 1  0  1  2  1

Upvotes: 1

Related Questions