Reputation: 1008
How can I get counts of items in MultiLabelBinarizer?
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
# Fit the binarizer on two samples that contain repeated labels. The
# resulting frame has one 0/1 indicator column per class, so repeats
# inside a sample are not counted.
samples = [(1, 1, 2), (3, 3, 2, 5)]
mlb = MultiLabelBinarizer()
indicator_matrix = mlb.fit_transform(samples)
pd.DataFrame(indicator_matrix, columns=mlb.classes_)
Out[0]:
   1  2  3  5
0  1  1  0  0
1  0  1  1  1
Instead of this, I want to get
Out[0]:
   1  2  3  5
0  2  1  0  0
1  0  1  2  1
because 1 appears twice in the first row and 3 appears twice in the second row.
Upvotes: 2
Views: 488
Reputation: 2162
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
data = [(1, 1, 2), (3, 3, 2, 5)]

# Fit the binarizer only to learn the label set; its ``classes_``
# attribute supplies the column labels for the count table.
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(data)
labels = mlb.classes_
cols = labels
print(cols)  # [1 2 3 5]

# Tally how many times every label occurs in each row of the raw data.
per_row_counts = []
for row in data:
    per_row_counts.append([row.count(label) for label in labels])
count_matrix = np.array(per_row_counts)

# Wrap the counts in a DataFrame whose columns are the labels.
count_df = pd.DataFrame(count_matrix, columns=labels)
print(count_df)
# Printed result:
#    1  2  3  5
# 0  2  1  0  0
# 1  0  1  2  1
Method 2 (most efficient):
import numpy as np
import pandas as pd
data = [(1, 1, 2), (3, 3, 2, 5)]

# Row number of every element once the rows are laid end to end:
# row i is repeated len(data[i]) times.
row_lengths = [len(row) for row in data]
row_indices = np.repeat(np.arange(len(data)), row_lengths)
# row_indices -> [0 0 0 1 1 1 1]

# Distinct labels, plus for every element the position of its label
# inside ``unique_data`` (i.e. its column in the count table).
all_values = np.concatenate(data)
unique_data, unique_data_indices = np.unique(all_values, return_inverse=True)
# unique_data         -> [1 2 3 5]
# unique_data_indices -> [0 0 1 2 2 1 3]

# Start from an all-zero (rows x labels) table ...
n_rows, n_labels = len(data), len(unique_data)
count_matrix = np.zeros((n_rows, n_labels), dtype=int)
# [[0 0 0 0]
#  [0 0 0 0]]

# ... and add 1 for each element. np.add.at is used so that repeated
# (row, column) pairs each contribute to the total.
np.add.at(count_matrix, (row_indices, unique_data_indices), 1)

count_df = pd.DataFrame(count_matrix, columns=unique_data)
print(count_df)
# Printed result:
#    1  2  3  5
# 0  2  1  0  0
# 1  0  1  2  1
Another example (rows of equal length, given as a 2-D NumPy array):
import numpy as np
import pandas as pd
data = np.array(
    [
        [2, 3, 4, 2, 0],
        [3, 1, 2, 4, 2],
        [0, 1, 3, 0, 1],
        [0, 1, 2, 3, 4],
    ]
)
n_rows, n_cols = data.shape

# Lay the matrix out as one long vector and record the row each
# element came from.
flattened_data = data.flatten()
row_indices = np.arange(n_rows).repeat(n_cols)
print(row_indices)  # [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3]
# The same vector can also be built by concatenating
# np.full(len(row), i) for every (i, row) in enumerate(data).

# Distinct labels and, for every element, the index of its label.
unique_data, unique_data_indices = np.unique(flattened_data, return_inverse=True)

# Accumulate one count per element into a (rows x labels) table.
count_matrix = np.zeros(shape=(n_rows, len(unique_data)), dtype=int)
np.add.at(count_matrix, (row_indices, unique_data_indices), 1)

count_df = pd.DataFrame(count_matrix, columns=unique_data)

print("Original data:")
print(data)
print("Count Matrix:")
print(count_matrix)
print("Count DataFrame:")
print(count_df)
# Printed result:
# Original data:
# [[2 3 4 2 0]
#  [3 1 2 4 2]
#  [0 1 3 0 1]
#  [0 1 2 3 4]]
# Count Matrix:
# [[1 0 2 1 1]
#  [0 1 2 1 1]
#  [2 2 0 1 0]
#  [1 1 1 1 1]]
# Count DataFrame:
#    0  1  2  3  4
# 0  1  0  2  1  1
# 1  0  1  2  1  1
# 2  2  2  0  1  0
# 3  1  1  1  1  1
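Another example, covering a requirement that comes up frequently in practice: rows of unequal length are padded first, and the result has one indicator column per label occurrence (rather than per distinct label):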
import numpy as np
import pandas as pd
data = [[1, 1, 3], [1, 2, 2, 4]]
max_len = max(len(seq) for seq in data)

# Pad every sequence with -1 so that all rows have equal length and can
# be stacked into a 2-D array. -1 is a sentinel: it must not occur as a
# real label in ``data``, because it is stripped out again below.
padded_data = np.array([seq + [-1] * (max_len - len(seq)) for seq in data])
# padded_data:
# [[ 1  1  3 -1]
#  [ 1  2  2  4]]

# Row number of every cell of the padded array, in flattened order.
row_indices = np.repeat(np.arange(len(data)), max_len)
# row_indices:
# [0 0 0 0 1 1 1 1]

# Flatten and drop the padding cells from both the values and their
# row numbers.
flattened_data = padded_data.flatten()
mask = flattened_data != -1
flattened_data = flattened_data[mask]
row_indices = row_indices[mask]

# Compute the distinct labels on the already-unpadded 1-D data. Doing it
# on the 2-D padded array (as before) breaks in two ways: on NumPy 2.x the
# inverse has the input's 2-D shape and cannot be indexed with the 1-D
# mask, and on older versions the inverse still points into the unique
# array that contains -1, so it no longer matches ``unique_data`` once -1
# has been removed from it.
unique_data, unique_data_indices = np.unique(flattened_data, return_inverse=True)
# unique_data         -> [1 2 3 4]
# unique_data_indices -> [0 0 2 0 1 1 3]

# One column per remaining element (not per distinct label).
binary_matrix = np.zeros((len(data), len(flattened_data)), dtype=int)
# binary_matrix:
# [[0 0 0 0 0 0 0]
#  [0 0 0 0 0 0 0]]

# Mark, for every element, the row it came from.
binary_matrix[row_indices, np.arange(len(flattened_data))] = 1
print(binary_matrix)
# binary_matrix:
# [[1 1 1 0 0 0 0]
#  [0 0 0 1 1 1 1]]

# Column headers are the element values themselves, so labels repeat.
binary_df = pd.DataFrame(binary_matrix, columns=flattened_data)
print(binary_df)
# Printed result:
#    1  1  3  1  2  2  4
# 0  1  1  1  0  0  0  0
# 1  0  0  0  1  1  1  1
Upvotes: 0
Reputation: 16916
from collections import Counter
data = [(1, 1, 2), (3, 3, 2, 5)]
# One Counter per row tallies the repeats; a label that a row does not
# contain has no entry for that row, and fillna(0) replaces the
# resulting gap with 0.
per_row_counts = list(map(Counter, data))
pd.DataFrame(per_row_counts).fillna(0)
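Output (columns that had a missing label in some row are floats, because the gap is NaN before `fillna(0)` runs; append `.astype(int)` to get integer counts):
     1  2    3    5
0  2.0  1  0.0  0.0
1  0.0  1  2.0  1.0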
Upvotes: 1