Reputation: 1239
I have an expected array `[1,1,3]` and a predicted array `[1,2,2,4]` for which I want to calculate `precision_recall_fscore_support`, so I need a matrix in the following format:
>> mlb = MultiLabelBinarizerWithDuplicates()
>> transformed = mlb.fit_transform([(1, 1, 3), (1, 2, 2, 4)])
array([[1,1,0,0,1,0],
[1,0,1,1,0,1]])
>> mlb.classes_
[1,1,2,2,3,4]
For the duplicated values I don't care which one of them is turned on, meaning that this is also a valid result:
array([[1,1,0,0,1,0],
[0,1,1,1,0,1]])
`MultiLabelBinarizer` clearly says "All entries should be unique (cannot contain duplicate classes)", so it doesn't support this use case.
Upvotes: 0
Views: 276
Reputation: 1239
Initial implementation that works:
import itertools
from collections import defaultdict
import copy
import numpy as np
class MultiLabelBinarizerWithDuplicates:
    """
    Multi-label binarizer that accepts repeated labels within one sample.

    Similar to https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
    but added support for duplicated values.

    Each distinct label gets as many output columns as the largest number of
    times it occurs in any single fitted sample. When a sample is transformed,
    the k-th occurrence of a label switches on the k-th column reserved for
    that label, so columns are filled from the lowest index upwards.

    Attributes set by ``fit``:
        classes_: sorted list of labels, one entry per output column
            (labels are repeated when they own several columns).
        mapping: ``defaultdict(list)`` from label to the ascending list of
            column indices reserved for it.
    """

    def __init__(self, mapping=None):
        """Store an optional label -> column-indices mapping.

        ``fit`` replaces this value with the mapping it learns.
        """
        self.mapping = mapping

    def fit(self, y):
        """Learn the output columns from an iterable of label collections.

        Args:
            y: iterable of samples, each an iterable of hashable, mutually
                sortable labels.

        Returns:
            self, to allow call chaining.
        """
        # Largest per-sample multiplicity seen so far for every label.
        max_count = {}
        for labels in y:
            # Count each sample in one pass instead of rescanning it once
            # per unique label.
            counts = defaultdict(int)
            for label in labels:
                counts[label] += 1
            for label, count in counts.items():
                if count > max_count.get(label, 0):
                    max_count[label] = count

        self.classes_ = sorted(
            itertools.chain.from_iterable(
                itertools.repeat(label, count)
                for label, count in max_count.items()
            )
        )

        self.mapping = defaultdict(list)
        for idx, label in enumerate(self.classes_):
            self.mapping[label].append(idx)
        return self

    def transform(self, y):
        """Encode samples as a 0/1 matrix with one column per ``classes_`` entry.

        Labels that were not seen during ``fit``, and occurrences of a label
        beyond the number of columns reserved for it, are ignored.

        Args:
            y: iterable of samples, each an iterable of hashable labels.

        Returns:
            Integer ``numpy`` array of shape ``(n_samples, len(classes_))``;
            an empty input yields shape ``(0, len(classes_))``.
        """
        n_classes = len(self.classes_)
        rows = []
        for labels in y:
            row = [0] * n_classes
            # Number of columns already consumed per label in this sample;
            # cheaper than deep-copying the whole mapping for every row.
            used = defaultdict(int)
            for label in labels:
                # .get() avoids inserting unknown labels into the defaultdict.
                slots = self.mapping.get(label)
                if slots is None:
                    continue
                position = used[label]
                if position < len(slots):
                    row[slots[position]] = 1
                    used[label] = position + 1
            rows.append(row)
        # The reshape keeps the result two-dimensional even when there are
        # no samples (np.array([]) alone would be one-dimensional).
        return np.array(rows, dtype=int).reshape(len(rows), n_classes)

    def fit_transform(self, y):
        """Fit on ``y`` and return its encoded matrix.

        ``y`` is materialised first so a one-shot iterable (e.g. a generator)
        can be traversed by both ``fit`` and ``transform``.
        """
        samples = list(y)
        return self.fit(samples).transform(samples)
Usage:
>> mlb = MultiLabelBinarizerWithDuplicates()
>> transformed = mlb.fit_transform([(1, 1, 3), (1, 2, 2, 4)])
array([[1,1,0,0,1,0],
[1,0,1,1,0,1]])
>> mlb.classes_
[1,1,2,2,3,4]
Upvotes: 1