python set of objects with custom hash behavior

Question

I would like to use a set to manage a collection of 'myItem' instances. The myItem class has its own hash function. The hash of these items is based on some but not all of the data in each item, for simplicity in the example below the 'data' is dictionary r. The hash takes into account 2 keys, hk1 and hk2, and there is a third key 'sad' that is not considered in the hash calculation.

class myItem():

    def __init__(self, r):
        # r is a dict holding information about the instance
        # of course r has to have certain keys...
        self.r = r

    def __hash__(self):
        """Override the default hash behavior"""
        return hash(tuple(sorted([self.r['hk1'],self.r['hk2']])))

    def __eq__(self,other):
        """checking equality"""
        if isinstance(other, self.__class__):
            return self.__hash__() == other.__hash__()
        return NotImplemented

    def __ne__(self, other):
        """checking inequality"""
        if isinstance(other, self.__class__):
            return not self.__eq__(other)
        return NotImplemented

    def __repr__(self):
        return str(self.r)

The expected behavior is confirmed by the short unit test below.

class testMySet(unittest.TestCase):

    def testMyItemstuff(self):

        m1 = myItem({'hk1':'val1', 'hk2': 100, 'sad': 'other stuff'})
        m2 = myItem({'hk1': 'val1', 'hk2': 100, 'sad': 'different other stuff'})

        self.assertEqual(m1, m2)
        self.assertNotEqual(m1.r['sad'], m2.r['sad'])

        s = { m1 }
        # add m2 to s
        s.add(m2)
        # same hash, m2 is not added
        self.assertEqual(len(s), 1)
        # set contains the original object, not the last one added
        self.assertNotEqual(s.pop().r['sad'], 'different other stuff')

My question is, how can I modify the behavior such that adding a new object whose hash coincides with an existing one ends up replacing the original one, with minimal performance impact?

jedwards · Accepted Answer

Whether defining your hash that way makes sense for your application is really for you to decide, but it does seem unlikely.

In any case, I can think of two options that will be "as fast as" a set -- O(1) instead of O(n) -- and their speed depends on implementing a hash function as you describe:

First, boil down your class and create instances:

class Item():
    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __hash__(self):
        return hash(self.a)

    def __eq__(self,other):
        if isinstance(other, self.__class__):
            # Ignoring .b attribute
            return self.a == other.a
        else:
            return NotImplemented

    def __repr__(self):
        return "Item(%s, %s)" % (self.a, self.b)

i1 = Item(1,2)
i2 = Item(3,4)
i3 = Item(1,5)


print(i1 == i2)             # False (.a's don't match)
print(i1 == i3)             # True  (.a's match)

Method 1: dict values

# Using a dict
updating_set = {}
updating_set[i1] = i1       # .values(): [Item(1, 2)]
updating_set[i2] = i2       # .values(): [Item(1, 2), Item(3, 4)]
updating_set[i3] = i3       # .values(): [Item(1, 5), Item(3, 4)]

print(list(updating_set.values()))
# [Item(1, 5), Item(3, 4)]

Method 2: Using a set subclass

# Using a set subclass
class UpdatingSet(set):
    def add(self, item):
        if item in self: super().remove(item)
        super().add(item)

uset = UpdatingSet()
uset.add(i1)                # UpdatingSet({Item(1, 2)})
uset.add(i2)                # UpdatingSet({Item(1, 2), Item(3, 4)})
uset.add(i3)                # UpdatingSet({Item(1, 5), Item(3, 4)})

Bonus Method 3: Not requiring a special hash function

class NewItem():
    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __repr__(self):
        return "Item(%s, %s)" % (self.a, self.b)

class ItemSet():
    def __init__(self):
        self.items = {}

    def add(self, item):
        item_hash = item.a
        self.items[item_hash] = item

    def values(self):
        return self.items.values()

i1 = NewItem(1,2)
i2 = NewItem(3,4)
i3 = NewItem(1,5)

iset = ItemSet()
iset.add(i1)                # .values(): [Item(1, 2)]
iset.add(i2)                # .values(): [Item(1, 2), Item(3, 4)]
iset.add(i3)                # .values(): [Item(1, 5), Item(3, 4)]

print(list(iset.values()))  # [Item(1, 5), Item(3, 4)]

This third approach doesn't require you to implement hash (which could cause unexpected side effects, but mimics the hashing process inside ItemSet.add(), using the "hash function" as the dictionary key.

This is probably your best bet, unless you really want to implement hash and know the extent of the effects of that decision.

python set of objects with custom hash behavior

Answers (2)

Related Questions