Mico
Mico

Reputation: 413

Python: How to get the closest key from the given key?

Given a dictionary:

sample = {
    '123': 'Foo', 
    '456': 'Bar', 
    '789': 'Hello', 
    '-111': 'World'
}

What is the most efficient way (approach and/or data structure) to get the closest (or less) key from a dictionary?

Note:
1. Even if key is a string, comparison should be numerical.
2. Keys can be "negative".

Example:

get_nearest_less_element(sample, '456') # returns 'Bar'
get_nearest_less_element(sample, '235') # returns 'Foo'
get_nearest_less_element(sample, '455') # returns 'Foo'
get_nearest_less_element(sample, '999') # returns 'Hello'
get_nearest_less_element(sample, '0') # returns 'World'
get_nearest_less_element(sample, '-110') # returns 'World'
get_nearest_less_element(sample, '-999') # should return an error since it is beyond the lower bound

Additional question:
Given the same data set, would a sorted OrderedDict or a List of Tuples or any other python data structure be a better approach?

Upvotes: 6

Views: 4801

Answers (6)

dantiston
dantiston

Reputation: 5401

Given your data set, the most efficient data structure in terms of both set-up and look-up time complexity is a binary search tree, which gives you O(n log n) set-up and O(log n) look-up time complexity with O(n) space complexity.

The standard BST algorithm doesn't include your two special constraints (as I understand them)

  1. return the value for the maximum key <= the specified key
  2. bound the search space between the minimum and maximum values in the map

Here is a BST implementation based on this implementation:

class Node(object):

    def __init__(self, key, value, parent):
        self.left = None
        self.right = None
        self.value = value
        self.key = key
        self.parent = parent

    def __str__(self):
        return ":".join(map(str, (self.key, self.value)))



class BinarySearchTree(object):

    def __init__(self):
        self.root = None


    def getRoot(self):
        return self.root


    def __setitem__(self, key, value):
        if(self.root == None):
            self.root = Node(key, value, None)
        else:
            self._set(key, value, self.root)


    def _set(self, key, value, node):
        if key == node.key:
            node.value = value
        elif key < node.key:
            if(node.left != None):
                self._set(key, value, node.left)
            else:
                node.left = Node(key, value, node)
        else:
            if(node.right != None):
                self._set(key, value, node.right)
            else:
                node.right = Node(key, value, node)


    def __contains__(self, key):
        return self._get(key) != None


    def __getitem__(self, key):
        if(self.root != None):
            return self._get(key, self.root)
        else:
            return None


    def _get(self, key, node):
        if key == node.key:
            return node.value
        elif key < node.key and node.left != None:
            return self._get(key, node.left)
        elif key > node.key and node.right != None:
            return self._get(key, node.right)

Here is a subclass that fulfills requirement 1:

class FuzzySearchTree(BinarySearchTree):

    def _get(self, key, node):
        if key == node.key:
            return node.value
        elif key < node.key:
            if node.left != None:
                return self._get(key, node.left)
            else:
                return self._checkMin(key, node)
        else:
            if node.right != None:
                return self._get(key, node.right)
            else:
                return node.value # found the closest match that is larger


    def _checkMin(self, key, node):
        return node.value

To fulfill requirement 2, you need to keep track of the minimum value in the tree. You should probably do this by keeping track of the minimum value at insert time, but here is a different approach. This approach is not super efficient, but it should still be o(3 log n) == O(log n), so it's not bad. If you don't really need this, I wouldn't bother with it.

class MinBoundedFuzzySearchTree(FuzzySearchTree):

    def _checkMin(self, key, node):
        # Unless the value is lower than the minimum value in the tree # Not advised
        next = node.parent
        while next.parent != None:
            next = next.parent # Go up the tree to the top
        next = next.left
        while next.left != None:
            next = next.left # Go down the tree to the left
        if next.key > key:
            return None # outside the the range of the tree

        # Return the max value less than the key, which is by definition the parent
        return node.parent.value

Here are some pseudo-tests:

tree = BinarySearchTree()
tree[123] = 'Foo'
tree[456] = 'Bar'
tree[789] = 'Hello'
tree[-111] = 'World'

print "BST(456) == 'Bar': " + str(tree[456])
print "BST(235) == None: " + str(tree[235])
print "BST(455) == None: " + str(tree[455])
print "BST(999) == None: " + str(tree[999])
print "BST(0) == None: " + str(tree[0])
print "BST(123) == 'Foo': " + str(tree[123])
print "BST(-110) == None: " + str(tree[-110])
print "BST(-999) == None: " + str(tree[-999])

tree = FuzzySearchTree()
tree[123] = 'Foo'
tree[456] = 'Bar'
tree[789] = 'Hello'
tree[-111] = 'World'

print
print "FST(456) == 'Bar': " + str(tree[456])
print "FST(235) == 'Foo': " + str(tree[235])
print "FST(455) == 'Foo': " + str(tree[455])
print "FST(999) == 'Hello': " + str(tree[999])
print "FST(0) == 'World': " + str(tree[0])
print "FST(123) == 'Foo': " + str(tree[123])
print "FST(-110) == 'World': " + str(tree[-110])
print "FST(-999) == 'World': " + str(tree[-999])


tree = MinBoundedFuzzySearchTree()
tree[123] = 'Foo'
tree[456] = 'Bar'
tree[789] = 'Hello'
tree[-111] = 'World'

print
print "MBFST(456) == 'Bar': " + str(tree[456])
print "MBFST(235) == 'Foo': " + str(tree[235])
print "MBFST(455) == 'Foo': " + str(tree[455])
print "MBFST(999) == 'Hello': " + str(tree[999])
print "MBFST(0) == 'World': " + str(tree[0])
print "MBFST(123) == 'Foo': " + str(tree[123])
print "MBFST(-110) == 'World': " + str(tree[-110])
print "MBFST(-999) == None: " + str(tree[-999])

And here is what this prints:

"""
BST(456) == 'Bar': Bar
BST(235) == None: None
BST(455) == None: None
BST(999) == None: None
BST(0) == None: None
BST(123) == 'Foo': Foo
BST(-110) == None: None
BST(-999) == None: None

FST(456) == 'Bar': Bar
FST(235) == 'Foo': Foo
FST(455) == 'Foo': Foo
FST(999) == 'Hello': Hello
FST(0) == 'World': World
FST(123) == 'Foo': Foo
FST(-110) == 'World': World
FST(-999) == 'World': Foo

MBFST(456) == 'Bar': Bar
MBFST(235) == 'Foo': Foo
MBFST(455) == 'Foo': Foo
MBFST(999) == 'Hello': Hello
MBFST(0) == 'World': World
MBFST(123) == 'Foo': Foo
MBFST(-110) == 'World': World
MBFST(-999) == None: None
"""

Upvotes: 2

stefan.schroedl
stefan.schroedl

Reputation: 866

If you are creating or updating the sample only once or infrequently, but looking up values repeatedly, it would be most efficient to precompute a sorted numeric list in O(n log n) time. Then the entire dictionary doesn't need to be scanned; binary search gives O(log n) access. There is a python library module function for that, bisect.

from bisect import bisect

def nearest_index(sorted_keys, elem):
   idx = bisect(sorted_keys, elem)
   if idx >= len(sorted_keys):
     idx = len(sorted_keys) - 1
   elif idx > 0:
     # find closest of the two neighbors
     if elem <= (sorted_keys[idx-1] + sorted_keys[idx])/2.0:
       idx -= 1
   return idx

sample = {'123': 'Foo', '456': 'Bar', '789': 'Hello'}
sorted_keys = sorted(int(k) for k in sample.keys())

def get_nearest_element(sample, sorted_keys, elem):
  elem_int = int(elem)
  idx_nearest = nearest_index(sorted_keys, elem_int)
  return sample[str(sorted_keys[idx_nearest])]

for elem in ['456', '235', '455', '999']:
  print get_nearest_element(sample, sorted_keys, elem)

Upvotes: 2

TessellatingHeckler
TessellatingHeckler

Reputation: 29033

def get_nearest_less_element(d, k):
    k = int(k)
    return d[str(max(key for key in map(int, d.keys()) if key <= k))]

Edit to update with @Paul Hankin's code, but using <= I'm not sure it needs a branch at all. Convert all the keys to numbers, find the ones less than or equal to k, get the max one - if k is in there you'll get it, otherwise you'll get the next biggest - convert back to string and lookup in the dictionary.

Tests: https://repl.it/C2dN/0

I don't know if it's the most efficient idea; since the dictionary you get is unordered, you have to iterate over each element because any of them could be the next biggest, and since you need a numeric comparison you have to convert them all to integers. It seems to me that any other structure would take more initialisation cost as you'd have to go over every item to put it into your structure first.

But it depends on your use case - if k is very likely to be in the dictionary, it makes sense to change my code to have if k in d: return d[k] else: ... branch because not doing the generator expression in that case would be quicker. If it's very likely not in the dictionary, it won't help much.

A pseudocode (untested) version which does sort they keys is below - this would be slower to use once, but possibly faster to query a lot:

# cache to store sorted keys between function calls
# nb. you will have to invalidate this cache (reset to []) 
# when you get a new dictionary
sorted_keys = []

def get_nearest_less_element(d, k):
    if k in d:               # quick return if key is in dict
        return d[k]
    else:

        # costly sort of the keys, only do this once
        if not sorted_keys:
            sorted_keys = sorted(int(key) for key in d.keys())

        # quick run through the sorted key list up
        # to latest item less than k
        k = int(k)
        nearest = sorted_keys[0]
        for item in sorted_keys:
            if item < k:
                nearest = item
            else:
                break

         return d[str(item)]

Upvotes: 6

RoadRunner
RoadRunner

Reputation: 26325

I did it this way:

def get_nearest_less_element(sample, key):
    try:
        if key not in sample:
            candidates = []
            for keys in sample:
                if int(keys) < int(key):
                    candidates.append(keys)
            return sample[max(candidates)]
        return sample[key]
    except ValueError:
        print("key is beyond lower bounds") 

Upvotes: 1

gaganso
gaganso

Reputation: 3011

The below module returns the value if the key is present else it finds the maximum key in a list of keys that are less than the input key.

def get_nearest_less_element(sample,key):
  if key in sample:
    return sample[key]
  else:
    return sample[str(max(x for x in sample.keys() if int(x) < int(key)))]

print get_nearest_less_element(sample, '456')
print get_nearest_less_element(sample, '235')
print get_nearest_less_element(sample, '455')
print get_nearest_less_element(sample, '999')

Output:

Bar

Foo

Foo

Hello

Edit: Edited the answer based on Paul's comment.

Upvotes: 5

Daniel Porteous
Daniel Porteous

Reputation: 6353

Here is a solution. Finds the closest key based on numerical comparison:

sample = {'123': 'Foo', '456': 'Bar', '789': 'Hello'}

def get_nearest_less_element(inpDict, targetNum):
    diff = 2**32 - 1 # Very big number.
    currentKey = None
    for i in sample.keys():
        newDiff = abs(int(i) - targetNum)
        if newDiff < diff:
            currentKey = i
            diff = newDiff
    return inpDict[currentKey]

print(get_nearest_less_element(sample, 500))
# Prints Bar

This is just a single loop through the dictionary, so operates in O(n) time and O(1) additional space.

Upvotes: 2

Related Questions