Reputation: 413
Given a dictionary:
sample = {
'123': 'Foo',
'456': 'Bar',
'789': 'Hello',
'-111': 'World'
}
What is the most efficient way (approach and/or data structure) to get the closest (or less) key from a dictionary?
Note:
1. Even if key is a string, comparison should be numerical.
2. Keys can be "negative".
Example:
get_nearest_less_element(sample, '456') # returns 'Bar'
get_nearest_less_element(sample, '235') # returns 'Foo'
get_nearest_less_element(sample, '455') # returns 'Foo'
get_nearest_less_element(sample, '999') # returns 'Hello'
get_nearest_less_element(sample, '0') # returns 'World'
get_nearest_less_element(sample, '-110') # returns 'World'
get_nearest_less_element(sample, '-999') # should return an error since it is beyond the lower bound
Additional question:
Given the same data set, would a sorted OrderedDict
or a List of Tuples
or any other python data structure be a better approach?
Upvotes: 6
Views: 4801
Reputation: 5401
Given your data set, the most efficient data structure in terms of both set-up and look-up time complexity is a binary search tree, which gives you O(n log n) set-up and O(log n) look-up time complexity with O(n) space complexity.
The standard BST algorithm doesn't include your two special constraints (as I understand them)
Here is a BST implementation based on this implementation:
class Node(object):
def __init__(self, key, value, parent):
self.left = None
self.right = None
self.value = value
self.key = key
self.parent = parent
def __str__(self):
return ":".join(map(str, (self.key, self.value)))
class BinarySearchTree(object):
def __init__(self):
self.root = None
def getRoot(self):
return self.root
def __setitem__(self, key, value):
if(self.root == None):
self.root = Node(key, value, None)
else:
self._set(key, value, self.root)
def _set(self, key, value, node):
if key == node.key:
node.value = value
elif key < node.key:
if(node.left != None):
self._set(key, value, node.left)
else:
node.left = Node(key, value, node)
else:
if(node.right != None):
self._set(key, value, node.right)
else:
node.right = Node(key, value, node)
def __contains__(self, key):
return self._get(key) != None
def __getitem__(self, key):
if(self.root != None):
return self._get(key, self.root)
else:
return None
def _get(self, key, node):
if key == node.key:
return node.value
elif key < node.key and node.left != None:
return self._get(key, node.left)
elif key > node.key and node.right != None:
return self._get(key, node.right)
Here is a subclass that fulfills requirement 1:
class FuzzySearchTree(BinarySearchTree):
def _get(self, key, node):
if key == node.key:
return node.value
elif key < node.key:
if node.left != None:
return self._get(key, node.left)
else:
return self._checkMin(key, node)
else:
if node.right != None:
return self._get(key, node.right)
else:
return node.value # found the closest match that is larger
def _checkMin(self, key, node):
return node.value
To fulfill requirement 2, you need to keep track of the minimum value in the tree. You should probably do this by keeping track of the minimum value at insert time, but here is a different approach. This approach is not super efficient, but it should still be o(3 log n) == O(log n), so it's not bad. If you don't really need this, I wouldn't bother with it.
class MinBoundedFuzzySearchTree(FuzzySearchTree):
def _checkMin(self, key, node):
# Unless the value is lower than the minimum value in the tree # Not advised
next = node.parent
while next.parent != None:
next = next.parent # Go up the tree to the top
next = next.left
while next.left != None:
next = next.left # Go down the tree to the left
if next.key > key:
return None # outside the the range of the tree
# Return the max value less than the key, which is by definition the parent
return node.parent.value
Here are some pseudo-tests:
tree = BinarySearchTree()
tree[123] = 'Foo'
tree[456] = 'Bar'
tree[789] = 'Hello'
tree[-111] = 'World'
print "BST(456) == 'Bar': " + str(tree[456])
print "BST(235) == None: " + str(tree[235])
print "BST(455) == None: " + str(tree[455])
print "BST(999) == None: " + str(tree[999])
print "BST(0) == None: " + str(tree[0])
print "BST(123) == 'Foo': " + str(tree[123])
print "BST(-110) == None: " + str(tree[-110])
print "BST(-999) == None: " + str(tree[-999])
tree = FuzzySearchTree()
tree[123] = 'Foo'
tree[456] = 'Bar'
tree[789] = 'Hello'
tree[-111] = 'World'
print
print "FST(456) == 'Bar': " + str(tree[456])
print "FST(235) == 'Foo': " + str(tree[235])
print "FST(455) == 'Foo': " + str(tree[455])
print "FST(999) == 'Hello': " + str(tree[999])
print "FST(0) == 'World': " + str(tree[0])
print "FST(123) == 'Foo': " + str(tree[123])
print "FST(-110) == 'World': " + str(tree[-110])
print "FST(-999) == 'World': " + str(tree[-999])
tree = MinBoundedFuzzySearchTree()
tree[123] = 'Foo'
tree[456] = 'Bar'
tree[789] = 'Hello'
tree[-111] = 'World'
print
print "MBFST(456) == 'Bar': " + str(tree[456])
print "MBFST(235) == 'Foo': " + str(tree[235])
print "MBFST(455) == 'Foo': " + str(tree[455])
print "MBFST(999) == 'Hello': " + str(tree[999])
print "MBFST(0) == 'World': " + str(tree[0])
print "MBFST(123) == 'Foo': " + str(tree[123])
print "MBFST(-110) == 'World': " + str(tree[-110])
print "MBFST(-999) == None: " + str(tree[-999])
And here is what this prints:
"""
BST(456) == 'Bar': Bar
BST(235) == None: None
BST(455) == None: None
BST(999) == None: None
BST(0) == None: None
BST(123) == 'Foo': Foo
BST(-110) == None: None
BST(-999) == None: None
FST(456) == 'Bar': Bar
FST(235) == 'Foo': Foo
FST(455) == 'Foo': Foo
FST(999) == 'Hello': Hello
FST(0) == 'World': World
FST(123) == 'Foo': Foo
FST(-110) == 'World': World
FST(-999) == 'World': Foo
MBFST(456) == 'Bar': Bar
MBFST(235) == 'Foo': Foo
MBFST(455) == 'Foo': Foo
MBFST(999) == 'Hello': Hello
MBFST(0) == 'World': World
MBFST(123) == 'Foo': Foo
MBFST(-110) == 'World': World
MBFST(-999) == None: None
"""
Upvotes: 2
Reputation: 866
If you are creating or updating the sample only once or infrequently, but looking up values repeatedly, it would be most efficient to precompute a sorted numeric list in O(n log n) time. Then the entire dictionary doesn't need to be scanned; binary search gives O(log n) access. There is a python library module function for that, bisect.
from bisect import bisect
def nearest_index(sorted_keys, elem):
idx = bisect(sorted_keys, elem)
if idx >= len(sorted_keys):
idx = len(sorted_keys) - 1
elif idx > 0:
# find closest of the two neighbors
if elem <= (sorted_keys[idx-1] + sorted_keys[idx])/2.0:
idx -= 1
return idx
sample = {'123': 'Foo', '456': 'Bar', '789': 'Hello'}
sorted_keys = sorted(int(k) for k in sample.keys())
def get_nearest_element(sample, sorted_keys, elem):
elem_int = int(elem)
idx_nearest = nearest_index(sorted_keys, elem_int)
return sample[str(sorted_keys[idx_nearest])]
for elem in ['456', '235', '455', '999']:
print get_nearest_element(sample, sorted_keys, elem)
Upvotes: 2
Reputation: 29033
def get_nearest_less_element(d, k):
k = int(k)
return d[str(max(key for key in map(int, d.keys()) if key <= k))]
Edit to update with @Paul Hankin's code, but using <=
I'm not sure it needs a branch at all. Convert all the keys to numbers, find the ones less than or equal to k, get the max one - if k is in there you'll get it, otherwise you'll get the next biggest - convert back to string and lookup in the dictionary.
Tests: https://repl.it/C2dN/0
I don't know if it's the most efficient idea; since the dictionary you get is unordered, you have to iterate over each element because any of them could be the next biggest, and since you need a numeric comparison you have to convert them all to integers. It seems to me that any other structure would take more initialisation cost as you'd have to go over every item to put it into your structure first.
But it depends on your use case - if k
is very likely to be in the dictionary, it makes sense to change my code to have if k in d: return d[k] else: ...
branch because not doing the generator expression in that case would be quicker. If it's very likely not in the dictionary, it won't help much.
A pseudocode (untested) version which does sort they keys is below - this would be slower to use once, but possibly faster to query a lot:
# cache to store sorted keys between function calls
# nb. you will have to invalidate this cache (reset to [])
# when you get a new dictionary
sorted_keys = []
def get_nearest_less_element(d, k):
if k in d: # quick return if key is in dict
return d[k]
else:
# costly sort of the keys, only do this once
if not sorted_keys:
sorted_keys = sorted(int(key) for key in d.keys())
# quick run through the sorted key list up
# to latest item less than k
k = int(k)
nearest = sorted_keys[0]
for item in sorted_keys:
if item < k:
nearest = item
else:
break
return d[str(item)]
Upvotes: 6
Reputation: 26325
I did it this way:
def get_nearest_less_element(sample, key):
try:
if key not in sample:
candidates = []
for keys in sample:
if int(keys) < int(key):
candidates.append(keys)
return sample[max(candidates)]
return sample[key]
except ValueError:
print("key is beyond lower bounds")
Upvotes: 1
Reputation: 3011
The below module returns the value if the key is present else it finds the maximum key in a list of keys that are less than the input key.
def get_nearest_less_element(sample,key):
if key in sample:
return sample[key]
else:
return sample[str(max(x for x in sample.keys() if int(x) < int(key)))]
print get_nearest_less_element(sample, '456')
print get_nearest_less_element(sample, '235')
print get_nearest_less_element(sample, '455')
print get_nearest_less_element(sample, '999')
Output:
Bar
Foo
Foo
Hello
Edit: Edited the answer based on Paul's comment.
Upvotes: 5
Reputation: 6353
Here is a solution. Finds the closest key based on numerical comparison:
sample = {'123': 'Foo', '456': 'Bar', '789': 'Hello'}
def get_nearest_less_element(inpDict, targetNum):
diff = 2**32 - 1 # Very big number.
currentKey = None
for i in sample.keys():
newDiff = abs(int(i) - targetNum)
if newDiff < diff:
currentKey = i
diff = newDiff
return inpDict[currentKey]
print(get_nearest_less_element(sample, 500))
# Prints Bar
This is just a single loop through the dictionary, so operates in O(n) time and O(1) additional space.
Upvotes: 2