Reputation: 5615
I'm trying to understand how the internals of C#'s generic Dictionary work. I decompiled and got the dictionary code. I re-wrote it slightly separating the keys/values/hashcodes to separate arrays (I omitted the enumerator and interface implementations because they're irrelevant)
using System;
using System.Linq;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
public class Dictionary<TKey, TValue> {
int[] _Buckets;
int[] _HashCodes;
int[] _Next;
int _Count;
int _FreeList;
int _FreeCount;
TKey[] _Keys;
TValue[] _Values;
readonly IEqualityComparer<TKey> _Comparer;
public int Count {
get { return _Count - _FreeCount; }
}
public TValue this[TKey key] {
get {
int index = FindIndex(key);
if (index >= 0)
return _Values[index];
throw new KeyNotFoundException(key.ToString());
}
set { Insert(key, value, false); }
}
public Dictionary() : this(0, null) {
}
public Dictionary(int capacity) : this(capacity, null) {
}
public Dictionary(IEqualityComparer<TKey> comparer) : this(0, comparer) {
}
public Dictionary(int capacity, IEqualityComparer<TKey> comparer) {
if (capacity < 0)
throw new ArgumentOutOfRangeException("capacity");
Initialize(capacity);
_Comparer = (comparer ?? EqualityComparer<TKey>.Default);
}
public Dictionary(IDictionary<TKey, TValue> dictionary) : this(dictionary, null)
{
}
public Dictionary(IDictionary<TKey, TValue> dictionary, IEqualityComparer<TKey> comparer)
: this((dictionary != null) ? dictionary.Count : 0, comparer)
{
if (dictionary == null)
throw new ArgumentNullException("dictionary");
foreach (KeyValuePair<TKey, TValue> current in dictionary)
Add(current.Key, current.Value);
}
public bool ContainsValue(TValue value) {
if (value == null) {
for (int i = 0; i < _Count; i++) {
if (_HashCodes[i] >= 0 && _Values[i] == null)
return true;
}
} else {
var defaultComparer = EqualityComparer<TValue>.Default;
for (int i = 0; i < _Count; i++) {
if (_HashCodes[i] >= 0 && defaultComparer.Equals(_Values[i], value))
return true;
}
}
return false;
}
public bool ContainsKey(TKey key) {
return FindIndex(key) >= 0;
}
public void Clear() {
if (_Count <= 0)
return;
for (int i = 0; i < _Buckets.Length; i++)
_Buckets[i] = -1;
Array.Clear(_Keys, 0, _Count);
Array.Clear(_Values, 0, _Count);
Array.Clear(_HashCodes, 0, _Count);
Array.Clear(_Next, 0, _Count);
_FreeList = -1;
_Count = 0;
_FreeCount = 0;
}
public void Add(TKey key, TValue value) {
Insert(key, value, true);
}
private void Resize(int newSize, bool forceNewHashCodes) {
int[] bucketsCopy = new int[newSize];
for (int i = 0; i < bucketsCopy.Length; i++)
bucketsCopy[i] = -1;
var keysCopy = new TKey[newSize];
var valuesCopy = new TValue[newSize];
var hashCodesCopy = new int[newSize];
var nextCopy = new int[newSize];
Array.Copy(_Values, 0, valuesCopy, 0, _Count);
Array.Copy(_Keys, 0, keysCopy, 0, _Count);
Array.Copy(_HashCodes, 0, hashCodesCopy, 0, _Count);
Array.Copy(_Next, 0, nextCopy, 0, _Count);
if (forceNewHashCodes) {
for (int i = 0; i < _Count; i++) {
if (hashCodesCopy[i] != -1)
hashCodesCopy[i] = (_Comparer.GetHashCode(keysCopy[i]) & 2147483647);
}
}
for (int i = 0; i < _Count; i++) {
int index = hashCodesCopy[i] % newSize;
nextCopy[i] = bucketsCopy[index];
bucketsCopy[index] = i;
}
_Buckets = bucketsCopy;
_Keys = keysCopy;
_Values = valuesCopy;
_HashCodes = hashCodesCopy;
_Next = nextCopy;
}
private void Resize()
{
Resize(PrimeHelper.ExpandPrime(_Count), false);
}
public bool Remove(TKey key)
{
if (key == null)
throw new ArgumentNullException("key");
int hash = _Comparer.GetHashCode(key) & 2147483647;
int index = hash % _Buckets.Length;
int num = -1;
for (int i = _Buckets[index]; i >= 0; i = _Next[i]) {
if (_HashCodes[i] == hash && _Comparer.Equals(_Keys[i], key)) {
if (num < 0)
_Buckets[index] = _Next[i];
else
_Next[num] = _Next[i];
_HashCodes[i] = -1;
_Next[i] = _FreeList;
_Keys[i] = default(TKey);
_Values[i] = default(TValue);
_FreeList = i;
_FreeCount++;
return true;
}
num = i;
}
return false;
}
private void Insert(TKey key, TValue value, bool add) {
if (key == null)
throw new ArgumentNullException("key");
if (_Buckets == null)
Initialize(0);
int hash = _Comparer.GetHashCode(key) & 2147483647;
int index = hash % _Buckets.Length;
for (int i = _Buckets[index]; i >= 0; i = _Next[i]) {
if (_HashCodes[i] == hash && _Comparer.Equals(_Keys[i], key)) {
if (add)
throw new ArgumentException("Key already exists: " + key);
_Values[i] = value;
return;
}
}
int num;
if (_FreeCount > 0) {
num = _FreeList;
_FreeList = _Next[num];
_FreeCount--;
} else {
if (_Count == _Keys.Length) {
Resize();
index = hash % _Buckets.Length;
}
num = _Count;
_Count++;
}
_HashCodes[num] = hash;
_Next[num] = _Buckets[index];
_Keys[num] = key;
_Values[num] = value;
_Buckets[index] = num;
}
private void Initialize(int capacity) {
int prime = PrimeHelper.GetPrime(capacity);
_Buckets = new int[prime];
for (int i = 0; i < _Buckets.Length; i++)
_Buckets[i] = -1;
_Keys = new TKey[prime];
_Values = new TValue[prime];
_HashCodes = new int[prime];
_Next = new int[prime];
_FreeList = -1;
}
private int FindIndex(TKey key) {
if (key == null)
throw new ArgumentNullException("key");
if (_Buckets != null) {
int hash = _Comparer.GetHashCode(key) & 2147483647;
for (int i = _Buckets[hash % _Buckets.Length]; i >= 0; i = _Next[i]) {
if (_HashCodes[i] == hash && _Comparer.Equals(_Keys[i], key))
return i;
}
}
return -1;
}
public bool TryGetValue(TKey key, out TValue value) {
int index = FindIndex(key);
if (index >= 0) {
value = _Values[index];
return true;
}
value = default(TValue);
return false;
}
public TValue ValueOrDefault(TKey key) {
return ValueOrDefault(key, default(TValue));
}
public TValue ValueOrDefault(TKey key, TValue defaultValue) {
//return this[key, defaultValue];
int index = FindIndex(key);
if (index >= 0)
return _Values[index];
return defaultValue;
}
private static class PrimeHelper {
public static readonly int[] Primes = new int[] {
3, 7, 11, 17,
23, 29, 37, 47,
59, 71, 89, 107,
131, 163, 197, 239,
293, 353, 431, 521,
631, 761, 919, 1103,
1327, 1597, 1931, 2333,
2801, 3371, 4049, 4861,
5839, 7013, 8419, 10103,
12143, 14591, 17519, 21023,
25229, 30293, 36353, 43627,
52361, 62851, 75431, 90523,
108631, 130363, 156437, 187751,
225307, 270371, 324449, 389357,
467237, 560689, 672827, 807403,
968897, 1162687, 1395263, 1674319,
2009191, 2411033, 2893249, 3471899,
4166287, 4999559, 5999471, 7199369
};
public static bool IsPrime(int candidate) {
if ((candidate & 1) != 0) {
int num = (int)Math.Sqrt((double)candidate);
for (int i = 3; i <= num; i += 2) {
if (candidate % i == 0) {
return false;
}
}
return true;
}
return candidate == 2;
}
public static int GetPrime(int min) {
if (min < 0)
throw new ArgumentException("min < 0");
for (int i = 0; i < PrimeHelper.Primes.Length; i++) {
int prime = PrimeHelper.Primes[i];
if (prime >= min)
return prime;
}
for (int i = min | 1; i < 2147483647; i += 2) {
if (PrimeHelper.IsPrime(i) && (i - 1) % 101 != 0)
return i;
}
return min;
}
public static int ExpandPrime(int oldSize) {
int num = 2 * oldSize;
if (num > 2146435069 && 2146435069 > oldSize) {
return 2146435069;
}
return PrimeHelper.GetPrime(num);
}
}
}
What's confusing me is:
_FreeList
and _FreeCount
variables: what do they represent?Count
and _Count
?_Next
seems to be related to hash collision, does it store the next index of the key/value that has the same hash?num
doing in Remove
?Key
we compute Index = GetHashCode(Key) % Buckets.Length;
which gives us an index that we could use in the _Keys/_Values/_HashCodes
arrays. Why do we need to compare _HashCodes[i] == hash
? I mean we got the index by computing the hash, so of course _Keys[i]
hash the same hash right?Note that there was an actual use case for decompiling the code and using it. Unity3D doesn't know how to serialize dictionaries and the ways you get around it normally is not so great, so I took the actual code of Dictionary and annotated the necessary fields with [SerializeField]
, now Unity can serialize the class without any wrappers.
Upvotes: 2
Views: 508
Reputation: 726609
The implementation that you show uses arrays _Keys
to store keys and _Values
to store their corresponding values. Only the portion of these arrays from index zero, inclusive, to index _Count
, exclusive, is occupied. The elements at _Count
and above are not used.
_Keys
and _Values
arrays are populated sequentially on addition. When a removal occurs, the corresponding elements are placed on free list. When the next addition occurs, the last element returned to free list is used before considering the expansion to the element at _Count
.
_Next
seems to be related to hash collision, does it store the next index of the key/value that has the same hash?
_Next
has two purposes, depending on whether the corresponding item is used or is on a free list.
i
is used, _Next[i]
represents the index of the next element with colliding hash code (it is not necessarily the same hash code!)i
is on a free list, _Next[i]
represents the next item on the free list (i.e. the item that was placed on free list before i
-th, if any).What's the difference between
Count
and_Count
?
_Count
is the high watermark of the _Keys
and _Values
arrays; Count
is the actual count, which is computed as high watermark less the number of items placed on the free list (i.e. max added minus currently removed).
So given a Key we compute
Index = GetHashCode(Key) % Buckets.Length;
which gives us an index that we could use in the_Keys
/_Values
/_HashCodes
arrays. Why do we need to compare_HashCodes[i] == hash
?
Because GetHashCode(Key) % Buckets.Length
remainder may be the same for different values of hash code. You can optimize the code by comparing hash code before invoking Equals
, although this step is not necessary.
What's
num
doing inRemove
?
An item that we are removing may be part of a list of items with colliding hash codes. The item may be at the head of the list, or it may be somewhere else on the list. The code uses num
to treat these situations differently:
_Buckets[]
_Next[]
index needs to be adjusted without promoting to the head.next
starts at a negative number, and switches to a non-negative number after the initial iteration. The check if (num < 0)
means "are we on the first iteration?"
Upvotes: 4