1. 程式人生 > 實用技巧 >C#原始碼(十二) HashSet

C#原始碼(十二) HashSet

基礎介紹

倉儲地址

https://github.com/dotnet/runtime/

我本地的專案位置

C:\project\SourceCode\runtime-5.0.0-preview.3.20214.6\src\libraries\System.Collections

實現原理和Dictionary差不多,都是鏈地址法解決衝突。

Dictionary 有Key Value

HashSet只有Value

實際容器為Slot[] m_slots;

/// <summary>
/// One entry of the set's backing storage: the cached hash code, the link to the
/// next entry in the same bucket chain, and the stored element.
/// </summary>
internal struct Slot
{
    internal int hashCode;      // Lower 31 bits of hash code, -1 if unused
    internal int next;          // Index of next entry, -1 if last
    internal T value;           // The element held by this slot
}

HashSet操作元素的時間複雜度接近O(1)

定義int[] m_buckets 陣列來儲存元素在實際容器Slot[] m_slots 位置

即 Value 儲存在以 m_slots[m_buckets[hashCode % m_buckets.Length] - 1] 為鏈頭的鏈上(hashCode 為雜湊值取低31位;m_buckets 中存的是「索引 + 1」,0 表示空桶)

容器長度為質數

質數只能被1和自身整除

減少位置衝突

資料已滿時再新增資料會自動擴容,新容量為不小於當前容量2倍的質數

新建一個2倍大小的容器

資料拷貝過去 重新計算位置

使用優化點

已知容器大小的情況 直接初始化對應大小

自定義元素型別可以實現IEqualityComparer<T>,以便更高效地判斷相等和獲取HashCode

雜湊函式

當位置衝突時,使用Slot.next記錄同一個桶中下一個元素的索引,也就是拉鍊法解決衝突。

hashCode = value == null ? 0 : InternalGetHashCode(comparer.GetHashCode(value));

這裡comparer就是IEqualityComparer<T>? comparer = _comparer;可以是預設的,也可以建構函式傳入

InternalGetHashCode方法如下

/// <summary>
/// Computes the non-negative hash code used to place <paramref name="item"/> in a bucket.
/// </summary>
/// <param name="item">The element to hash; a null element hashes to 0.</param>
/// <param name="comparer">Comparer that supplies the hash code, or null to use the element's own GetHashCode.</param>
/// <returns>The hash code masked with Lower31BitMask.</returns>
private static int InternalGetHashCode(T item, IEqualityComparer<T>? comparer)
{
    if (item == null)
    {
        return 0;
    }

    // Fall back to the element's own hash code when no comparer was supplied.
    int rawHash = comparer == null ? item.GetHashCode() : comparer.GetHashCode(item);
    return rawHash & Lower31BitMask;
}

最後通過hashCode對桶長度求餘獲取bucket

bucket = hashCode % _buckets!.Length;

內部AddIfNotPresent方法

/// <summary>
/// Inserts <paramref name="value"/> into the set unless an equal element is already stored.
/// </summary>
/// <param name="value">The element to insert.</param>
/// <returns>true if the element was inserted; false if an equal element was already present.</returns>
/// <exception cref="InvalidOperationException">
/// The bucket chain is longer than the slot array, which means it forms a loop.
/// </exception>
private bool AddIfNotPresent(T value)
{
    if (_buckets == null)
    {
        Initialize(0);
    }

    int hashCode = InternalGetHashCode(value);
    int bucket = hashCode % _buckets.Length;

    // Work against a local reference to the slot array.
    Slot[] slots = _slots;

    // Walk the chain for this bucket. Bucket entries hold (slot index + 1), so an
    // empty bucket yields -1 here and the loop body is skipped.
    int visited = 0;
    int cursor = _buckets[bucket] - 1;
    while (cursor >= 0)
    {
        // An equal element is already stored.
        if (slots[cursor].hashCode == hashCode && _comparer.Equals(slots[cursor].value, value))
        {
            return false;
        }

        // More links followed than there are slots.
        if (visited >= slots.Length)
        {
            // The chain of entries forms a loop, which means a concurrent update has happened.
            throw new InvalidOperationException( );
        }

        visited++;
        cursor = slots[cursor].next;
    }

    // Choose the slot to write: reuse a freed slot when one exists, otherwise append.
    int target;
    if (_freeList < 0)
    {
        if (_lastIndex == slots.Length)
        {
            IncreaseCapacity();
            // The arrays are replaced during resize, so refresh the local reference
            // and recompute the bucket against the new bucket count.
            slots = _slots;
            bucket = hashCode % _buckets.Length;
        }

        target = _lastIndex++;
    }
    else
    {
        target = _freeList;
        _freeList = slots[target].next;
    }

    // Store the element and make it the new head of the bucket chain.
    slots[target].hashCode = hashCode;
    slots[target].value = value;
    slots[target].next = _buckets[bucket] - 1;
    _buckets[bucket] = target + 1;
    _count++;
    _version++;

    return true;
}

HashHelpers輔助類

這裡HashHelpers是用來求素數和獲取下一次擴容的大小的輔助類,裡面有一個數組存放基礎素數,如果容量超過已有素數,會通過數學的方法計算出需要的素數。

/// <summary>
/// Helper routines for picking prime-number sizes for hash tables: a primality test,
/// a lookup for the next suitable prime, and the growth policy used when expanding.
/// </summary>
public class HashHelpers
{
    public const uint HashCollisionThreshold = 100;

    // This is the maximum prime smaller than Array.MaxArrayLength
    public const int MaxPrimeArrayLength = 0x7FEFFFFD;

    // GetPrime rejects computed primes p for which (p - 1) is a multiple of this value.
    public const int HashPrime = 101;

    // Precomputed primes in ascending order; GetPrime scans this table before
    // falling back to computing a prime.
    private static readonly int[] s_primes =
    {
        3, 7, 11, 17, 23, 29, 37, 47, 59, 71, 89, 107, 131, 163, 197, 239, 293, 353, 431, 521, 631, 761, 919,
        1103, 1327, 1597, 1931, 2333, 2801, 3371, 4049, 4861, 5839, 7013, 8419, 10103, 12143, 14591,
        17519, 21023, 25229, 30293, 36353, 43627, 52361, 62851, 75431, 90523, 108631, 130363, 156437,
        187751, 225307, 270371, 324449, 389357, 467237, 560689, 672827, 807403, 968897, 1162687, 1395263,
        1674319, 2009191, 2411033, 2893249, 3471899, 4166287, 4999559, 5999471, 7199369
    };

    /// <summary>
    /// Determines whether <paramref name="candidate"/> is a prime number.
    /// </summary>
    /// <param name="candidate">The number to test.</param>
    /// <returns>true if <paramref name="candidate"/> is prime; otherwise false.</returns>
    public static bool IsPrime(int candidate)
    {
        // Numbers below 2 are never prime. Without this guard, 1 and negative odd
        // values would skip the divisor loop below and be reported as prime.
        if (candidate < 2)
        {
            return false;
        }

        // An even number is prime only when it is 2.
        if ((candidate & 1) == 0)
        {
            return candidate == 2;
        }

        // Any composite number has a divisor no larger than its square root.
        int limit = (int)Math.Sqrt(candidate);

        // Try odd divisors only; even ones were excluded above.
        for (int divisor = 3; divisor <= limit; divisor += 2)
        {
            if ((candidate % divisor) == 0)
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns a prime that is greater than or equal to <paramref name="min"/>.
    /// </summary>
    /// <param name="min">The smallest acceptable value.</param>
    /// <returns>
    /// The first table prime that is at least <paramref name="min"/>; beyond the table,
    /// the first computed prime p at or above <paramref name="min"/> for which
    /// (p - 1) is not a multiple of <see cref="HashPrime"/>.
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="min"/> is negative.</exception>
    public static int GetPrime(int min)
    {
        if (min < 0)
        {
            throw new ArgumentException();
        }

        // Scan the precomputed table first.
        foreach (int prime in s_primes)
        {
            if (prime >= min)
            {
                return prime;
            }
        }

        // Outside of our predefined table. Compute the hard way.
        // OR-ing with 1 moves an even start to the next odd number; stepping by 2
        // then visits odd candidates only.
        for (int i = (min | 1); i < int.MaxValue; i += 2)
        {
            if (IsPrime(i) && ((i - 1) % HashPrime != 0))
            {
                return i;
            }
        }

        return min;
    }

    /// <summary>
    /// Returns the size a hash table should grow to from <paramref name="oldSize"/>.
    /// </summary>
    /// <param name="oldSize">The current table size.</param>
    /// <returns>
    /// A prime that is at least twice <paramref name="oldSize"/>, or
    /// <see cref="MaxPrimeArrayLength"/> when doubling would exceed it.
    /// </returns>
    public static int ExpandPrime(int oldSize)
    {
        // The doubling is allowed to wrap around; the (uint) comparison below
        // detects that case. unchecked keeps this working in a checked build.
        int newSize = unchecked(2 * oldSize);

        // Allow the hashtables to grow to maximum possible size (~2G elements) before encountering capacity overflow.
        // Note that this check works even when the doubled size overflowed thanks to the (uint) cast
        if (unchecked((uint)newSize) > MaxPrimeArrayLength && MaxPrimeArrayLength > oldSize)
        {
            return MaxPrimeArrayLength;
        }

        return GetPrime(newSize);
    }
}

對外公有方法

/// <summary>
/// Adds every element of <paramref name="other"/> to this set; elements already present are left as they are.
/// </summary>
/// <param name="other">The collection whose elements are merged into this set.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void UnionWith(IEnumerable<T> other)
{
    if (other is null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // AddIfNotPresent skips duplicates, so each element can be offered unconditionally.
    foreach (T candidate in other)
    {
        AddIfNotPresent(candidate);
    }
}

/// <summary>
/// Removes from this set every element that appears in <paramref name="other"/>.
/// </summary>
/// <param name="other">The collection of elements to remove.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void ExceptWith(IEnumerable<T> other)
{
    if (other is null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // Nothing to do when this set is already empty.
    if (m_count != 0)
    {
        if (other == this)
        {
            // A set minus itself is the empty set.
            Clear();
        }
        else
        {
            // Remove each element of other from this set.
            foreach (T item in other)
            {
                Remove(item);
            }
        }
    }
}

/// <summary>
/// Modifies this set so that it holds the elements found in either this set or
/// <paramref name="other"/>, but not in both.
/// </summary>
/// <param name="other">The collection to compare against.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void SymmetricExceptWith(IEnumerable<T> other)
{
    if (other is null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // With an empty set the symmetric difference is simply other.
    if (m_count == 0)
    {
        UnionWith(other);
        return;
    }

    // The symmetric difference of a set with itself is the empty set.
    if (other == this)
    {
        Clear();
        return;
    }

    // A hash set that uses the same equality comparer is known to hold unique
    // elements by our definition of equality, which allows the faster path that
    // needs no bit array. With a different comparer, or any other kind of
    // collection, uniqueness cannot be assumed and the general path is used.
    if (other is MyHashSet<T> sameKindSet && AreEqualityComparersEqual(this, sameKindSet))
    {
        SymmetricExceptWithUniqueHashSet(sameKindSet);
        return;
    }

    SymmetricExceptWithEnumerable(other);
}
對外公有方法

交集和子集等判斷

/// <summary>
/// Modifies this set so that it keeps only the elements that also appear in <paramref name="other"/>.
/// </summary>
/// <param name="other">The collection to intersect with.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void IntersectWith(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // The intersection of the empty set with anything is still empty.
    if (m_count == 0)
    {
        return;
    }

    ICollection<T> otherAsCollection = other as ICollection<T>;
    if (otherAsCollection != null)
    {
        // Intersecting with an empty collection leaves nothing.
        if (otherAsCollection.Count == 0)
        {
            Clear();
            return;
        }

        MyHashSet<T> otherAsSet = other as MyHashSet<T>;
        // faster if other is a hashset using same equality comparer; so check
        // that other is a hashset using the same equality comparer.
        if (otherAsSet != null && AreEqualityComparersEqual(this, otherAsSet))
        {
            // Call the helper by the name it is declared with below.
            IntersectWithMyHashSetWithSameEC(otherAsSet);
            return;
        }
    }

    IntersectWithEnumerable(other);
}

/// <summary>
/// Removes from this set every element that <paramref name="other"/> does not contain.
/// The caller has already checked that both sets use the same equality comparer.
/// </summary>
/// <param name="other">The set to intersect with.</param>
private void IntersectWithMyHashSetWithSameEC(MyHashSet<T> other)
{
    for (int i = 0; i < _lastIndex; i++)
    {
        // A negative hashCode marks an unused slot; skip those.
        if (_slots[i].hashCode >= 0)
        {
            T item = _slots[i].value;
            if (!other.Contains(item))
            {
                Remove(item);
            }
        }
    }
}

/// <summary>
/// Iterate over other. If contained in this, mark an element in bit array corresponding to
/// its position in _slots. If anything is unmarked (in bit array), remove it.
/// This attempts to allocate on the stack, if the required length is at most StackAllocThreshold.
/// The method is marked unsafe because it takes a raw pointer to the stack-allocated array.
/// </summary>
/// <param name="other">The collection whose elements are to be kept in this set.</param>
private unsafe void IntersectWithEnumerable(IEnumerable<T> other)
{
    // keep track of current last index; don't want to move past the end of our bit array
    // (could happen if another thread is modifying the collection)
    int originalLastIndex = _lastIndex;
    int intArrayLength = BitHelper.ToIntArrayLength(originalLastIndex);

    BitHelper bitHelper;
    // Choose how the BitHelper is constructed based on the required length:
    // stack memory for short arrays, a heap-allocated int[] otherwise.
    if (intArrayLength <= StackAllocThreshold)
    {
        int* bitArrayPtr = stackalloc int[intArrayLength];
        bitHelper = new BitHelper(bitArrayPtr, intArrayLength);
    }
    else
    {
        int[] bitArray = new int[intArrayLength];
        bitHelper = new BitHelper(bitArray, intArrayLength);
    }

    // mark if contains: find index of in slots array and mark corresponding element in bit array
    foreach (T item in other)
    {
        int index = InternalIndexOf(item);
        if (index >= 0)
        {
            bitHelper.MarkBit(index);
        }
    }

    // if anything unmarked, remove it. Perf can be optimized here if BitHelper had a 
    // FindFirstUnmarked method.
    for (int i = 0; i < originalLastIndex; i++)
    {
        // Only slots with a non-negative hashCode are considered; a negative value marks an unused slot.
        if (_slots[i].hashCode >= 0 && !bitHelper.IsMarked(i))
        {
            Remove(_slots[i].value);
        }
    }
}
求和other交集
/// <summary>
/// Determines whether every element of this set also appears in <paramref name="other"/>.
/// </summary>
/// <param name="other">The collection to compare against.</param>
/// <returns>true if this set is a subset of <paramref name="other"/>; otherwise false.</returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public bool IsSubsetOf(IEnumerable<T> other)
{
    if (other is null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // The empty set is a subset of any set.
    if (m_count == 0)
    {
        return true;
    }

    // Fast path: other is a hash set using the same equality comparer, so its
    // elements are unique by our definition of equality.
    if (other is MyHashSet<T> sameKindSet && AreEqualityComparersEqual(this, sameKindSet))
    {
        // A set with more elements than other cannot be its subset; otherwise
        // check that each element of this set is contained in other.
        return m_count <= sameKindSet.Count && IsSubsetOfHashSetWithSameEC(sameKindSet);
    }

    // General path: count the elements of this set that other contains and
    // the elements of other that this set does not contain.
    ElementCount counts = CheckUniqueAndUnfoundElements(other, false);
    return counts.uniqueCount == m_count && counts.unfoundCount >= 0;
}
是否為other的子集