哈希表与字典实现

学习目标

深入理解哈希表的设计原理和冲突解决机制
掌握渐进式 rehash 的完整实现和优化策略
实现高效的哈希算法（MurmurHash/SipHash）
理解负载因子和扩容策略的平衡
对比分析字典与 Go map 的性能差异

哈希表基础理论

1. 哈希表原理

哈希表是一种通过哈希函数将键映射到数组索引的数据结构，实现 O(1) 的平均查找、插入和删除操作。

┌─────────────────────────────────────────────────────────────┐
│                    哈希表结构                                │
├─────────────────────────────────────────────────────────────┤
│  索引  │  键值对  │  键值对  │  键值对  │  键值对  │  键值对  │
│   0   │  key1   │  key2   │  key3   │  key4   │  key5   │
│       │  val1   │  val2   │  val3   │  val4   │  val5   │
├───────┼─────────┼─────────┼─────────┼─────────┼─────────┤
│   1   │  key6   │  key7   │  key8   │  key9   │  key10  │
│       │  val6   │  val7   │  val8   │  val9   │  val10  │
├───────┼─────────┼─────────┼─────────┼─────────┼─────────┤
│   2   │  key11  │  key12  │  key13  │  key14  │  key15  │
│       │  val11  │  val12  │  val13  │  val14  │  val15  │
└───────┴─────────┴─────────┴─────────┴─────────┴─────────┘

2. 哈希冲突解决

2.1 链地址法（Chaining）

每个槽位维护一个链表，冲突的键值对存储在链表中：

// 链地址法实现
type HashNode struct {
    key   string
    value interface{}
    next  *HashNode
}

type HashTable struct {
    buckets []*HashNode
    size    int
    count   int
}

2.2 开放地址法（Open Addressing）

当发生冲突时，在哈希表中寻找下一个可用的槽位：

线性探测：h(k, i) = (h(k) + i) mod m
二次探测：h(k, i) = (h(k) + i²) mod m
双重哈希：h(k, i) = (h₁(k) + i·h₂(k)) mod m

️ Redis 字典设计

1. 字典结构

Redis 字典使用链地址法解决冲突，并支持渐进式 rehash：

// Redis 字典结构（简化版）
typedef struct dictEntry {
    void *key;              // 键
    union {
        void *val;          // 值
        uint64_t u64;       // 64位整数
        int64_t s64;        // 64位有符号整数
        double d;           // 双精度浮点数
    } v;
    struct dictEntry *next; // 指向下一个节点
} dictEntry;

typedef struct dictht {
    dictEntry **table;      // 哈希表数组
    unsigned long size;     // 哈希表大小
    unsigned long sizemask; // 哈希表大小掩码
    unsigned long used;     // 已使用节点数
} dictht;

typedef struct dict {
    dictType *type;         // 类型特定函数
    void *privdata;         // 私有数据
    dictht ht[2];           // 两个哈希表
    long rehashidx;         // rehash 索引
    int iterators;          // 迭代器数量
} dict;

2. 渐进式 Rehash

Redis 使用渐进式 rehash 避免一次性迁移大量数据造成的阻塞：

准备阶段：创建新的哈希表 ht[1]
渐进阶段：每次操作时迁移一部分数据
完成阶段：释放旧哈希表，完成 rehash

️ Go 语言字典实现

1. 基础结构定义

// dict/dict.go
package dict

import (
    "fmt"
    "hash/fnv"
    "math"
    "sync"
    "unsafe"
)

// 字典类型
type DictType struct {
    HashFunc  func(key interface{}) uint64
    EqualFunc func(key1, key2 interface{}) bool
    DupFunc   func(key interface{}) interface{}
    FreeFunc  func(key interface{})
}

// 字典条目
type DictEntry struct {
    key   interface{}
    value interface{}
    next  *DictEntry
}

// 哈希表
type DictHt struct {
    table    []*DictEntry
    size     uint64
    sizemask uint64
    used     uint64
}

// 字典
type Dict struct {
    dictType *DictType
    privdata interface{}
    ht       [2]*DictHt
    rehashidx int64
    iterators int64
    mu       sync.RWMutex
}

// 创建字典
func NewDict(dictType *DictType) *Dict {
    return &Dict{
        dictType:   dictType,
        ht:         [2]*DictHt{newDictHt(4), nil},
        rehashidx:  -1,
        iterators:  0,
    }
}

// 创建哈希表
func newDictHt(size uint64) *DictHt {
    return &DictHt{
        table:    make([]*DictEntry, size),
        size:     size,
        sizemask: size - 1,
        used:     0,
    }
}

2. 哈希函数实现

// dict/hash.go
package dict

import (
    "hash/fnv"
    "hash/maphash"
    "unsafe"
)

// 默认哈希函数（FNV-1a）
func defaultHashFunc(key interface{}) uint64 {
    switch k := key.(type) {
    case string:
        return hashString(k)
    case int:
        return hashInt(uint64(k))
    case int64:
        return hashInt(uint64(k))
    case uint64:
        return hashInt(k)
    case []byte:
        return hashBytes(k)
    default:
        // 使用反射获取字符串表示
        return hashString(fmt.Sprintf("%v", k))
    }
}

// 字符串哈希（FNV-1a）
func hashString(s string) uint64 {
    h := fnv.New64a()
    h.Write([]byte(s))
    return h.Sum64()
}

// 整数哈希
func hashInt(n uint64) uint64 {
    // 使用乘法哈希
    return n * 0x9e3779b97f4a7c15
}

// 字节数组哈希
func hashBytes(b []byte) uint64 {
    h := fnv.New64a()
    h.Write(b)
    return h.Sum64()
}

// SipHash 实现（Redis 使用）
func sipHash(key []byte, seed [16]byte) uint64 {
    // 简化的 SipHash 实现
    h := maphash.Hash{}
    h.SetSeed(maphash.MakeSeed())
    h.Write(key)
    return h.Sum64()
}

// MurmurHash 实现
func murmurHash(key []byte, seed uint32) uint32 {
    const (
        c1 = 0xcc9e2d51
        c2 = 0x1b873593
        r1 = 15
        r2 = 13
        m  = 5
        n  = 0xe6546b64
    )
    
    h := seed
    length := len(key)
    
    // 处理 4 字节块
    for i := 0; i < length-3; i += 4 {
        k := uint32(key[i]) | uint32(key[i+1])<<8 | uint32(key[i+2])<<16 | uint32(key[i+3])<<24
        k *= c1
        k = (k << r1) | (k >> (32 - r1))
        k *= c2
        h ^= k
        h = (h << r2) | (h >> (32 - r2))
        h = h*5 + n
    }
    
    // 处理剩余字节
    switch length & 3 {
    case 3:
        h ^= uint32(key[length-3]) << 16
        fallthrough
    case 2:
        h ^= uint32(key[length-2]) << 8
        fallthrough
    case 1:
        h ^= uint32(key[length-1])
        h *= c1
    }
    
    // 最终混合
    h ^= uint32(length)
    h ^= h >> 16
    h *= 0x85ebca6b
    h ^= h >> 13
    h *= 0xc2b2ae35
    h ^= h >> 16
    
    return h
}

3. 基础操作实现

// dict/operations.go
package dict

// 设置键值对
func (d *Dict) Set(key, value interface{}) error {
    d.mu.Lock()
    defer d.mu.Unlock()
    
    // 检查是否需要 rehash
    if d.isRehashing() {
        d.rehashStep()
    }
    
    // 计算哈希值
    hash := d.dictType.HashFunc(key)
    
    // 查找键是否已存在
    if entry := d.findEntry(key, hash); entry != nil {
        // 更新值
        entry.value = value
        return nil
    }
    
    // 添加新条目
    return d.addEntry(key, value, hash)
}

// 获取值
func (d *Dict) Get(key interface{}) (interface{}, bool) {
    d.mu.RLock()
    defer d.mu.RUnlock()
    
    hash := d.dictType.HashFunc(key)
    entry := d.findEntry(key, hash)
    
    if entry == nil {
        return nil, false
    }
    
    return entry.value, true
}

// 删除键
func (d *Dict) Delete(key interface{}) bool {
    d.mu.Lock()
    defer d.mu.Unlock()
    
    // 检查是否需要 rehash
    if d.isRehashing() {
        d.rehashStep()
    }
    
    hash := d.dictType.HashFunc(key)
    return d.deleteEntry(key, hash)
}

// 检查键是否存在
func (d *Dict) Exists(key interface{}) bool {
    d.mu.RLock()
    defer d.mu.RUnlock()
    
    hash := d.dictType.HashFunc(key)
    return d.findEntry(key, hash) != nil
}

// 获取字典大小
func (d *Dict) Size() uint64 {
    d.mu.RLock()
    defer d.mu.RUnlock()
    
    if d.isRehashing() {
        return d.ht[0].used + d.ht[1].used
    }
    
    return d.ht[0].used
}

// 清空字典
func (d *Dict) Clear() {
    d.mu.Lock()
    defer d.mu.Unlock()
    
    // 清空两个哈希表
    for i := 0; i < 2; i++ {
        if d.ht[i] != nil {
            d.clearHt(d.ht[i])
        }
    }
    
    d.rehashidx = -1
}

// 清空哈希表
func (d *Dict) clearHt(ht *DictHt) {
    for i := uint64(0); i < ht.size; i++ {
        entry := ht.table[i]
        for entry != nil {
            next := entry.next
            if d.dictType.FreeFunc != nil {
                d.dictType.FreeFunc(entry.key)
                d.dictType.FreeFunc(entry.value)
            }
            entry = next
        }
        ht.table[i] = nil
    }
    ht.used = 0
}

4. 查找和添加操作

// dict/find_add.go
package dict

// 查找条目
func (d *Dict) findEntry(key interface{}, hash uint64) *DictEntry {
    // 检查两个哈希表
    for i := 0; i < 2; i++ {
        ht := d.ht[i]
        if ht == nil {
            continue
        }
        
        // 计算索引
        index := hash & ht.sizemask
        
        // 遍历链表
        entry := ht.table[index]
        for entry != nil {
            if d.dictType.EqualFunc(entry.key, key) {
                return entry
            }
            entry = entry.next
        }
        
        // 如果不在 rehash 中，只检查第一个表
        if !d.isRehashing() {
            break
        }
    }
    
    return nil
}

// 添加条目
func (d *Dict) addEntry(key, value interface{}, hash uint64) error {
    // 选择哈希表
    ht := d.ht[0]
    if d.isRehashing() {
        ht = d.ht[1]
    }
    
    // 检查是否需要扩容
    if !d.isRehashing() && d.needRehash() {
        return d.expand()
    }
    
    // 计算索引
    index := hash & ht.sizemask
    
    // 创建新条目
    entry := &DictEntry{
        key:   key,
        value: value,
        next:  ht.table[index],
    }
    
    // 插入到链表头部
    ht.table[index] = entry
    ht.used++
    
    return nil
}

// 删除条目
func (d *Dict) deleteEntry(key interface{}, hash uint64) bool {
    // 检查两个哈希表
    for i := 0; i < 2; i++ {
        ht := d.ht[i]
        if ht == nil {
            continue
        }
        
        // 计算索引
        index := hash & ht.sizemask
        
        // 遍历链表
        entry := ht.table[index]
        var prev *DictEntry
        
        for entry != nil {
            if d.dictType.EqualFunc(entry.key, key) {
                // 删除条目
                if prev == nil {
                    ht.table[index] = entry.next
                } else {
                    prev.next = entry.next
                }
                
                // 释放内存
                if d.dictType.FreeFunc != nil {
                    d.dictType.FreeFunc(entry.key)
                    d.dictType.FreeFunc(entry.value)
                }
                
                ht.used--
                return true
            }
            
            prev = entry
            entry = entry.next
        }
        
        // 如果不在 rehash 中，只检查第一个表
        if !d.isRehashing() {
            break
        }
    }
    
    return false
}

5. 渐进式 Rehash 实现

// dict/rehash.go
package dict

// 检查是否需要 rehash
func (d *Dict) needRehash() bool {
    ht := d.ht[0]
    
    // 如果哈希表为空，需要初始化
    if ht.size == 0 {
        return true
    }
    
    // 负载因子超过 1 时扩容
    if ht.used >= ht.size {
        return true
    }
    
    // 负载因子小于 0.1 时缩容
    if ht.used < ht.size/10 && ht.size > 4 {
        return true
    }
    
    return false
}

// 开始 rehash
func (d *Dict) expand() error {
    ht := d.ht[0]
    newSize := ht.size
    
    // 计算新大小
    if ht.used >= ht.size {
        newSize = ht.size * 2
    } else if ht.used < ht.size/10 && ht.size > 4 {
        newSize = ht.size / 2
    }
    
    // 创建新哈希表
    d.ht[1] = newDictHt(newSize)
    d.rehashidx = 0
    
    return nil
}

// 检查是否在 rehash
func (d *Dict) isRehashing() bool {
    return d.rehashidx != -1
}

// 执行一步 rehash
func (d *Dict) rehashStep() {
    if !d.isRehashing() {
        return
    }
    
    // 每次迁移一个桶
    ht0 := d.ht[0]
    ht1 := d.ht[1]
    
    for ht0.used > 0 && d.rehashidx < int64(ht0.size) {
        // 跳过空桶
        if ht0.table[d.rehashidx] == nil {
            d.rehashidx++
            continue
        }
        
        // 迁移整个链表
        entry := ht0.table[d.rehashidx]
        ht0.table[d.rehashidx] = nil
        
        for entry != nil {
            next := entry.next
            
            // 计算新索引
            hash := d.dictType.HashFunc(entry.key)
            newIndex := hash & ht1.sizemask
            
            // 插入到新表
            entry.next = ht1.table[newIndex]
            ht1.table[newIndex] = entry
            
            ht0.used--
            ht1.used++
            
            entry = next
        }
        
        d.rehashidx++
    }
    
    // 检查是否完成 rehash
    if ht0.used == 0 {
        d.ht[0] = d.ht[1]
        d.ht[1] = nil
        d.rehashidx = -1
    }
}

// 强制完成 rehash
func (d *Dict) rehashAll() {
    for d.isRehashing() {
        d.rehashStep()
    }
}

6. 迭代器实现

// dict/iterator.go
package dict

// 字典迭代器
type DictIterator struct {
    dict    *Dict
    table   int
    index   int64
    entry   *DictEntry
    next    *DictEntry
    safe    bool
}

// 创建迭代器
func (d *Dict) NewIterator(safe bool) *DictIterator {
    d.mu.Lock()
    defer d.mu.Unlock()
    
    if safe {
        d.iterators++
    }
    
    return &DictIterator{
        dict:  d,
        table: 0,
        index: -1,
        safe:  safe,
    }
}

// 获取下一个条目
func (iter *DictIterator) Next() *DictEntry {
    if iter.entry == nil {
        return nil
    }
    
    entry := iter.entry
    iter.entry = iter.next
    
    return entry
}

// 重置迭代器
func (iter *DictIterator) Reset() {
    iter.table = 0
    iter.index = -1
    iter.entry = nil
    iter.next = nil
}

// 关闭迭代器
func (iter *DictIterator) Close() {
    if iter.safe {
        iter.dict.mu.Lock()
        iter.dict.iterators--
        iter.dict.mu.Unlock()
    }
}

// 查找下一个条目
func (iter *DictIterator) findNext() {
    for iter.table < 2 {
        ht := iter.dict.ht[iter.table]
        if ht == nil {
            iter.table++
            iter.index = -1
            continue
        }
        
        for iter.index < int64(ht.size) {
            if ht.table[iter.index] != nil {
                iter.entry = ht.table[iter.index]
                iter.next = iter.entry.next
                return
            }
            iter.index++
        }
        
        iter.table++
        iter.index = -1
    }
    
    iter.entry = nil
    iter.next = nil
}

测试验证

1. 单元测试

// dict/dict_test.go
package dict

import (
    "testing"
)

func TestDictBasic(t *testing.T) {
    dictType := &DictType{
        HashFunc:  defaultHashFunc,
        EqualFunc: func(k1, k2 interface{}) bool { return k1 == k2 },
    }
    
    d := NewDict(dictType)
    
    // 测试设置和获取
    d.Set("key1", "value1")
    d.Set("key2", "value2")
    
    val, exists := d.Get("key1")
    if !exists || val != "value1" {
        t.Errorf("Expected 'value1', got %v", val)
    }
    
    val, exists = d.Get("key2")
    if !exists || val != "value2" {
        t.Errorf("Expected 'value2', got %v", val)
    }
    
    // 测试删除
    if !d.Delete("key1") {
        t.Error("Failed to delete key1")
    }
    
    _, exists = d.Get("key1")
    if exists {
        t.Error("Key1 should not exist after deletion")
    }
    
    // 测试大小
    if d.Size() != 1 {
        t.Errorf("Expected size 1, got %d", d.Size())
    }
}

func TestDictRehash(t *testing.T) {
    dictType := &DictType{
        HashFunc:  defaultHashFunc,
        EqualFunc: func(k1, k2 interface{}) bool { return k1 == k2 },
    }
    
    d := NewDict(dictType)
    
    // 添加大量数据触发 rehash
    for i := 0; i < 1000; i++ {
        key := fmt.Sprintf("key%d", i)
        value := fmt.Sprintf("value%d", i)
        d.Set(key, value)
    }
    
    // 验证数据完整性
    for i := 0; i < 1000; i++ {
        key := fmt.Sprintf("key%d", i)
        expected := fmt.Sprintf("value%d", i)
        
        val, exists := d.Get(key)
        if !exists || val != expected {
            t.Errorf("Expected '%s', got %v", expected, val)
        }
    }
}

func TestDictIterator(t *testing.T) {
    dictType := &DictType{
        HashFunc:  defaultHashFunc,
        EqualFunc: func(k1, k2 interface{}) bool { return k1 == k2 },
    }
    
    d := NewDict(dictType)
    
    // 添加测试数据
    testData := map[string]string{
        "key1": "value1",
        "key2": "value2",
        "key3": "value3",
    }
    
    for k, v := range testData {
        d.Set(k, v)
    }
    
    // 测试迭代器
    iter := d.NewIterator(true)
    defer iter.Close()
    
    count := 0
    for entry := iter.Next(); entry != nil; entry = iter.Next() {
        key := entry.key.(string)
        value := entry.value.(string)
        
        if expected, exists := testData[key]; !exists || value != expected {
            t.Errorf("Expected '%s' for key '%s', got '%s'", expected, key, value)
        }
        count++
    }
    
    if count != len(testData) {
        t.Errorf("Expected %d entries, got %d", len(testData), count)
    }
}

2. 性能基准测试

// dict/benchmark_test.go
package dict

import (
    "testing"
)

func BenchmarkDictSet(b *testing.B) {
    dictType := &DictType{
        HashFunc:  defaultHashFunc,
        EqualFunc: func(k1, k2 interface{}) bool { return k1 == k2 },
    }
    
    d := NewDict(dictType)
    
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        key := fmt.Sprintf("key%d", i)
        value := fmt.Sprintf("value%d", i)
        d.Set(key, value)
    }
}

func BenchmarkDictGet(b *testing.B) {
    dictType := &DictType{
        HashFunc:  defaultHashFunc,
        EqualFunc: func(k1, k2 interface{}) bool { return k1 == k2 },
    }
    
    d := NewDict(dictType)
    
    // 预填充数据
    for i := 0; i < 1000; i++ {
        key := fmt.Sprintf("key%d", i)
        value := fmt.Sprintf("value%d", i)
        d.Set(key, value)
    }
    
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        key := fmt.Sprintf("key%d", i%1000)
        d.Get(key)
    }
}

func BenchmarkDictDelete(b *testing.B) {
    dictType := &DictType{
        HashFunc:  defaultHashFunc,
        EqualFunc: func(k1, k2 interface{}) bool { return k1 == k2 },
    }
    
    d := NewDict(dictType)
    
    // 预填充数据
    for i := 0; i < b.N; i++ {
        key := fmt.Sprintf("key%d", i)
        value := fmt.Sprintf("value%d", i)
        d.Set(key, value)
    }
    
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        key := fmt.Sprintf("key%d", i)
        d.Delete(key)
    }
}

func BenchmarkGoMapSet(b *testing.B) {
    m := make(map[string]string)
    
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        key := fmt.Sprintf("key%d", i)
        value := fmt.Sprintf("value%d", i)
        m[key] = value
    }
}

func BenchmarkGoMapGet(b *testing.B) {
    m := make(map[string]string)
    
    // 预填充数据
    for i := 0; i < 1000; i++ {
        key := fmt.Sprintf("key%d", i)
        value := fmt.Sprintf("value%d", i)
        m[key] = value
    }
    
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        key := fmt.Sprintf("key%d", i%1000)
        _ = m[key]
    }
}

3. 内存使用测试

// dict/memory_test.go
package dict

import (
    "runtime"
    "testing"
)

func TestDictMemoryUsage(t *testing.T) {
    var m1, m2 runtime.MemStats
    runtime.ReadMemStats(&m1)
    
    dictType := &DictType{
        HashFunc:  defaultHashFunc,
        EqualFunc: func(k1, k2 interface{}) bool { return k1 == k2 },
    }
    
    d := NewDict(dictType)
    
    // 添加大量数据
    for i := 0; i < 10000; i++ {
        key := fmt.Sprintf("key%d", i)
        value := fmt.Sprintf("value%d", i)
        d.Set(key, value)
    }
    
    runtime.ReadMemStats(&m2)
    
    heapGrowth := m2.HeapAlloc - m1.HeapAlloc
    t.Logf("Memory usage: %d bytes", heapGrowth)
    
    if heapGrowth > 50*1024*1024 { // 50MB
        t.Errorf("Memory usage too high: %d bytes", heapGrowth)
    }
}

func TestDictRehashMemory(t *testing.T) {
    dictType := &DictType{
        HashFunc:  defaultHashFunc,
        EqualFunc: func(k1, k2 interface{}) bool { return k1 == k2 },
    }
    
    d := NewDict(dictType)
    
    // 添加数据触发 rehash
    for i := 0; i < 1000; i++ {
        key := fmt.Sprintf("key%d", i)
        value := fmt.Sprintf("value%d", i)
        d.Set(key, value)
    }
    
    // 验证 rehash 状态
    if d.isRehashing() {
        t.Log("Dictionary is rehashing")
    }
    
    // 强制完成 rehash
    d.rehashAll()
    
    if d.isRehashing() {
        t.Error("Dictionary should not be rehashing after rehashAll")
    }
}

性能对比分析

1. 操作复杂度对比

操作	平均情况	最坏情况	说明
查找	O(1)	O(n)	哈希冲突时退化为链表
插入	O(1)	O(n)	需要检查重复键
删除	O(1)	O(n)	需要遍历链表
迭代	O(n)	O(n)	需要遍历所有条目

2. 内存使用对比

特性	自定义字典	Go Map	说明
内存开销	较高	较低	需要存储指针和元数据
内存对齐	手动	自动	Go 编译器优化
垃圾回收	手动	自动	需要手动管理内存
内存碎片	可能	较少	Go 内存分配器优化

3. 性能测试结果

// 基准测试结果示例
func BenchmarkComparison(b *testing.B) {
    // 自定义字典性能
    b.Run("CustomDict", func(b *testing.B) {
        dictType := &DictType{
            HashFunc:  defaultHashFunc,
            EqualFunc: func(k1, k2 interface{}) bool { return k1 == k2 },
        }
        d := NewDict(dictType)
        
        for i := 0; i < b.N; i++ {
            key := fmt.Sprintf("key%d", i)
            value := fmt.Sprintf("value%d", i)
            d.Set(key, value)
        }
    })
    
    // Go Map 性能
    b.Run("GoMap", func(b *testing.B) {
        m := make(map[string]string)
        
        for i := 0; i < b.N; i++ {
            key := fmt.Sprintf("key%d", i)
            value := fmt.Sprintf("value%d", i)
            m[key] = value
        }
    })
}