Rust每周一库: LRU

今天我们来学习lru库。

LRU置换算法是一种很常用的缓存淘汰算法，称作最近最少使用（Least Recently Used）算法。缓存的大小是有限的，一旦放入对象的时候超过了缓存的容量，需要根据一个算法剔除一些对象，LRU就是一种剔除算法，它是把最近很少使用的对象剔除出去，也就是剔除最久没有访问的对象。

rust的这个库自2016开发，现在已经是0.1.17版本了，作者还在维护着。

这个库提供了丰富的方法，所以使用起来也非常的方便：

use lru::LruCache;
fn main() {
    let mut cache = LruCache::new(2);
    cache.put("apple", 3);
    cache.put("banana", 2);
    assert_eq!(*cache.get(&"apple").unwrap(), 3);
    assert_eq!(*cache.get(&"banana").unwrap(), 2);
    assert!(cache.get(&"pear").is_none());
    assert_eq!(cache.put("banana", 4), Some(2));
    assert_eq!(cache.put("pear", 5), None);
    assert_eq!(*cache.get(&"pear").unwrap(), 5);
    assert_eq!(*cache.get(&"banana").unwrap(), 4);
    assert!(cache.get(&"apple").is_none());
    {
        let v = cache.get_mut(&"banana").unwrap();
        *v = 6;
    }
    assert_eq!(*cache.get(&"banana").unwrap(), 6);
}

这个库很大程序是借鉴了rust早期标准库实现的一个std::collections::lru_cache::LruCache。

内部实现上，它使用hashbrown::HashMap存储自定义的键值对,并且实现了一个双向链表维护访问顺序。

hashbrown::HashMap是基于google的SwissTable, 它也是rust标准库1.36之后的标准库HashMap的实现，具体可以看这篇文章的介绍:。

这个库的设计清爽清晰清奇，代码量不多，很适合学习Rust编程。

这个库有几个点值的我们学习。

no-std和std的支持
Borrow trait的实现
raw pointer的广泛使用
迭代器的实现
返回 &T 和 &mut T的不同实现
PhantomData的使用
core::mem的使用

以下是我学习这个库的一些注释，方便理解它的方法的实现。

#![no_std]
#![cfg_attr(feature = "nightly", feature(alloc, optin_builtin_traits))]
// 支持`no-std` https://rust-embedded.github.io/book/intro/no-std.html
// https://os.phil-opp.com/freestanding-rust-binary/
// Google's high-performance SwissTable hash map
// 自Rust 1.36, 标准库HashMap也是同样实现.
// https://blog.waffles.space/2018/12/07/deep-dive-into-hashbrown/
extern crate hashbrown;
#[cfg(test)]
extern crate scoped_threadpool;
// 如果不使用nightly feature, 使用标准库std
#[cfg(not(feature = "nightly"))]
extern crate std as alloc;
// 提供Borrow的功能，Borrow可以通过方法返回对象的借用。
use alloc::borrow::Borrow;
use alloc::boxed::Box;
use core::hash::{BuildHasher, Hash, Hasher};
use core::iter::FusedIterator;
use core::marker::PhantomData;
use core::mem;
use core::mem::MaybeUninit;
use core::ptr;
use core::usize;
use hashbrown::hash_map::DefaultHashBuilder;
use hashbrown::HashMap;
#[cfg(test)]
#[macro_use]
extern crate std;
#[cfg(feature = "nightly")]
extern crate alloc;
// 这个struct以raw pointer的方式持有key的引用。
// 以raw pointer存储的好处是我们突破rust借用规则，在实现cache的方法的时候很有用。
#[doc(hidden)]
pub struct KeyRef<K> {
    k: *const K,
}
// 为KeyRef实现Hash、PartialEq和Eq trait，因为要用它做HashMap的key。
impl<K: Hash> Hash for KeyRef<K> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        unsafe { (*self.k).hash(state) }
    }
}
impl<K: PartialEq> PartialEq for KeyRef<K> {
    fn eq(&self, other: &KeyRef<K>) -> bool {
        unsafe { (*self.k).eq(&*other.k) }
    }
}
impl<K: Eq> Eq for KeyRef<K> {}
// 为KeyRef实现Borrow trait。
// no-std情况下只为K实现了Borrow trait情况提供Borrow trait。
// std情况下返回K的借用。
#[cfg(feature = "nightly")]
#[doc(hidden)]
pub auto trait NotKeyRef {}
#[cfg(feature = "nightly")]
impl<K> !NotKeyRef for KeyRef<K> {}
#[cfg(feature = "nightly")]
impl<K, D> Borrow<D> for KeyRef<K>
where
    K: Borrow<D>,
    D: NotKeyRef + ?Sized,
{
    fn borrow(&self) -> &D {
        unsafe { (&*self.k) }.borrow()
    }
}
#[cfg(not(feature = "nightly"))]
impl<K> Borrow<K> for KeyRef<K> {
    fn borrow(&self) -> &K {
        unsafe { (&*self.k) }
    }
}
// 这个struct持有键值对，并且它还是双向链表中的一个节点，所有包含前一个节点和后一个节点的引用。
struct LruEntry<K, V> {
    key: K,
    val: V,
    prev: *mut LruEntry<K, V>,
    next: *mut LruEntry<K, V>,
}
// 可以学习下null raw pointer的生成方法。
impl<K, V> LruEntry<K, V> {
    fn new(key: K, val: V) -> Self {
        LruEntry {
            key,
            val,
            prev: ptr::null_mut(),
            next: ptr::null_mut(),
        }
    }
}
// LRU Cache的定义。
// map保存键值对，它的键和值的类型都是自定义的类型KeyRef和Box<LruEntry>。
// 链表的head和tail， 它使用辅助的head和tail作为链表的头尾.
pub struct LruCache<K, V, S = DefaultHashBuilder> {
    map: HashMap<KeyRef<K>, Box<LruEntry<K, V>>, S>,
    cap: usize,
    // head and tail are sigil nodes to faciliate inserting entries
    head: *mut LruEntry<K, V>,
    tail: *mut LruEntry<K, V>,
}
impl<K: Hash + Eq, V> LruCache<K, V> {
    // 生成容量为cap的LRU
    pub fn new(cap: usize) -> LruCache<K, V> {
        LruCache::construct(cap, HashMap::with_capacity(cap))
    }
    // 生成一个不会自动剔除的LRU。
    // 事实上是把cap设置为usize的最大值。
    pub fn unbounded() -> LruCache<K, V> {
        LruCache::construct(usize::MAX, HashMap::default())
    }
}
// 上面的方法使用DefaultHashBuilder, 下面的方法可以指定HashBuilder。
impl<K: Hash + Eq, V, S: BuildHasher> LruCache<K, V, S> {
    // 指定hasher。
    pub fn with_hasher(cap: usize, hash_builder: S) -> LruCache<K, V, S> {
        LruCache::construct(cap, HashMap::with_capacity_and_hasher(cap, hash_builder))
    }
    /// 真正的构造器。
    fn construct(cap: usize, map: HashMap<KeyRef<K>, Box<LruEntry<K, V>>, S>) -> LruCache<K, V, S> {
        // NB: The compiler warns that cache does not need to be marked as mutable if we
        // declare it as such since we only mutate it inside the unsafe block.
        // Box::into_raw把智能指针转换成raw指针，并且LRUCache负责释放对应的内存。
        // 释放内存最容易的方式是使用from_raw，把raw pointer再转成Box。
        let cache = LruCache {
            map,
            cap,
            head: unsafe {
                Box::into_raw(Box::new(
                    MaybeUninit::<LruEntry<K, V>>::uninit().assume_init(),
                ))
            },
            tail: unsafe {
                Box::into_raw(Box::new(
                    MaybeUninit::<LruEntry<K, V>>::uninit().assume_init(),
                ))
            },
        };
        // 初始化连接，头尾互指。
        unsafe {
            (*cache.head).next = cache.tail;
            (*cache.tail).prev = cache.head;
        }
        cache
    }
    // 增加一个键值。
    // 如果键已经存在缓存中，那么它会更新键的值，并且返回老的值，否则返回 None。
    pub fn put(&mut self, k: K, mut v: V) -> Option<V> {
        // 在map中查找这个节点,注意返回 *mut LruEntry<K, V>
        let node_ptr = self.map.get_mut(&KeyRef { k: &k }).map(|node| {
            let node_ptr: *mut LruEntry<K, V> = &mut **node;
            node_ptr
        });
        match node_ptr {
            Some(node_ptr) => {
                // 如果节点存在，则使用mem::swap更新它的值，然后refresh it,
                // 返回 v, v已经被替换为老的值
                unsafe { mem::swap(&mut v, &mut (*node_ptr).val) }
                self.detach(node_ptr);
                self.attach(node_ptr);
                Some(v)
            }
            None => {
                let mut node = if self.len() == self.cap() {
                    // 已经满了，需要移除最后一条记录清理空间
                    // 通过尾指针得到最后一个节点的key。
                    let old_key = KeyRef {
                        k: unsafe { &(*(*self.tail).prev).key },
                    };
                    // 从map删除老节点
                    let mut old_node = self.map.remove(&old_key).unwrap();
                    // 重用老节点
                    old_node.key = k;
                    old_node.val = v;
                    // 从智能指针转换成可修改的raw pointer
                    // 这里使用 `&mut *` 得到raw pointer 而不是 Box::into_raw，是因为下面还要用old_node
                    let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
                    self.detach(node_ptr);
                    old_node
                } else {
                    // 有容量则新建一个智能指针Box
                    Box::new(LruEntry::new(k, v))
                };
                // 转换成可修改的raw pointer, 方便增加到链表中
                let node_ptr: *mut LruEntry<K, V> = &mut *node;
                self.attach(node_ptr);
                // 将这个新值插入到map中
                let keyref = unsafe { &(*node_ptr).key };
                self.map.insert(KeyRef { k: keyref }, node);
                None
            }
        }
    }
    // 给定k的引用，返回k对应的值的引用。
    // 同时会刷新这个节点(把节点放在链表的头部)
    pub fn get<'a, Q>(&'a mut self, k: &Q) -> Option<&'a V>
    where
        KeyRef<K>: Borrow<Q>,
        Q: Hash + Eq + ?Sized,
    {
        let (node_ptr, value) = match self.map.get_mut(k) {
            None => (None, None),
            Some(node) => {
                // 解引用，解引用，转成可修改的raw pointer
                let node_ptr: *mut LruEntry<K, V> = &mut **node;
                // 为了得到val值，直接 &node.val 是不行的，因为我们还需要node刷新链表
                (Some(node_ptr), Some(unsafe { &(*node_ptr).val }))
            }
        };
        match node_ptr {
            None => (),
            Some(node_ptr) => {
                // 刷新
                self.detach(node_ptr);
                self.attach(node_ptr);
            }
        }
        value
    }
    // 返回一个可修改的值的引用, 类似get方法
    pub fn get_mut<'a>(&'a mut self, k: &K) -> Option<&'a mut V> {
        let key = KeyRef { k };
        let (node_ptr, value) = match self.map.get_mut(&key) {
            None => (None, None),
            Some(node) => {
                let node_ptr: *mut LruEntry<K, V> = &mut **node;
                (Some(node_ptr), Some(unsafe { &mut (*node_ptr).val }))
            }
        };
        match node_ptr {
            None => (),
            Some(node_ptr) => {
                self.detach(node_ptr);
                self.attach(node_ptr);
            }
        }
        value
    }
    // 类似get, 但不会刷新链表，所以实现起来简单
    pub fn peek<'a>(&'a self, k: &K) -> Option<&'a V> {
        let key = KeyRef { k };
        match self.map.get(&key) {
            None => None,
            Some(node) => Some(&node.val),
        }
    }
    // 这里是不是应该还有个peek_mut 才对称？
    pub fn peek_mut<'a>(&'a mut self, k: &K) -> Option<&'a mut V> {
        let key = KeyRef { k };
        match self.map.get_mut(&key) {
            None => None,
            Some(node) => Some(&mut node.val),
        }
    }
    // 返回最后一个节点的键值,已经是最后一个了，不需要刷新
    pub fn peek_lru<'a>(&'a self) -> Option<(&'a K, &'a V)> {
        if self.len() == 0 {
            return None;
        }
        let (key, val);
        unsafe {
            let node = (*self.tail).prev;
            key = &(*node).key;
            val = &(*node).val;
        }
        Some((key, val))
    }
    // 是否包含Key, 直接用map判断即可
    pub fn contains(&self, k: &K) -> bool {
        let key = KeyRef { k };
        self.map.contains_key(&key)
    }
    // pop 会移除一个键值，如果它存在的话
    // 事实上这是remove方法。
    // 既然从cache中删除了，那么可以返回V， 而不必是引用，因为cache不再用它了
    pub fn pop(&mut self, k: &K) -> Option<V> {
        let key = KeyRef { k };
        match self.map.remove(&key) {
            None => None,
            Some(mut old_node) => {
                let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
                // 从链表中删除
                self.detach(node_ptr);
                Some(old_node.val)
            }
        }
    }
    // 移除最后一个节点
    pub fn pop_lru(&mut self) -> Option<(K, V)> {
        let node = self.remove_last()?;
        // 不能直接解构 destructure, 原因 https://github.com/rust-lang/rust/issues/28536
        let node = *node;
        let LruEntry { key, val, .. } = node;
        Some((key, val))
    }
    // 长度，没什么好说的
    pub fn len(&self) -> usize {
        self.map.len()
    }
    // 是否为空，没什么好说的
    pub fn is_empty(&self) -> bool {
        self.map.len() == 0
    }
    // 容量，没什么好说的
    pub fn cap(&self) -> usize {
        self.cap
    }
    // 扩缩容。
    // 扩容简单，cap设成需要的值即可。
    // 缩容可能需要清理数据，通过remove_last方法一个一个清理，直到满足容量要求。
    pub fn resize(&mut self, cap: usize) {
        // return early if capacity doesn't change
        if cap == self.cap {
            return;
        }
        while self.map.len() > cap {
            self.remove_last();
        }
        self.map.shrink_to_fit();
        self.cap = cap;
    }
    // 清空数据。通过逐个删除的方式清除数据。
    // 这里是不是有更简便的办法，通过重新初始化完成清空操作(map的raw pointer需要逐个drop?)
    pub fn clear(&mut self) {
        loop {
            match self.remove_last() {
                Some(_) => (),
                None => break,
            }
        }
    }
    // 返回一个迭代器，包含长度、当前指针、结束指针，辅助字段PhantomData
    // 迭代器元素的类型是 `(&'a K, &'a V)`
    pub fn iter<'a>(&'a self) -> Iter<'a, K, V> {
        Iter {
            len: self.len(),
            ptr: unsafe { (*self.head).next },
            end: unsafe { (*self.tail).prev },
            phantom: PhantomData,
        }
    }
    // 返回一个可修改迭代器，包含长度、当前指针、结束指针，辅助字段PhantomData
    // 迭代器元素的类型是 `(&'a K, &'a mut V)`, 值可以修改。
    pub fn iter_mut<'a>(&'a mut self) -> IterMut<'a, K, V> {
        IterMut {
            len: self.len(),
            ptr: unsafe { (*self.head).next },
            end: unsafe { (*self.tail).prev },
            phantom: PhantomData,
        }
    }
    // 移除最后一个元素
    fn remove_last(&mut self) -> Option<Box<LruEntry<K, V>>> {
        let prev;
        unsafe { prev = (*self.tail).prev }
        // 如果非空
        if prev != self.head {
            let old_key = KeyRef {
                k: unsafe { &(*(*self.tail).prev).key },
            };
            // 从map中移除
            let mut old_node = self.map.remove(&old_key).unwrap();
            // 从链表中删除
            let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
            self.detach(node_ptr);
            // 返回智能指针
            Some(old_node)
        } else {
            None
        }
    }
    // 从链表中删除
    fn detach(&mut self, node: *mut LruEntry<K, V>) {
        unsafe {
            (*(*node).prev).next = (*node).next;
            (*(*node).next).prev = (*node).prev;
        }
    }
    // 加到链表的末尾
    fn attach(&mut self, node: *mut LruEntry<K, V>) {
        unsafe {
            (*node).next = (*self.head).next;
            (*node).prev = self.head;
            (*self.head).next = node;
            (*(*node).next).prev = node;
        }
    }
}
// 实现Drop
impl<K, V, S> Drop for LruCache<K, V, S> {
    fn drop(&mut self) {
        // 防止编译器尝试在head和tail中删除未初始化的字段key和val
        unsafe {
            let head = *Box::from_raw(self.head);
            let tail = *Box::from_raw(self.tail);
            let LruEntry {
                key: head_key,
                val: head_val,
                ..
            } = head;
            let LruEntry {
                key: tail_key,
                val: tail_val,
                ..
            } = tail;
            mem::forget(head_key);
            mem::forget(head_val);
            mem::forget(tail_key);
            mem::forget(tail_val);
        }
    }
}
// 传换成迭代器
impl<'a, K: Hash + Eq, V, S: BuildHasher> IntoIterator for &'a LruCache<K, V, S> {
    type Item = (&'a K, &'a V);
    type IntoIter = Iter<'a, K, V>;
    fn into_iter(self) -> Iter<'a, K, V> {
        self.iter()
    }
}
impl<'a, K: Hash + Eq, V, S: BuildHasher> IntoIterator for &'a mut LruCache<K, V, S> {
    type Item = (&'a K, &'a mut V);
    type IntoIter = IterMut<'a, K, V>;
    fn into_iter(self) -> IterMut<'a, K, V> {
        self.iter_mut()
    }
}
// 因为包含raw pointer, 所以编译器不会自动为LruCache实现`Send` 和 `Sync`。
// 但是LruCache安全的封装了raw pointer,所以我们为它实现了`Send`和`Sync`。
unsafe impl<K: Send, V: Send> Send for LruCache<K, V> {}
unsafe impl<K: Sync, V: Sync> Sync for LruCache<K, V> {}
// 迭代器的实现。
pub struct Iter<'a, K: 'a, V: 'a> {
    len: usize,
    ptr: *const LruEntry<K, V>,
    end: *const LruEntry<K, V>,
    phantom: PhantomData<&'a K>,
}
impl<'a, K, V> Iterator for Iter<'a, K, V> {
    type Item = (&'a K, &'a V);
    fn next(&mut self) -> Option<(&'a K, &'a V)> {
        if self.len == 0 {
            return None;
        }
        let key = unsafe { &(*self.ptr).key };
        let val = unsafe { &(*self.ptr).val };
        self.len -= 1;
        self.ptr = unsafe { (*self.ptr).next };
        Some((key, val))
    }
    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.len, Some(self.len))
    }
    fn count(self) -> usize {
        self.len
    }
}
// 从尾端也可以迭代
impl<'a, K, V> DoubleEndedIterator for Iter<'a, K, V> {
    fn next_back(&mut self) -> Option<(&'a K, &'a V)> {
        if self.len == 0 {
            return None;
        }
        let key = unsafe { &(*self.end).key };
        let val = unsafe { &(*self.end).val };
        self.len -= 1;
        self.end = unsafe { (*self.end).prev };
        Some((key, val))
    }
}
// 因为我们有确切的数量，所以可以实现 ExactSizeIterator
impl<'a, K, V> ExactSizeIterator for Iter<'a, K, V> {}
// 返回None之后总是返回None
impl<'a, K, V> FusedIterator for Iter<'a, K, V> {}
impl<'a, K, V> Clone for Iter<'a, K, V> {
    fn clone(&self) -> Iter<'a, K, V> {
        Iter {
            len: self.len,
            ptr: self.ptr,
            end: self.end,
            phantom: PhantomData,
        }
    }
}
// 迭代器也可以实现`Send`和`Sync`。
unsafe impl<'a, K: Send, V: Send> Send for Iter<'a, K, V> {}
unsafe impl<'a, K: Sync, V: Sync> Sync for Iter<'a, K, V> {}
// 同样值可以修改的迭代器
pub struct IterMut<'a, K: 'a, V: 'a> {
    len: usize,
    ptr: *mut LruEntry<K, V>,
    end: *mut LruEntry<K, V>,
    phantom: PhantomData<&'a K>,
}
impl<'a, K, V> Iterator for IterMut<'a, K, V> {
    type Item = (&'a K, &'a mut V);
    fn next(&mut self) -> Option<(&'a K, &'a mut V)> {
        if self.len == 0 {
            return None;
        }
        let key = unsafe { &(*self.ptr).key };
        let val = unsafe { &mut (*self.ptr).val };
        self.len -= 1;
        self.ptr = unsafe { (*self.ptr).next };
        Some((key, val))
    }
    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.len, Some(self.len))
    }
    fn count(self) -> usize {
        self.len
    }
}
impl<'a, K, V> DoubleEndedIterator for IterMut<'a, K, V> {
    fn next_back(&mut self) -> Option<(&'a K, &'a mut V)> {
        if self.len == 0 {
            return None;
        }
        let key = unsafe { &(*self.end).key };
        let val = unsafe { &mut (*self.end).val };
        self.len -= 1;
        self.end = unsafe { (*self.end).prev };
        Some((key, val))
    }
}
impl<'a, K, V> ExactSizeIterator for IterMut<'a, K, V> {}
impl<'a, K, V> FusedIterator for IterMut<'a, K, V> {}
unsafe impl<'a, K: Send, V: Send> Send for IterMut<'a, K, V> {}
unsafe impl<'a, K: Sync, V: Sync> Sync for IterMut<'a, K, V> {}

鸟窝

《深入理解Go并发编程》新书发售中。一书在手，并发无忧

Rust每周一库: LRU