Skip to content

Latest commit

 

History

History
 
 

README.md

学习笔记 第九周

第9周 第17课 | 布隆过滤器和LRU缓存

1.布隆过滤器的实现及应用

布隆过滤器 vs 哈希表(Bloom Filter vs Hash Table)

一个很长的二进制向量和一些列随机映射函数。布隆过滤器可以用于检索一个元素是否在一个集合中。 优缺点 优点是空间效率和查询时间都远远超过一般的算法。 缺点是有一定的误识别率和删除困难。

基本性质

一个元素如果再布隆过滤器中查询结果为不存在,那么该元素一定不存在,如果查询结果为存在,那么这个元素不一定在结果中存在。 由于这个性质,布隆过滤器一般都是放在最外面当一个缓存来使用的,用于快速过滤掉不满足条件的数据。

案例

  1. 比特币网络
  2. 分布式系统(Map-Reduce)—— Hadoop、search engine
  3. Redis缓存
  4. 垃圾邮件、评论等的过滤
Python实现
from bitarray import bitarray
import mmh3
class BloomFilter:
    def __init__(self, size, hash_num):
        self.size = size
        self.hash_num = hash_num
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)
    
    def add(self, s):
        for seed in range(self.hash_num):
            result = mmh3.hash(s, seed) % self.size
            self.bit_array[result] - 1
    
    def lookup(self, s):
        for seed in range(self.hash_num):
            result = mmh3.hash(s, seed) % self.size
            if self.bit_array[result] == 0:
                return "Nope"
        return "Probably"
        
bf = BloomFilter(500000, 7)
bf.add("dantezhao")
print (bf.lookup("dantezhao")
print (bf.lookup("yyj")
Java实现

Java实现 1

public class BloomFilter implements Cloneable {
  private BitSet hashes;
  private RandomInRange prng;
  private int k; // Number of hash functions
  private static final double LN2 = 0.6931471805599453; // ln(2)

  /**
   * Create a new bloom filter.
   * @param n Expected number of elements
   * @param m Desired size of the container in bits
   **/
  public BloomFilter(int n, int m) {
    k = (int) Math.round(LN2 * m / n);
    if (k <= 0) k = 1;
    this.hashes = new BitSet(m);
    this.prng = new RandomInRange(m, k);
  }

  /**
   * Create a bloom filter of 1Mib.
   * @param n Expected number of elements
   **/
  public BloomFilter(int n) {
    this(n, 1024*1024*8);
  }

  /**
  * Add an element to the container
  **/
  public void add(Object o) {
    prng.init(o);
    for (RandomInRange r : prng) hashes.set(r.value);
  }
  /** 
  * If the element is in the container, returns true.
  * If the element is not in the container, returns true with a probability ≈ e^(-ln(2)² * m/n), otherwise false.
  * So, when m is large enough, the return value can be interpreted as:
  *    - true  : the element is probably in the container
  *    - false : the element is definitely not in the container
  **/
  public boolean contains(Object o) {
    prng.init(o);
    for (RandomInRange r : prng)
      if (!hashes.get(r.value))
        return false;
    return true;
  }

  /**
   * Removes all of the elements from this filter.
   **/
  public void clear() {
    hashes.clear();
  }

  /**
   * Create a copy of the current filter
   **/
  public BloomFilter clone() throws CloneNotSupportedException {
    return (BloomFilter) super.clone();
  }

  /**
   * Generate a unique hash representing the filter
   **/
  public int hashCode() {
    return hashes.hashCode() ^ k;
  }

  /**
   * Test if the filters have equal bitsets.
   * WARNING: two filters may contain the same elements, but not be equal
   * (if the filters have different size for example).
   */
  public boolean equals(BloomFilter other) {
    return this.hashes.equals(other.hashes) && this.k == other.k;
  }

  /**
   * Merge another bloom filter into the current one.
   * After this operation, the current bloom filter contains all elements in
   * other.
   **/
  public void merge(BloomFilter other) {
    if (other.k != this.k || other.hashes.size() != this.hashes.size()) {
      throw new IllegalArgumentException("Incompatible bloom filters");
    }
    this.hashes.or(other.hashes);
  }

  private class RandomInRange
      implements Iterable<RandomInRange>, Iterator<RandomInRange> {

    private Random prng;
    private int max; // Maximum value returned + 1
    private int count; // Number of random elements to generate
    private int i = 0; // Number of elements generated
    public int value; // The current value

    RandomInRange(int maximum, int k) {
      max = maximum;
      count = k;
      prng = new Random();
    }
    public void init(Object o) {
      prng.setSeed(o.hashCode());
    }
    public Iterator<RandomInRange> iterator() {
      i = 0;
      return this;
    }
    public RandomInRange next() {
      i++;
      value = prng.nextInt() % max;
      if (value<0) value = -value;
      return this;
    }
    public boolean hasNext() {
      return i < count;
    }
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }
}

Java实现 2 Orestes-Bloomfilter

2.LRU Cache的实现、应用和题解

Cache缓存
  1. 记忆
LRUCache

LRU:(Least Recently Used)最近最少使用

  • 两个要素:大小、替换策略
  • HashTable + DoubleLinkedList
  • 查询:O(1)
  • 修改、更新:O(1)

Python实现

class LRUCache(object):
    
    def __init__(self, capacity):
        self.dic = collections.OrderedDict()
        self.remian = capacity
        
    def get(self, key):
        if key not in self.dic:
            return -1;
        v = self.dic.pop(key)
        self.dic[key] = v
        return v
        
    def put(self, key, value):
        if key in self.dic:
            self.dic.pop(key)
        else:
            if self.remian > 0:
                self.remain -= 1
            else:
                self.dic.popitem(last=False)
        self.dic[key] = value  

Java实现

public class LRUCache {
    private Map<Integer, Integer> map;
    
    public LRUCache(int capacity) {
        map = new LinkedCappedHashMap<>(capacity);
    }
    
    public int get(int key) {
        if (!map.containsKey(key)) return -1;
        return map.get(key);
    }
    
    public void put(int key, int value) {
        map.put(key, value);
    }
    
    private static class LinkedCappedHashMap<K, V> extends LinkedHashMap<K, V> {
        int maximumCapacity;
        
        LinkedCappedHashMap(int maximumCapacity) {
            super(16, 0.75f, true);
            this.maximumCapacity = maximumCapacity;
        }
        
        protected boolean removeEldestEntry(Map.Entry eldest) {
            return size() > maximumCapacity;
        }
    }
    
}

第9周 第18课 | 排序算法

1.初级排序和高级排序的实现和特性

排序算法

  1. 比较类排序 通过比较来决定元素之间的相对次序,由于其时间复杂度不能突破O(nlogn),因此也成为非线性时间比较类排序。

    • 交换排序
      • 冒泡排序
      • 快速排序
    • 插入排序
      • 简单插入排序
      • 希尔排序
    • 选择排序
      • 简单选择排序
      • 堆排序
    • 归并排序
      • 二路归并排序
      • 多路归并排序
  2. 非比较类排序 不通过比较来决定元素的相对次序,它可以突破基于比较排序的时间下界,以线性时间运行,因此也称为线性时间非比较类排序。

    • 计数排序
    • 桶排序
    • 基数排序

重点: 堆排序,快速排序,归并排序

初级排序-O(n^2)

  1. 选择排序(Selection Sort) 每次找最小值,然后放到待排序数组的起始位置
  2. 插入排序(Insertion Sort) 从前到后逐步构建有序序列:对于未排序数据,在已排序序列中从后向前扫描,找到相应位置并插入。
  3. 冒泡排序(Bubble Sort) 嵌套循环,每次查看相邻的元素如果逆序,则交换。

高级排序-O(nlogn)

  • 快速排序(Quick Sort) 数组取标杆pivot,将小元素放在pivot左边,大元素放右侧,然后依次对右边和右边的子数组继续快排;已达到整个序列有序。 快速排序Java代码:
public static void quickSort(int[] array, int begin, int end) {
    if (end < begin) return;
    int pivot = partition(array, begin, end);
    quickSort(array, begin, pivot - 1);
    quickSort(array, pivot + 1, end);
}

static int partition(int[] a, int begin, int end) {
    // pivot:标杆位置,counter:小于pivot的元素的个数
    int pivot = end, counter = begin;
    for(int i = begin; i < end; i++) {
        if (a[i] < a[pivot]) {
            int temp = a[counter];
            a[counter] = a[i];
            a[i] = temp;
            counter++
        }    
    }
    int temp = a[pivot];
    a[pivot] = a[counter];
    a[counter] = temp;
    return counter;
}
  • 归并排序(Merge Sort) ——分治
  1. 把长度为n的输入序列分成两个长度为n/2的子序列;
  2. 对这两个子序列分别采用归并排序;
  3. 将两个排序好的子序列合并成一个最终的排序序列。 归并排序Java代码
    public static void mergeSort(int[] array, int left, int right) {
        if (right <= left) return;
        int mid = (left + right) >> 1; // (left + right) / 2
        
        mergeSort(array, left, mid);
        mergeSort(array, mid + 1, right);
        merge(array, left, mid, right);
    }
    public static void merge(int[] arr, int left, int mid, int right) {
        int[] temp = new int[right - left + 1]; // 中间数组
        int i = left, j = mid + 1, k = 0;
        while (i <= mid && j <= right) {
            temp[k++] = arr[i] < arr[j] ? arr[i++] : arr[j++];
        }
        
        while(i <= mid) temp[k++] = arr[i++];
        while(j <= right) temp[k++] = arr[j++];
        
        for(int p = 0; p < temp.length; p++) {
            arr[left + p] = temp[p];
        }
        // 也可以用 System.arraycopy(a, start1, b, start2, length);
    }

归并和快排具有相似性,但步骤顺序相反

归并:先排序左右子数组,然后合并两个有序子数组 快排:先调配出左右子数组,然后对于左右子数组进行排序

  • 堆排序(Heap Sort) —— 堆插入O(logN), 取最大/小值 O(1)
  1. 数组元素依次建立小顶堆
  2. 依次取堆顶元素,并删除
void heap_sort(int a[], int len) {
    priority_queue<int, vector<int>, greater<int>> q;
    for(int i = 0; i < len; i++) {
        q.push(a[i]);
    }
    for(int i = 0; i < len; i++) {
        a[i] = q.pop();
    }
    
}