|
1 | | -学习笔记 |
| 1 | +## 理论 |
| 2 | +####hashmap我认为是对于散列表的工程性实现,散列表结构如下 |
| 3 | + |
| 4 | + |
| 5 | + |
| 6 | +通过这个例子,我们可以总结出这样的规律:散列表用的就是数组支持按照下标随机访问的时候,时间复杂度是 O(1) 的特性。我们通过散列函数把元素的键值映射为下标,然后将数据存储在数组中对应下标的位置。当我们按照键值查询元素时,我们用同样的散列函数,将键值转化数组下标,从对应的数组下标的位置取数据。 |
| 7 | + |
| 8 | + 由于散列函数,有时候会出现hash出现相同值的情况,然后就会出现散列冲突(开放寻址和链表法),hashmap使用了链表法,方法如下 |
| 9 | + |
| 10 | +主要是通过hash函数,计算bucket,然后放入slot中。 |
| 11 | + |
| 12 | +以上,就是从概念层面对于hashmap的理解 |
| 13 | + |
| 14 | +但是其中会有几个问题: |
| 15 | + |
| 16 | +- 1.如何设计相对好的散列函数? |
| 17 | + |
| 18 | +- 2.随着元素增多,bucket是有限的,如何防止散列冲突导致的性能下降? |
| 19 | + |
| 20 | +- 3.上个问题的延伸,随着元素的增多,必然会扩容bucket,如何实现高效扩容和阈值判断呢? |
| 21 | + |
| 22 | + |
| 23 | + |
| 24 | + |
| 25 | + |
| 26 | +##带着上面的三个问题,看下java的hashmap代码 |
| 27 | + |
| 28 | +#### 1、我们先看下hashmap的散列函数 |
| 29 | + |
| 30 | +```java |
| 31 | +/** |
| 32 | + * Computes key.hashCode() and spreads (XORs) higher bits of hash |
| 33 | + * to lower. Because the table uses power-of-two masking, sets of |
| 34 | + * hashes that vary only in bits above the current mask will |
| 35 | + * always collide. (Among known examples are sets of Float keys |
| 36 | + * holding consecutive whole numbers in small tables.) So we |
| 37 | + * apply a transform that spreads the impact of higher bits |
| 38 | + * downward. There is a tradeoff between speed, utility, and |
| 39 | + * quality of bit-spreading. Because many common sets of hashes |
| 40 | + * are already reasonably distributed (so don't benefit from |
| 41 | + * spreading), and because we use trees to handle large sets of |
| 42 | + * collisions in bins, we just XOR some shifted bits in the |
| 43 | + * cheapest possible way to reduce systematic lossage, as well as |
| 44 | + * to incorporate impact of the highest bits that would otherwise |
| 45 | + * never be used in index calculations because of table bounds. |
| 46 | + */ |
| 47 | +static final int hash(Object key) { |
| 48 | + int h; |
| 49 | + return (key == null) ? 0 : (h = key.hashCode()) ^ (h >>> 16); |
| 50 | +} |
| 51 | +``` |
| 52 | + |
| 53 | +首先取key的hashCode然后和hashcode右移16位(高位用0补全)然后进行异或 |
| 54 | + |
| 55 | +#### 2、put函数 |
| 56 | + |
| 57 | + |
| 58 | + |
| 59 | + |
| 60 | + |
| 61 | +```java |
| 62 | +final V putVal(int hash, K key, V value, boolean onlyIfAbsent, |
| 63 | + boolean evict) { |
| 64 | + Node<K,V>[] tab; Node<K,V> p; int n, i; |
| 65 | + //首先把table赋值给tab 检查是不是null 如果是null 那么初始化一下(resize) |
| 66 | + //饭后返回table的长度给n |
| 67 | + if ((tab = table) == null || (n = tab.length) == 0) |
| 68 | + n = (tab = resize()).length; |
| 69 | + //(n-1)&hash作为桶的标号 |
| 70 | + //我们知道桶的数量是 2^n |
| 71 | + //2^n - 1 在二进制中表示就是全是1 按位与就比较平均的分配桶了 |
| 72 | + //但是为什么不直接用hashcode作为hash呢? |
| 73 | + //由于使用桶数量-1作为掩码,那么注定高位的hashcode无法参与运算 |
| 74 | + //我们把高16位与低16使用xor运算,让高位参与hash,减少一定的冲突 |
| 75 | + //但是如果 桶的数量不是 2^n 那么这么做hash就不那么平均了 |
| 76 | + //在这里判断是不是有节点,如果没有,那么证明这里没有Hash冲突直接创建一个节点放这里就好了 |
| 77 | + if ((p = tab[i = (n - 1) & hash]) == null){ |
| 78 | + //在这里创建一个节点赋值给第i个桶 |
| 79 | + tab[i] = newNode(hash, key, value, null); |
| 80 | + } |
| 81 | + else { |
| 82 | + //这里证明有Hash冲突了 |
| 83 | + Node<K,V> e; K k; |
| 84 | + //判断当前的hash是不是相等,然后判断key相等 |
| 85 | + //因为key是个对象,相对来讲equals运算时间更长,先比较hash,如果不相等就不用equals比较了 |
| 86 | + //如果是相同的键,那么赋值给一个临时引用,先不覆盖 |
| 87 | + if (p.hash == hash && |
| 88 | + ((k = p.key) == key || (key != null && key.equals(k)))) |
| 89 | + e = p; |
| 90 | + //这里判断当前桶里的头结点是不是TreeNode,如果是TreeNode,那就把这个节点插到红黑树里面 |
| 91 | + else if (p instanceof TreeNode) |
| 92 | + e = ((TreeNode<K,V>)p).putTreeVal(this, tab, hash, key, value); |
| 93 | + else { |
| 94 | + //如果不是Tree节点,那么说明现在还是个链表 |
| 95 | + //开始遍历 |
| 96 | + for (int binCount = 0; ; ++binCount) { |
| 97 | + //判断是不是尾节点 |
| 98 | + if ((e = p.next) == null) { |
| 99 | + //是尾节点那就直接插后面 |
| 100 | + p.next = newNode(hash, key, value, null); |
| 101 | + //判断链表多长 |
| 102 | + //如果超过阈值,那就把链表变成二叉树 |
| 103 | + if (binCount >= TREEIFY_THRESHOLD - 1) // -1 for 1st |
| 104 | + treeifyBin(tab, hash); |
| 105 | + break; |
| 106 | + } |
| 107 | + //hash相等之后在进行equals判断如果相等,证明这个已经key已经存在了break |
| 108 | + if (e.hash == hash && |
| 109 | + ((k = e.key) == key || (key != null && key.equals(k)))) |
| 110 | + break; |
| 111 | + p = e; |
| 112 | + } |
| 113 | + } |
| 114 | + //e是key所在的节点如果是null那么证明是新插的,否则覆盖引用 |
| 115 | + if (e != null) { // existing mapping for key |
| 116 | + V oldValue = e.value; |
| 117 | + if (!onlyIfAbsent || oldValue == null) |
| 118 | + //如果旧值是null或onlyIfAbsent==0那么覆盖值然后直接返回就可以了 |
| 119 | + //因为并没有改变table结构或者链表什么的 |
| 120 | + e.value = value; |
| 121 | + //这个是空的,需要子类覆盖一下 |
| 122 | + //但是绝不能改变table结构什么的 |
| 123 | + afterNodeAccess(e); |
| 124 | + return oldValue; |
| 125 | + } |
| 126 | + } |
| 127 | + //更改迭代器 |
| 128 | + ++modCount; |
| 129 | + //大于阈值 扩容 |
| 130 | + if (++size > threshold) |
| 131 | + resize(); |
| 132 | + afterNodeInsertion(evict); |
| 133 | + return null; |
| 134 | +} |
| 135 | +``` |
| 136 | + |
| 137 | + |
| 138 | + |
| 139 | + |
| 140 | + |
| 141 | +#### 3.扩容 |
| 142 | + |
| 143 | +```java |
| 144 | +/** |
| 145 | + * Initializes or doubles table size. If null, allocates in |
| 146 | + * accord with initial capacity target held in field threshold. |
| 147 | + * Otherwise, because we are using power-of-two expansion, the |
| 148 | + * elements from each bin must either stay at same index, or move |
| 149 | + * with a power of two offset in the new table. |
| 150 | + * |
| 151 | + * @return the table |
| 152 | + */ |
| 153 | + final Node<K,V>[] resize() { |
| 154 | + Node<K,V>[] oldTab = table; |
| 155 | + //如果是null那么说明是新创建的 |
| 156 | + //下面的几个变量,Cap桶容量,Thr阈值 |
| 157 | + int oldCap = (oldTab == null) ? 0 : oldTab.length; |
| 158 | + int oldThr = threshold; |
| 159 | + int newCap, newThr = 0; |
| 160 | + //先判断旧桶有没有 |
| 161 | + if (oldCap > 0) { |
| 162 | + //如果旧的长度大于最大阈值,那么把桶扩大到最大Integer.MAX,然后不会发生resize了 |
| 163 | + if (oldCap >= MAXIMUM_CAPACITY) { |
| 164 | + threshold = Integer.MAX_VALUE; |
| 165 | + return oldTab; |
| 166 | + } |
| 167 | + //桶容量*2如果比最大容量小以及旧的大于默认桶容量 |
| 168 | + else if ((newCap = oldCap << 1) < MAXIMUM_CAPACITY && |
| 169 | + oldCap >= DEFAULT_INITIAL_CAPACITY) |
| 170 | + newThr = oldThr << 1; // double threshold |
| 171 | + } |
| 172 | + //旧的没桶数量 但是有阈值 说明第一次创建 |
| 173 | + else if (oldThr > 0) // initial capacity was placed in threshold |
| 174 | + newCap = oldThr; |
| 175 | + else { // zero initial threshold signifies using defaults |
| 176 | + //没有阈值,没有旧桶容量,说明默认初始化的 |
| 177 | + newCap = DEFAULT_INITIAL_CAPACITY; |
| 178 | + newThr = (int)(DEFAULT_LOAD_FACTOR * DEFAULT_INITIAL_CAPACITY); |
| 179 | + } |
| 180 | + if (newThr == 0) { |
| 181 | + //新阈值 为0 说明需要计算阈值 |
| 182 | + float ft = (float)newCap * loadFactor; |
| 183 | + //cap*factor但是如果太大,那么就扩大到Integer.MAX_VALUE |
| 184 | + newThr = (newCap < MAXIMUM_CAPACITY && ft < (float)MAXIMUM_CAPACITY ? |
| 185 | + (int)ft : Integer.MAX_VALUE); |
| 186 | + } |
| 187 | + threshold = newThr; |
| 188 | + @SuppressWarnings({"rawtypes","unchecked"}) |
| 189 | + //按照计算好的桶的数量扩容 |
| 190 | + Node<K,V>[] newTab = (Node<K,V>[])new Node[newCap]; |
| 191 | + table = newTab; |
| 192 | + if (oldTab != null) { |
| 193 | + //把旧的桶所有东西放在新桶里面 |
| 194 | + for (int j = 0; j < oldCap; ++j) { |
| 195 | + Node<K,V> e; |
| 196 | + if ((e = oldTab[j]) != null) { |
| 197 | + //先清空旧的桶那个位置 |
| 198 | + oldTab[j] = null; |
| 199 | + //如果只有一个元素 直接放在新桶对应的位置 |
| 200 | + if (e.next == null) |
| 201 | + newTab[e.hash & (newCap - 1)] = e; |
| 202 | + //如果是树节点 |
| 203 | + else if (e instanceof TreeNode) |
| 204 | + //对应的放在新桶里面 |
| 205 | + ((TreeNode<K,V>)e).split(this, newTab, j, oldCap); |
| 206 | + else { // preserve order |
| 207 | + //链表节点 遍历,然后对应的放新桶里面 |
| 208 | + Node<K,V> loHead = null, loTail = null; |
| 209 | + Node<K,V> hiHead = null, hiTail = null; |
| 210 | + Node<K,V> next; |
| 211 | + //开始遍历链表 |
| 212 | + do { |
| 213 | + next = e.next; |
| 214 | + //这里挺有意思的 |
| 215 | + //判断 oldCap那个位置是0还是1 |
| 216 | + //如果是0 那么扩容后不变 |
| 217 | + //如果是1 那么扩容后桶是原来位置+原来桶的数量 |
| 218 | + if ((e.hash & oldCap) == 0) { |
| 219 | + if (loTail == null) |
| 220 | + loHead = e; |
| 221 | + else |
| 222 | + loTail.next = e; |
| 223 | + loTail = e; |
| 224 | + } |
| 225 | + else { |
| 226 | + if (hiTail == null) |
| 227 | + hiHead = e; |
| 228 | + else |
| 229 | + hiTail.next = e; |
| 230 | + hiTail = e; |
| 231 | + } |
| 232 | + } while ((e = next) != null); |
| 233 | + //对应的链表放进去就行 |
| 234 | + if (loTail != null) { |
| 235 | + loTail.next = null; |
| 236 | + newTab[j] = loHead; |
| 237 | + } |
| 238 | + if (hiTail != null) { |
| 239 | + hiTail.next = null; |
| 240 | + newTab[j + oldCap] = hiHead; |
| 241 | + } |
| 242 | + } |
| 243 | + } |
| 244 | + } |
| 245 | + } |
| 246 | + return newTab; |
| 247 | + } |
| 248 | +``` |
| 249 | + |
| 250 | +## 综上可以解答以上几个问题了 |
| 251 | + |
| 252 | +#### 1.hashmap采取了取模的方式进行hash |
| 253 | + |
| 254 | +#### 2.如果单个entry的slot链表过长,超过阈值,默认是8,会进行树化操作,转成红黑树,使查询变为O(n)减少性能退化 |
| 255 | + |
| 256 | +#### 3.负载因子(非空闲的bucket/总bucket)大于0.75会进行扩容,从而减少hash冲突 |
| 257 | + |
| 258 | +#### 4.hashmap有几个问题 |
| 259 | + |
| 260 | +- a.多线程链表环导致cpu飙升 |
| 261 | + |
| 262 | +- b.非线程安全 |
| 263 | + |
| 264 | +- c.如果扩容会导致,单次的插入的性能升高,对于性能要求较高的系统是无法接受的=>redis对于hashmap有此类的解决、 |
0 commit comments