Skip to content

Commit 992b5d9

Browse files
author
Michael Glavassevich
committed
Enhancements to the hashing algorithms used in internal data structures within Xerces to make them more resistant to collisions. To improve the distribution, a new hash function is randomly selected each time the number of items in a bucket exceeds a threshold.
git-svn-id: https://svn.apache.org/repos/asf/xerces/java/trunk@1357381 13f79535-47bb-0310-9956-ffa450edef68
1 parent aed06f9 commit 992b5d9

File tree

6 files changed

+584
-88
lines changed

6 files changed

+584
-88
lines changed

src/org/apache/xerces/impl/dtd/DTDGrammar.java

Lines changed: 144 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import java.util.ArrayList;
2121
import java.util.Hashtable;
22+
import java.util.Random;
2223

2324
import org.apache.xerces.impl.dtd.models.CMAny;
2425
import org.apache.xerces.impl.dtd.models.CMBinOp;
@@ -2639,6 +2640,26 @@ public ChildrenList () {}
26392640
* @author Andy Clark, IBM
26402641
*/
26412642
protected static final class QNameHashtable {
2643+
2644+
private static final class PrimeNumberSequenceGenerator {
2645+
2646+
private static int [] PRIMES = {
2647+
3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,
2648+
61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
2649+
139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227,
2650+
229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313,
2651+
317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419,
2652+
421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509,
2653+
521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601, 607, 613, 617,
2654+
619, 631, 641, 643, 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727};
2655+
2656+
static void generateSequence(int[] arrayToFill) {
2657+
Random r = new Random();
2658+
for (int i = 0; i < arrayToFill.length; ++i) {
2659+
arrayToFill[i] = PRIMES[r.nextInt(PRIMES.length)];
2660+
}
2661+
}
2662+
}
26422663

26432664
//
26442665
// Constants
@@ -2651,19 +2672,37 @@ protected static final class QNameHashtable {
26512672
// that we get a better distribution for hashing. -Ac
26522673
/** Hashtable size (101). */
26532674
private static final int HASHTABLE_SIZE = 101;
2675+
2676+
/** Maximum hash collisions per bucket for a table with load factor == 1. */
2677+
private static final int MAX_HASH_COLLISIONS = 40;
2678+
2679+
private static final int MULTIPLIERS_SIZE = 1 << 5;
2680+
private static final int MULTIPLIERS_MASK = MULTIPLIERS_SIZE - 1;
26542681

26552682
//
26562683
// Data
26572684
//
26582685
private Object[][] fHashTable = new Object[HASHTABLE_SIZE][];
2686+
2687+
/** Actual table size. */
2688+
private int fTableSize = HASHTABLE_SIZE;
2689+
2690+
/** The total number of entries in the hash table. */
2691+
private int fCount = 0;
2692+
2693+
/**
2694+
* Array of randomly selected hash function multipliers or <code>null</code>
2695+
* if the default String.hashCode() function should be used.
2696+
*/
2697+
private int[] fHashMultipliers;
26592698

26602699
//
26612700
// Public methods
26622701
//
26632702
/** Associates the given value with the specified key tuple. */
26642703
public void put(String key, int value) {
26652704

2666-
int hash = (key.hashCode() & 0x7FFFFFFF) % HASHTABLE_SIZE;
2705+
int hash = (hash(key) & 0x7FFFFFFF) % fTableSize;
26672706
Object[] bucket = fHashTable[hash];
26682707

26692708
if (bucket == null) {
@@ -2672,6 +2711,11 @@ public void put(String key, int value) {
26722711
bucket[1] = key;
26732712
bucket[2] = new int[]{value};
26742713
fHashTable[hash] = bucket;
2714+
if (++fCount > fTableSize) {
2715+
// Rehash the table if the number of entries
2716+
// would exceed the number of buckets.
2717+
rehash();
2718+
}
26752719
} else {
26762720
int count = ((int[])bucket[0])[0];
26772721
int offset = 1 + 2*count;
@@ -2692,10 +2736,20 @@ public void put(String key, int value) {
26922736
}
26932737
j += 2;
26942738
}
2695-
if (! found) {
2739+
if (!found) {
26962740
bucket[offset++] = key;
26972741
bucket[offset]= new int[]{value};
26982742
((int[])bucket[0])[0] = ++count;
2743+
if (++fCount > fTableSize) {
2744+
// Rehash the table if the number of entries
2745+
// would exceed the number of buckets.
2746+
rehash();
2747+
}
2748+
else if (count > MAX_HASH_COLLISIONS) {
2749+
// Select a new hash function and rehash the table if
2750+
// MAX_HASH_COLLISIONS is exceeded.
2751+
rebalance();
2752+
}
26992753
}
27002754

27012755
}
@@ -2706,7 +2760,7 @@ public void put(String key, int value) {
27062760

27072761
/** Returns the value associated with the specified key tuple. */
27082762
public int get(String key) {
2709-
int hash = (key.hashCode() & 0x7FFFFFFF) % HASHTABLE_SIZE;
2763+
int hash = (hash(key) & 0x7FFFFFFF) % fTableSize;
27102764
Object[] bucket = fHashTable[hash];
27112765

27122766
if (bucket == null) {
@@ -2724,6 +2778,93 @@ public int get(String key) {
27242778
return -1;
27252779

27262780
} // get(int,String,String)
2781+
2782+
public int hash(String symbol) {
2783+
if (fHashMultipliers == null) {
2784+
return symbol.hashCode();
2785+
}
2786+
return hash0(symbol);
2787+
} // hash(String):int
2788+
2789+
private int hash0(String symbol) {
2790+
int code = 0;
2791+
final int length = symbol.length();
2792+
final int[] multipliers = fHashMultipliers;
2793+
for (int i = 0; i < length; ++i) {
2794+
code = code * multipliers[i & MULTIPLIERS_MASK] + symbol.charAt(i);
2795+
}
2796+
return code;
2797+
} // hash0(String):int
2798+
2799+
private void rehash() {
2800+
rehashCommon(fHashTable.length * 2 + 1);
2801+
} // rehash()
2802+
2803+
private void rebalance() {
2804+
if (fHashMultipliers == null) {
2805+
fHashMultipliers = new int[MULTIPLIERS_SIZE];
2806+
}
2807+
PrimeNumberSequenceGenerator.generateSequence(fHashMultipliers);
2808+
rehashCommon(fHashTable.length);
2809+
} // rebalance()
2810+
2811+
// Replaces the bucket array with a new one of newCapacity buckets and
// re-inserts every existing entry using the current hash function.
//
// Bucket layout as used below: slot 0 holds an int[1] with the number of
// entries in the bucket; entry n occupies slots 1+2n (String key) and
// 2+2n (value object).
//
// @param newCapacity number of buckets in the replacement table
private void rehashCommon(final int newCapacity) {
2812+
// Remember the old table before it is swapped out.
2813+
final int oldCapacity = fHashTable.length;
2814+
final Object[][] oldTable = fHashTable;
2815+
2816+
final Object[][] newTable = new Object[newCapacity][];
2817+
// Install the new table first so that fTableSize (used for the modulo
// below) already reflects the new capacity.
2818+
fHashTable = newTable;
2819+
fTableSize = fHashTable.length;
2820+
2821+
for (int i = 0; i < oldCapacity; ++i) {
2822+
final Object[] oldBucket = oldTable[i];
2823+
if (oldBucket != null) {
// Cache the entry count: the counter cell in slot 0 is overwritten if
// this bucket array gets reused in the new table.
2824+
final int oldCount = ((int[]) oldBucket[0])[0];
// At most one new bucket may take over the old bucket's array.
2825+
boolean oldBucketReused = false;
// k is the slot index of the current key inside oldBucket.
2826+
int k = 1;
2827+
for (int j = 0; j < oldCount; ++j) {
// Read the pair before anything is written for this iteration.
2828+
final String key = (String) oldBucket[k];
2829+
final Object value = oldBucket[k+1];
2830+
// Mask off the sign bit so the modulo yields a valid index.
2831+
final int hash = (hash(key) & 0x7FFFFFFF) % fTableSize;
2832+
Object[] bucket = fHashTable[hash];
2833+
2834+
if (bucket == null) {
// Target slot is empty: either allocate a fresh bucket or, the first
// time this happens for the current old bucket, recycle its array.
2835+
if (oldBucketReused) {
2836+
bucket = new Object[1 + 2*INITIAL_BUCKET_SIZE];
2837+
bucket[0] = new int[]{1};
2838+
}
2839+
else {
// NOTE(review): writes into the recycled array appear to land only on
// pairs at or before the one currently being read (its entry count never
// exceeds the number of pairs already consumed), so unread pairs are not
// clobbered -- keep the statement order intact if this is modified.
2840+
bucket = oldBucket;
2841+
((int[])bucket[0])[0] = 1;
2842+
oldBucketReused = true;
2843+
}
2844+
bucket[1] = key;
2845+
bucket[2] = value;
2846+
fHashTable[hash] = bucket;
2847+
}
2848+
else {
// Target bucket already exists: append the pair after its last entry.
2849+
int count = ((int[])bucket[0])[0];
2850+
int offset = 1 + 2*count;
2851+
if (offset == bucket.length) {
// Bucket is full: grow it by INITIAL_BUCKET_SIZE entries, copying the
// counter cell and all existing pairs.
2852+
int newSize = count + INITIAL_BUCKET_SIZE;
2853+
Object[] newBucket = new Object[1 + 2*newSize];
2854+
System.arraycopy(bucket, 0, newBucket, 0, offset);
2855+
bucket = newBucket;
2856+
fHashTable[hash] = bucket;
2857+
}
2858+
bucket[offset++] = key;
2859+
bucket[offset]= value;
2860+
((int[])bucket[0])[0] = ++count;
2861+
}
// Advance to the next key/value pair of the old bucket.
2862+
k += 2;
2863+
}
2864+
}
2865+
}
2866+
2867+
} // rehashCommon(int)
27272868

27282869
} // class QNameHashtable
27292870

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.xerces.util;
19+
20+
import java.util.Random;
21+
22+
/**
 * Supplies randomly chosen prime numbers that are used as the
 * per-character multipliers of the alternate hash function.
 *
 * @version $Id$
 */
final class PrimeNumberSequenceGenerator {

    /**
     * Candidate multipliers: the odd primes from 3 through 727.
     * Declared <code>final</code> so the table reference cannot be
     * reassigned after class initialization.
     */
    private static final int[] PRIMES = {
        3,   5,   7,   11,  13,  17,  19,  23,  29,  31,  37,  41,  43,  47,  53,  59,
        61,  67,  71,  73,  79,  83,  89,  97,  101, 103, 107, 109, 113, 127, 131, 137,
        139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227,
        229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313,
        317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419,
        421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509,
        521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601, 607, 613, 617,
        619, 631, 641, 643, 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727};

    /**
     * Overwrites every element of the given array with a prime picked
     * at random (with repetition) from {@link #PRIMES}.
     *
     * @param arrayToFill the array to populate; a zero-length array is left untouched
     */
    static void generateSequence(int[] arrayToFill) {
        final Random r = new Random();
        for (int i = 0; i < arrayToFill.length; ++i) {
            arrayToFill[i] = PRIMES[r.nextInt(PRIMES.length)];
        }
    }
}

src/org/apache/xerces/util/SoftReferenceSymbolTable.java

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ public SoftReferenceSymbolTable() {
127127
public String addSymbol(String symbol) {
128128
clean();
129129
// search for identical symbol
130+
int collisionCount = 0;
130131
int bucket = hash(symbol) % fTableSize;
131132
for (SREntry entry = fBuckets[bucket]; entry != null; entry = entry.next) {
132133
SREntryData data = (SREntryData)entry.get();
@@ -136,13 +137,20 @@ public String addSymbol(String symbol) {
136137
if (data.symbol.equals(symbol)) {
137138
return data.symbol;
138139
}
140+
++collisionCount;
139141
}
140142

141143
if (fCount >= fThreshold) {
142144
// Rehash the table if the threshold is exceeded
143145
rehash();
144146
bucket = hash(symbol) % fTableSize;
145-
}
147+
}
148+
else if (collisionCount >= fCollisionThreshold) {
149+
// Select a new hash function and rehash the table if
150+
// the collision threshold is exceeded.
151+
rebalance();
152+
bucket = hash(symbol) % fTableSize;
153+
}
146154

147155
// add new entry
148156
symbol = symbol.intern();
@@ -165,6 +173,7 @@ public String addSymbol(String symbol) {
165173
public String addSymbol(char[] buffer, int offset, int length) {
166174
clean();
167175
// search for identical symbol
176+
int collisionCount = 0;
168177
int bucket = hash(buffer, offset, length) % fTableSize;
169178
OUTER: for (SREntry entry = fBuckets[bucket]; entry != null; entry = entry.next) {
170179
SREntryData data = (SREntryData)entry.get();
@@ -174,18 +183,26 @@ public String addSymbol(char[] buffer, int offset, int length) {
174183
if (length == data.characters.length) {
175184
for (int i = 0; i < length; i++) {
176185
if (buffer[offset + i] != data.characters[i]) {
186+
++collisionCount;
177187
continue OUTER;
178188
}
179189
}
180190
return data.symbol;
181191
}
192+
++collisionCount;
182193
}
183194

184195
if (fCount >= fThreshold) {
185196
// Rehash the table if the threshold is exceeded
186197
rehash();
187198
bucket = hash(buffer, offset, length) % fTableSize;
188-
}
199+
}
200+
else if (collisionCount >= fCollisionThreshold) {
201+
// Select a new hash function and rehash the table if
202+
// the collision threshold is exceeded.
203+
rebalance();
204+
bucket = hash(buffer, offset, length) % fTableSize;
205+
}
189206

190207
// add new entry
191208
String symbol = new String(buffer, offset, length).intern();
@@ -218,6 +235,20 @@ protected void compact() {
218235
rehashCommon(((int) (fCount / fLoadFactor)) * 2 + 1);
219236
}
220237

238+
/**
239+
* Randomly selects a new hash function and reorganizes this SymbolTable
240+
* in order to more evenly distribute its entries across the table. This
241+
* method is called automatically when the number keys in one of the
242+
* SymbolTable's buckets exceeds the given collision threshold.
243+
*/
244+
protected void rebalance() {
245+
if (fHashMultipliers == null) {
246+
fHashMultipliers = new int[MULTIPLIERS_SIZE];
247+
}
248+
PrimeNumberSequenceGenerator.generateSequence(fHashMultipliers);
249+
rehashCommon(fBuckets.length);
250+
}
251+
221252
private void rehashCommon(final int newCapacity) {
222253

223254
final int oldCapacity = fBuckets.length;
@@ -235,7 +266,7 @@ private void rehashCommon(final int newCapacity) {
235266

236267
SREntryData data = (SREntryData)e.get();
237268
if (data != null) {
238-
int index = hash(data.characters, 0, data.characters.length) % newCapacity;
269+
int index = hash(data.symbol) % newCapacity;
239270
if (newTable[index] != null) {
240271
newTable[index].prev = e;
241272
}

0 commit comments

Comments
 (0)