Here's a patch implementing the Bloom filter idea for JarFile.getEntry(). The patch yields a 63% time saving in ZipFile.getEntry() calls during Spring Petclinic startup, reducing total application startup time by ~5%. (Hits are 7% slower, misses 65% faster)
The current patch decodes the name to a String and as a temporary hack assumes UTF-8. This needs to be improved. Would be great if there was a way to calculate a String.hashCode() equivalent looking at a byte array and its encoding without actually decoding it. diff --git a/src/java.base/share/classes/java/util/zip/ZipFile.java b/src/java.base/share/classes/java/util/zip/ZipFile.java index 274e8788d1..754b1fc868 100644 --- a/src/java.base/share/classes/java/util/zip/ZipFile.java +++ b/src/java.base/share/classes/java/util/zip/ZipFile.java @@ -40,6 +40,7 @@ import java.nio.file.Files; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; import java.util.Collections; import java.util.Deque; import java.util.Enumeration; @@ -336,6 +337,9 @@ public class ZipFile implements ZipConstants, Closeable { Objects.requireNonNull(name, "name"); synchronized (this) { ensureOpen(); + if(!this.res.zsrc.bloomFilter.mightContain(name.hashCode())) { + return null; + } byte[] bname = zc.getBytes(name); int pos = res.zsrc.getEntryPos(bname, true); if (pos != -1) { @@ -1118,6 +1122,8 @@ public class ZipFile implements ZipConstants, Closeable { // referred by their index of their positions in the {@code entries}. // private int[] entries; // array of hashed cen entry + private BloomFilter bloomFilter; + private int addEntry(int index, int hash, int next, int pos) { entries[index++] = hash; entries[index++] = next; @@ -1422,6 +1428,7 @@ public class ZipFile implements ZipConstants, Closeable { int hash = 0; int next = -1; + bloomFilter = BloomFilter.create(total, 0.01); // list for all meta entries ArrayList<Integer> metanamesList = null; @@ -1430,6 +1437,10 @@ public class ZipFile implements ZipConstants, Closeable { int hsh = 0; int pos = 0; int limit = cen.length - ENDHDR; + + // TODO: Use actual charset or get hash without decoding + final ZipCoder zc = ZipCoder.get(UTF_8.INSTANCE); + while (pos + CENHDR <= limit) { if (i >= total) { // This will only happen if the zip file has an incorrect @@ -1452,6 +1463,8 @@ public class ZipFile implements ZipConstants, Closeable { zerror("invalid CEN header (bad header size)"); // Record the CEN offset and the name hash in our hash cell. hash = hashN(cen, pos + CENHDR, nlen); + // TODO: Get String hash without decoding name? + bloomFilter.add(getEntryName(cen, pos, zc).hashCode()); hsh = (hash & 0x7fffffff) % tablelen; next = table[hsh]; table[hsh] = idx; @@ -1478,6 +1491,16 @@ public class ZipFile implements ZipConstants, Closeable { } } + // Used to produce the String.hashCode() of the entry name + private String getEntryName(byte[] cen, int pos, ZipCoder zc) { + int nlen = CENNAM(cen, pos); + if (!zc.isUTF8() && (CENFLG(cen, pos) & USE_UTF8) != 0) { + return zc.toStringUTF8(cen, pos + CENHDR, nlen); + } else { + return zc.toString(cen, pos + CENHDR, nlen); + } + } + private static void zerror(String msg) throws ZipException { throw new ZipException(msg); } @@ -1572,4 +1595,53 @@ public class ZipFile implements ZipConstants, Closeable { return count; } } + + /** + * See "Less Hashing, Same Performance: Building a Better Bloom Filter" + */ + private static class BloomFilter { + + private final BitSet bits; + private final int numBits; + private final int numHashFunctions; + + private BloomFilter(int numBits, int numHashFunctions) { + this.bits = new BitSet(numBits); + this.numBits = numBits; + this.numHashFunctions = numHashFunctions; + } + + static BloomFilter create(int n, double p) { + // See https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives + int numBits = (int) (-n * Math.log(Math.max(p, Double.MIN_VALUE)) / (Math.log(2) * Math.log(2))); + int numHashFunctions = Math.max(1, (int) Math.round((double) numBits / n * Math.log(2))); + return new BloomFilter(numBits, numHashFunctions); + } + + void add(int inputHash) { + for (int i = 1; i <= numHashFunctions; i++) { + int hash = inputHash + i * inputHash; + + if (hash < 0) + hash = ~hash; + + bits.set(hash % numBits); + } + } + + boolean mightContain(int inputHash) { + + for (int i = 1; i <= numHashFunctions; i++) { + int hash = inputHash + i * inputHash; + + if (hash < 0) + hash = ~hash; + + if(!bits.get(hash % numBits)) { + return false; + } + } + return true; + } + } } On Mon, Apr 13, 2020 at 10:57 PM Eirik Bjørsnøs <eir...@gmail.com> wrote: > On Mon, Apr 13, 2020 at 9:26 PM Eirik Bjørsnøs <eir...@gmail.com> wrote: > > This translates to a Petclinic startup improvement of 91 ms or 1.4 percent >> (assuming single-threaded class loading here). >> >> I expect that this can be improved further with a more clever use of hash >> functions. Specifically it would be great to skip earlier in case of a >> miss, before the String to byte[] encoding happens. >> > > By using the String name hash before encoding it to bytes, savings > increased to 270 ms, which represents a 4.6 percent performance improvement > on Spring Petclinic startup. > > Eirik. >