Improving ZipFile.getEntry performance using Bloom filters

Tue Apr 14 10:58:20 UTC 2020

Here's a patch implementing the Bloom filter idea for JarFile.getEntry().

The patch yields a 63% time saving in ZipFile.getEntry() calls during
Spring Petclinic startup, reducing total application startup time by ~5%.
(Hits are 7% slower, misses 65% faster)

The current patch decodes the name to a String and as a temporary hack
assumes UTF-8. This needs to be improved.

Would be great if there was a way to calculate a String.hashCode()
equivalent looking at a byte array and its encoding without actually
decoding it.

diff --git a/src/java.base/share/classes/java/util/zip/ZipFile.java
b/src/java.base/share/classes/java/util/zip/ZipFile.java
index 274e8788d1..754b1fc868 100644
--- a/src/java.base/share/classes/java/util/zip/ZipFile.java
+++ b/src/java.base/share/classes/java/util/zip/ZipFile.java
@@ -40,6 +40,7 @@ import java.nio.file.Files;
 import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.BitSet;
 import java.util.Collections;
 import java.util.Deque;
 import java.util.Enumeration;
@@ -336,6 +337,9 @@ public class ZipFile implements ZipConstants, Closeable
{
         Objects.requireNonNull(name, "name");
         synchronized (this) {
             ensureOpen();
+            if(!this.res.zsrc.bloomFilter.mightContain(name.hashCode())) {
+                return null;
+            }
             byte[] bname = zc.getBytes(name);
             int pos = res.zsrc.getEntryPos(bname, true);
             if (pos != -1) {
@@ -1118,6 +1122,8 @@ public class ZipFile implements ZipConstants,
Closeable {
         // referred by their index of their positions in the {@code
entries}.
         //
         private int[] entries;                  // array of hashed cen
entry
+        private BloomFilter bloomFilter;
+
         private int addEntry(int index, int hash, int next, int pos) {
             entries[index++] = hash;
             entries[index++] = next;
@@ -1422,6 +1428,7 @@ public class ZipFile implements ZipConstants,
Closeable {
             int hash = 0;
             int next = -1;

+            bloomFilter = BloomFilter.create(total, 0.01);
             // list for all meta entries
             ArrayList<Integer> metanamesList = null;

@@ -1430,6 +1437,10 @@ public class ZipFile implements ZipConstants,
Closeable {
             int hsh = 0;
             int pos = 0;
             int limit = cen.length - ENDHDR;
+
+            // TODO: Use actual charset or get hash without decoding
+            final ZipCoder zc = ZipCoder.get(UTF_8.INSTANCE);
+
             while (pos + CENHDR <= limit) {
                 if (i >= total) {
                     // This will only happen if the zip file has an
incorrect
@@ -1452,6 +1463,8 @@ public class ZipFile implements ZipConstants,
Closeable {
                     zerror("invalid CEN header (bad header size)");
                 // Record the CEN offset and the name hash in our hash
cell.
                 hash = hashN(cen, pos + CENHDR, nlen);
+                // TODO: Get String hash without decoding name?
+                bloomFilter.add(getEntryName(cen, pos, zc).hashCode());
                 hsh = (hash & 0x7fffffff) % tablelen;
                 next = table[hsh];
                 table[hsh] = idx;
@@ -1478,6 +1491,16 @@ public class ZipFile implements ZipConstants,
Closeable {
             }
         }

+        // Used to produce the String.hashCode() of the entry name
+        private String getEntryName(byte[] cen, int pos, ZipCoder zc) {
+            int nlen = CENNAM(cen, pos);
+            if (!zc.isUTF8() && (CENFLG(cen, pos) & USE_UTF8) != 0) {
+                return zc.toStringUTF8(cen, pos + CENHDR, nlen);
+            } else {
+                return zc.toString(cen, pos + CENHDR, nlen);
+            }
+        }
+
         private static void zerror(String msg) throws ZipException {
             throw new ZipException(msg);
         }
@@ -1572,4 +1595,53 @@ public class ZipFile implements ZipConstants,
Closeable {
             return count;
         }
     }
+
+    /**
+     * See "Less Hashing, Same Performance: Building a Better Bloom Filter"
+     */
+    private static class BloomFilter {
+
+        private final BitSet bits;
+        private final int numBits;
+        private final int numHashFunctions;
+
+        private BloomFilter(int numBits, int numHashFunctions) {
+            this.bits = new BitSet(numBits);
+            this.numBits = numBits;
+            this.numHashFunctions = numHashFunctions;
+        }
+
+        static BloomFilter create(int n, double p) {
+            // See
https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
+            int numBits = (int) (-n * Math.log(Math.max(p,
Double.MIN_VALUE)) / (Math.log(2) * Math.log(2)));
+            int numHashFunctions =  Math.max(1, (int) Math.round((double)
numBits / n * Math.log(2)));
+            return new BloomFilter(numBits, numHashFunctions);
+        }
+
+        void add(int inputHash) {
+            for (int i = 1; i <= numHashFunctions; i++) {
+                int hash = inputHash + i * inputHash;
+
+                if (hash < 0)
+                    hash = ~hash;
+
+                bits.set(hash % numBits);
+            }
+        }
+
+        boolean mightContain(int inputHash) {
+
+            for (int i = 1; i <= numHashFunctions; i++) {
+                int hash = inputHash + i * inputHash;
+
+                if (hash < 0)
+                    hash = ~hash;
+
+                if(!bits.get(hash % numBits)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
 }



On Mon, Apr 13, 2020 at 10:57 PM Eirik Bjørsnøs <eirbjo at gmail.com> wrote:

> On Mon, Apr 13, 2020 at 9:26 PM Eirik Bjørsnøs <eirbjo at gmail.com> wrote:
>
> This translates to a Petclinic startup improvement of 91 ms or 1.4 percent
>> (assuming single-threaded class loading here).
>>
>> I expect that this can be improved further with a more clever use of hash
>> functions. Specifically it would be great to skip earlier in case of a
>> miss, before the String to byte[] encoding happens.
>>
>
> By using the String name hash before encoding it to bytes, savings
> increased to 270 ms, which represents a 4.6 percent performance improvement
> on Spring Petclinic startup.
>
> Eirik.
>