Improving ZipFile.getEntry performance using Bloom filters

Tue Apr 14 11:26:31 UTC 2020

Hi Eirik,

interesting idea and results.

I wonder if the overhead you remove here is mainly avoiding the need to
call byte[] bname = zc.getBytes(name); during lookup.

If we refactored ZipCoder to have a zc.getHash(name) method and
getEntryPos to decode to a byte[] only lazily, we ought to be able to
get most of the speed-up for the miss case, and none (or very little)
added cost on hits. And no added footprint for a bloom filter.

             int pos = res.zsrc.getEntryPos(zc, name, true);
...
         getEntryPos(ZipCoder zc, String name, boolean addSlash) {
             if (total == 0) {
                 return -1;
             }
             int hash = zc.getHash(name);
             int idx = table[(hsh & 0x7fffffff) % tablelen];
             byte[] bname = null;
             ...
                 while (idx != ZIP_ENDCHAIN) {
                     if (getEntryHash(idx) == hsh) {
                         if (bname == null) {
                             bname = zc.getBytes(name);
                         }
                         if (bname.length == ...

What do you think?

Thanks!

/Claes

On 2020-04-14 12:58, Eirik Bjørsnøs wrote:
> Here's a patch implementing the Bloom filter idea for JarFile.getEntry().
> 
> The patch yields a 63% time saving in ZipFile.getEntry() calls during
> Spring Petclinic startup, reducing total application startup time by ~5%.
> (Hits are 7% slower, misses 65% faster)
> 
> The current patch decodes the name to a String and as a temporary hack
> assumes UTF-8. This needs to be improved.
> 
> Would be great if there was a way to calculate a String.hashCode()
> equivalent looking at a byte array and its encoding without actually
> decoding it.
> 
> diff --git a/src/java.base/share/classes/java/util/zip/ZipFile.java
> b/src/java.base/share/classes/java/util/zip/ZipFile.java
> index 274e8788d1..754b1fc868 100644
> --- a/src/java.base/share/classes/java/util/zip/ZipFile.java
> +++ b/src/java.base/share/classes/java/util/zip/ZipFile.java
> @@ -40,6 +40,7 @@ import java.nio.file.Files;
>   import java.util.ArrayDeque;
>   import java.util.ArrayList;
>   import java.util.Arrays;
> +import java.util.BitSet;
>   import java.util.Collections;
>   import java.util.Deque;
>   import java.util.Enumeration;
> @@ -336,6 +337,9 @@ public class ZipFile implements ZipConstants, Closeable
> {
>           Objects.requireNonNull(name, "name");
>           synchronized (this) {
>               ensureOpen();
> +            if(!this.res.zsrc.bloomFilter.mightContain(name.hashCode())) {
> +                return null;
> +            }
>               byte[] bname = zc.getBytes(name);
>               int pos = res.zsrc.getEntryPos(bname, true);
>               if (pos != -1) {
> @@ -1118,6 +1122,8 @@ public class ZipFile implements ZipConstants,
> Closeable {
>           // referred by their index of their positions in the {@code
> entries}.
>           //
>           private int[] entries;                  // array of hashed cen
> entry
> +        private BloomFilter bloomFilter;
> +
>           private int addEntry(int index, int hash, int next, int pos) {
>               entries[index++] = hash;
>               entries[index++] = next;
> @@ -1422,6 +1428,7 @@ public class ZipFile implements ZipConstants,
> Closeable {
>               int hash = 0;
>               int next = -1;
> 
> +            bloomFilter = BloomFilter.create(total, 0.01);
>               // list for all meta entries
>               ArrayList<Integer> metanamesList = null;
> 
> @@ -1430,6 +1437,10 @@ public class ZipFile implements ZipConstants,
> Closeable {
>               int hsh = 0;
>               int pos = 0;
>               int limit = cen.length - ENDHDR;
> +
> +            // TODO: Use actual charset or get hash without decoding
> +            final ZipCoder zc = ZipCoder.get(UTF_8.INSTANCE);
> +
>               while (pos + CENHDR <= limit) {
>                   if (i >= total) {
>                       // This will only happen if the zip file has an
> incorrect
> @@ -1452,6 +1463,8 @@ public class ZipFile implements ZipConstants,
> Closeable {
>                       zerror("invalid CEN header (bad header size)");
>                   // Record the CEN offset and the name hash in our hash
> cell.
>                   hash = hashN(cen, pos + CENHDR, nlen);
> +                // TODO: Get String hash without decoding name?
> +                bloomFilter.add(getEntryName(cen, pos, zc).hashCode());
>                   hsh = (hash & 0x7fffffff) % tablelen;
>                   next = table[hsh];
>                   table[hsh] = idx;
> @@ -1478,6 +1491,16 @@ public class ZipFile implements ZipConstants,
> Closeable {
>               }
>           }
> 
> +        // Used to produce the String.hashCode() of the entry name
> +        private String getEntryName(byte[] cen, int pos, ZipCoder zc) {
> +            int nlen = CENNAM(cen, pos);
> +            if (!zc.isUTF8() && (CENFLG(cen, pos) & USE_UTF8) != 0) {
> +                return zc.toStringUTF8(cen, pos + CENHDR, nlen);
> +            } else {
> +                return zc.toString(cen, pos + CENHDR, nlen);
> +            }
> +        }
> +
>           private static void zerror(String msg) throws ZipException {
>               throw new ZipException(msg);
>           }
> @@ -1572,4 +1595,53 @@ public class ZipFile implements ZipConstants,
> Closeable {
>               return count;
>           }
>       }
> +
> +    /**
> +     * See "Less Hashing, Same Performance: Building a Better Bloom Filter"
> +     */
> +    private static class BloomFilter {
> +
> +        private final BitSet bits;
> +        private final int numBits;
> +        private final int numHashFunctions;
> +
> +        private BloomFilter(int numBits, int numHashFunctions) {
> +            this.bits = new BitSet(numBits);
> +            this.numBits = numBits;
> +            this.numHashFunctions = numHashFunctions;
> +        }
> +
> +        static BloomFilter create(int n, double p) {
> +            // See
> https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
> +            int numBits = (int) (-n * Math.log(Math.max(p,
> Double.MIN_VALUE)) / (Math.log(2) * Math.log(2)));
> +            int numHashFunctions =  Math.max(1, (int) Math.round((double)
> numBits / n * Math.log(2)));
> +            return new BloomFilter(numBits, numHashFunctions);
> +        }
> +
> +        void add(int inputHash) {
> +            for (int i = 1; i <= numHashFunctions; i++) {
> +                int hash = inputHash + i * inputHash;
> +
> +                if (hash < 0)
> +                    hash = ~hash;
> +
> +                bits.set(hash % numBits);
> +            }
> +        }
> +
> +        boolean mightContain(int inputHash) {
> +
> +            for (int i = 1; i <= numHashFunctions; i++) {
> +                int hash = inputHash + i * inputHash;
> +
> +                if (hash < 0)
> +                    hash = ~hash;
> +
> +                if(!bits.get(hash % numBits)) {
> +                    return false;
> +                }
> +            }
> +            return true;
> +        }
> +    }
>   }
> 
> 
> 
> On Mon, Apr 13, 2020 at 10:57 PM Eirik Bjørsnøs <eirbjo at gmail.com> wrote:
> 
>> On Mon, Apr 13, 2020 at 9:26 PM Eirik Bjørsnøs <eirbjo at gmail.com> wrote:
>>
>> This translates to a Petclinic startup improvement of 91 ms or 1.4 percent
>>> (assuming single-threaded class loading here).
>>>
>>> I expect that this can be improved further with a more clever use of hash
>>> functions. Specifically it would be great to skip earlier in case of a
>>> miss, before the String to byte[] encoding happens.
>>>
>>
>> By using the String name hash before encoding it to bytes, savings
>> increased to 270 ms, which represents a 4.6 percent performance improvement
>> on Spring Petclinic startup.
>>
>> Eirik.
>>