Improving ZipFile.getEntry performance using Bloom filters
Claes Redestad
claes.redestad at oracle.com
Tue Apr 14 11:26:31 UTC 2020
Hi Eirik,
interesting idea and results.
I wonder if the overhead you remove here is mainly avoiding the need to
call byte[] bname = zc.getBytes(name); during lookup.
If we refactored ZipCoder to have a zc.getHash(name) method and
getEntryPos to decode to a byte[] only lazily, we ought to be able to
get most of the speed-up for the miss case, and none (or very little)
added cost on hits. And no added footprint for a bloom filter.
int pos = res.zsrc.getEntryPos(zc, name, true);
...
getEntryPos(ZipCoder zc, String name, boolean addSlash) {
if (total == 0) {
return -1;
}
int hash = zc.getHash(name);
int idx = table[(hsh & 0x7fffffff) % tablelen];
byte[] bname = null;
...
while (idx != ZIP_ENDCHAIN) {
if (getEntryHash(idx) == hsh) {
if (bname == null) {
bname = zc.getBytes(name);
}
if (bname.length == ...
What do you think?
Thanks!
/Claes
On 2020-04-14 12:58, Eirik Bjørsnøs wrote:
> Here's a patch implementing the Bloom filter idea for JarFile.getEntry().
>
> The patch yields a 63% time saving in ZipFile.getEntry() calls during
> Spring Petclinic startup, reducing total application startup time by ~5%.
> (Hits are 7% slower, misses 65% faster)
>
> The current patch decodes the name to a String and as a temporary hack
> assumes UTF-8. This needs to be improved.
>
> Would be great if there was a way to calculate a String.hashCode()
> equivalent looking at a byte array and its encoding without actually
> decoding it.
>
> diff --git a/src/java.base/share/classes/java/util/zip/ZipFile.java
> b/src/java.base/share/classes/java/util/zip/ZipFile.java
> index 274e8788d1..754b1fc868 100644
> --- a/src/java.base/share/classes/java/util/zip/ZipFile.java
> +++ b/src/java.base/share/classes/java/util/zip/ZipFile.java
> @@ -40,6 +40,7 @@ import java.nio.file.Files;
> import java.util.ArrayDeque;
> import java.util.ArrayList;
> import java.util.Arrays;
> +import java.util.BitSet;
> import java.util.Collections;
> import java.util.Deque;
> import java.util.Enumeration;
> @@ -336,6 +337,9 @@ public class ZipFile implements ZipConstants, Closeable
> {
> Objects.requireNonNull(name, "name");
> synchronized (this) {
> ensureOpen();
> + if(!this.res.zsrc.bloomFilter.mightContain(name.hashCode())) {
> + return null;
> + }
> byte[] bname = zc.getBytes(name);
> int pos = res.zsrc.getEntryPos(bname, true);
> if (pos != -1) {
> @@ -1118,6 +1122,8 @@ public class ZipFile implements ZipConstants,
> Closeable {
> // referred by their index of their positions in the {@code
> entries}.
> //
> private int[] entries; // array of hashed cen
> entry
> + private BloomFilter bloomFilter;
> +
> private int addEntry(int index, int hash, int next, int pos) {
> entries[index++] = hash;
> entries[index++] = next;
> @@ -1422,6 +1428,7 @@ public class ZipFile implements ZipConstants,
> Closeable {
> int hash = 0;
> int next = -1;
>
> + bloomFilter = BloomFilter.create(total, 0.01);
> // list for all meta entries
> ArrayList<Integer> metanamesList = null;
>
> @@ -1430,6 +1437,10 @@ public class ZipFile implements ZipConstants,
> Closeable {
> int hsh = 0;
> int pos = 0;
> int limit = cen.length - ENDHDR;
> +
> + // TODO: Use actual charset or get hash without decoding
> + final ZipCoder zc = ZipCoder.get(UTF_8.INSTANCE);
> +
> while (pos + CENHDR <= limit) {
> if (i >= total) {
> // This will only happen if the zip file has an
> incorrect
> @@ -1452,6 +1463,8 @@ public class ZipFile implements ZipConstants,
> Closeable {
> zerror("invalid CEN header (bad header size)");
> // Record the CEN offset and the name hash in our hash
> cell.
> hash = hashN(cen, pos + CENHDR, nlen);
> + // TODO: Get String hash without decoding name?
> + bloomFilter.add(getEntryName(cen, pos, zc).hashCode());
> hsh = (hash & 0x7fffffff) % tablelen;
> next = table[hsh];
> table[hsh] = idx;
> @@ -1478,6 +1491,16 @@ public class ZipFile implements ZipConstants,
> Closeable {
> }
> }
>
> + // Used to produce the String.hashCode() of the entry name
> + private String getEntryName(byte[] cen, int pos, ZipCoder zc) {
> + int nlen = CENNAM(cen, pos);
> + if (!zc.isUTF8() && (CENFLG(cen, pos) & USE_UTF8) != 0) {
> + return zc.toStringUTF8(cen, pos + CENHDR, nlen);
> + } else {
> + return zc.toString(cen, pos + CENHDR, nlen);
> + }
> + }
> +
> private static void zerror(String msg) throws ZipException {
> throw new ZipException(msg);
> }
> @@ -1572,4 +1595,53 @@ public class ZipFile implements ZipConstants,
> Closeable {
> return count;
> }
> }
> +
> + /**
> + * See "Less Hashing, Same Performance: Building a Better Bloom Filter"
> + */
> + private static class BloomFilter {
> +
> + private final BitSet bits;
> + private final int numBits;
> + private final int numHashFunctions;
> +
> + private BloomFilter(int numBits, int numHashFunctions) {
> + this.bits = new BitSet(numBits);
> + this.numBits = numBits;
> + this.numHashFunctions = numHashFunctions;
> + }
> +
> + static BloomFilter create(int n, double p) {
> + // See
> https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
> + int numBits = (int) (-n * Math.log(Math.max(p,
> Double.MIN_VALUE)) / (Math.log(2) * Math.log(2)));
> + int numHashFunctions = Math.max(1, (int) Math.round((double)
> numBits / n * Math.log(2)));
> + return new BloomFilter(numBits, numHashFunctions);
> + }
> +
> + void add(int inputHash) {
> + for (int i = 1; i <= numHashFunctions; i++) {
> + int hash = inputHash + i * inputHash;
> +
> + if (hash < 0)
> + hash = ~hash;
> +
> + bits.set(hash % numBits);
> + }
> + }
> +
> + boolean mightContain(int inputHash) {
> +
> + for (int i = 1; i <= numHashFunctions; i++) {
> + int hash = inputHash + i * inputHash;
> +
> + if (hash < 0)
> + hash = ~hash;
> +
> + if(!bits.get(hash % numBits)) {
> + return false;
> + }
> + }
> + return true;
> + }
> + }
> }
>
>
>
> On Mon, Apr 13, 2020 at 10:57 PM Eirik Bjørsnøs <eirbjo at gmail.com> wrote:
>
>> On Mon, Apr 13, 2020 at 9:26 PM Eirik Bjørsnøs <eirbjo at gmail.com> wrote:
>>
>> This translates to a Petclinic startup improvement of 91 ms or 1.4 percent
>>> (assuming single-threaded class loading here).
>>>
>>> I expect that this can be improved further with a more clever use of hash
>>> functions. Specifically it would be great to skip earlier in case of a
>>> miss, before the String to byte[] encoding happens.
>>>
>>
>> By using the String name hash before encoding it to bytes, savings
>> increased to 270 ms, which represents a 4.6 percent performance improvement
>> on Spring Petclinic startup.
>>
>> Eirik.
>>
More information about the core-libs-dev
mailing list