Improving ZipFile.getEntry performance using Bloom filters
Eirik Bjørsnøs
eirbjo at gmail.com
Tue Apr 14 10:58:20 UTC 2020
Here's a patch implementing the Bloom filter idea for JarFile.getEntry().
The patch yields a 63% time saving in ZipFile.getEntry() calls during
Spring Petclinic startup, reducing total application startup time by ~5%.
(Hits are 7% slower, misses 65% faster)
The current patch decodes the name to a String and as a temporary hack
assumes UTF-8. This needs to be improved.
Would be great if there was a way to calculate a String.hashCode()
equivalent looking at a byte array and its encoding without actually
decoding it.
diff --git a/src/java.base/share/classes/java/util/zip/ZipFile.java
b/src/java.base/share/classes/java/util/zip/ZipFile.java
index 274e8788d1..754b1fc868 100644
--- a/src/java.base/share/classes/java/util/zip/ZipFile.java
+++ b/src/java.base/share/classes/java/util/zip/ZipFile.java
@@ -40,6 +40,7 @@ import java.nio.file.Files;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.BitSet;
import java.util.Collections;
import java.util.Deque;
import java.util.Enumeration;
@@ -336,6 +337,9 @@ public class ZipFile implements ZipConstants, Closeable
{
Objects.requireNonNull(name, "name");
synchronized (this) {
ensureOpen();
+ if(!this.res.zsrc.bloomFilter.mightContain(name.hashCode())) {
+ return null;
+ }
byte[] bname = zc.getBytes(name);
int pos = res.zsrc.getEntryPos(bname, true);
if (pos != -1) {
@@ -1118,6 +1122,8 @@ public class ZipFile implements ZipConstants,
Closeable {
// referred by their index of their positions in the {@code
entries}.
//
private int[] entries; // array of hashed cen
entry
+ private BloomFilter bloomFilter;
+
private int addEntry(int index, int hash, int next, int pos) {
entries[index++] = hash;
entries[index++] = next;
@@ -1422,6 +1428,7 @@ public class ZipFile implements ZipConstants,
Closeable {
int hash = 0;
int next = -1;
+ bloomFilter = BloomFilter.create(total, 0.01);
// list for all meta entries
ArrayList<Integer> metanamesList = null;
@@ -1430,6 +1437,10 @@ public class ZipFile implements ZipConstants,
Closeable {
int hsh = 0;
int pos = 0;
int limit = cen.length - ENDHDR;
+
+ // TODO: Use actual charset or get hash without decoding
+ final ZipCoder zc = ZipCoder.get(UTF_8.INSTANCE);
+
while (pos + CENHDR <= limit) {
if (i >= total) {
// This will only happen if the zip file has an
incorrect
@@ -1452,6 +1463,8 @@ public class ZipFile implements ZipConstants,
Closeable {
zerror("invalid CEN header (bad header size)");
// Record the CEN offset and the name hash in our hash
cell.
hash = hashN(cen, pos + CENHDR, nlen);
+ // TODO: Get String hash without decoding name?
+ bloomFilter.add(getEntryName(cen, pos, zc).hashCode());
hsh = (hash & 0x7fffffff) % tablelen;
next = table[hsh];
table[hsh] = idx;
@@ -1478,6 +1491,16 @@ public class ZipFile implements ZipConstants,
Closeable {
}
}
+ // Used to produce the String.hashCode() of the entry name
+ private String getEntryName(byte[] cen, int pos, ZipCoder zc) {
+ int nlen = CENNAM(cen, pos);
+ if (!zc.isUTF8() && (CENFLG(cen, pos) & USE_UTF8) != 0) {
+ return zc.toStringUTF8(cen, pos + CENHDR, nlen);
+ } else {
+ return zc.toString(cen, pos + CENHDR, nlen);
+ }
+ }
+
private static void zerror(String msg) throws ZipException {
throw new ZipException(msg);
}
@@ -1572,4 +1595,53 @@ public class ZipFile implements ZipConstants,
Closeable {
return count;
}
}
+
+ /**
+ * See "Less Hashing, Same Performance: Building a Better Bloom Filter"
+ */
+ private static class BloomFilter {
+
+ private final BitSet bits;
+ private final int numBits;
+ private final int numHashFunctions;
+
+ private BloomFilter(int numBits, int numHashFunctions) {
+ this.bits = new BitSet(numBits);
+ this.numBits = numBits;
+ this.numHashFunctions = numHashFunctions;
+ }
+
+ static BloomFilter create(int n, double p) {
+ // See
https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
+ int numBits = (int) (-n * Math.log(Math.max(p,
Double.MIN_VALUE)) / (Math.log(2) * Math.log(2)));
+ int numHashFunctions = Math.max(1, (int) Math.round((double)
numBits / n * Math.log(2)));
+ return new BloomFilter(numBits, numHashFunctions);
+ }
+
+ void add(int inputHash) {
+ for (int i = 1; i <= numHashFunctions; i++) {
+ int hash = inputHash + i * inputHash;
+
+ if (hash < 0)
+ hash = ~hash;
+
+ bits.set(hash % numBits);
+ }
+ }
+
+ boolean mightContain(int inputHash) {
+
+ for (int i = 1; i <= numHashFunctions; i++) {
+ int hash = inputHash + i * inputHash;
+
+ if (hash < 0)
+ hash = ~hash;
+
+ if(!bits.get(hash % numBits)) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
}
On Mon, Apr 13, 2020 at 10:57 PM Eirik Bjørsnøs <eirbjo at gmail.com> wrote:
> On Mon, Apr 13, 2020 at 9:26 PM Eirik Bjørsnøs <eirbjo at gmail.com> wrote:
>
> This translates to a Petclinic startup improvement of 91 ms or 1.4 percent
>> (assuming single-threaded class loading here).
>>
>> I expect that this can be improved further with a more clever use of hash
>> functions. Specifically it would be great to skip earlier in case of a
>> miss, before the String to byte[] encoding happens.
>>
>
> By using the String name hash before encoding it to bytes, savings
> increased to 270 ms, which represents a 4.6 percent performance improvement
> on Spring Petclinic startup.
>
> Eirik.
>
More information about the core-libs-dev
mailing list