Improving ZipFile.getEntryPos double loop queries
Eirik Bjørsnøs
eirbjo at gmail.com
Thu Apr 16 11:47:58 UTC 2020
On Thu, Apr 16, 2020 at 12:59 PM Claes Redestad <claes.redestad at oracle.com>
wrote:
>
> I think this is an interesting idea and a good optimization.
Looks like we get most of the performance win of Bloom filters while not
introducing regressions for hits, footprint or complexity.
We'll
> deliberately cause a few more hash collisions,
Will ZIP files commonly include entries for both "META-INF" and "META-INF/"?
Some issues:
>
> - Need to be made to work on empty strings (hashN([], 0, [].length) ->
> a[off + len - 1] -> a[-1] -> AIOOBE)
>
Oops. Should be safe to check for len > 0, right?
if(len > 0 && a[off + len -1] == '/')
len--;
> - If name actually ends with a / we shouldn't look for name + /,
> e.g.:
>
> (addSlash && name.length == nlen - 1 && name.length > 0 &&
> name[name.length - 1] != '/' && cen[pos + CENHDR + nlen - 1] == '/')
>
Fixed and added some comments for readability.
Here's an updated patch:
diff --git a/src/java.base/share/classes/java/util/zip/ZipFile.java
b/src/java.base/share/classes/java/util/zip/ZipFile.java
index 274e8788d1..ba5101e0db 100644
--- a/src/java.base/share/classes/java/util/zip/ZipFile.java
+++ b/src/java.base/share/classes/java/util/zip/ZipFile.java
@@ -1266,6 +1266,11 @@ public class ZipFile implements ZipConstants,
Closeable {
}
private static final int hashN(byte[] a, int off, int len) {
+ // Performance optimization: getEntryPos assumes that the hash
+ // of a name remains unchanged when appending a trailing '/'.
+ if(len > 0 && a[off + len -1] == '/')
+ len--;
+
int h = 1;
while (len-- > 0) {
h = 31 * h + a[off++];
@@ -1273,10 +1278,6 @@ public class ZipFile implements ZipConstants,
Closeable {
return h;
}
- private static final int hash_append(int hash, byte b) {
- return hash * 31 + b;
- }
-
private static class End {
int centot; // 4 bytes
long cenlen; // 4 bytes
@@ -1493,48 +1494,36 @@ public class ZipFile implements ZipConstants,
Closeable {
int hsh = hashN(name, 0, name.length);
int idx = table[(hsh & 0x7fffffff) % tablelen];
/*
- * This while loop is an optimization where a double lookup
- * for name and name+/ is being performed. The name char
- * array has enough room at the end to try again with a
- * slash appended if the first table lookup does not succeed.
+ * Search down the target hash chain for a entry whose
+ * 32 bit hash matches the hashed name.
*/
- while (true) {
- /*
- * Search down the target hash chain for a entry whose
- * 32 bit hash matches the hashed name.
- */
- while (idx != ZIP_ENDCHAIN) {
- if (getEntryHash(idx) == hsh) {
- // The CEN name must match the specfied one
- int pos = getEntryPos(idx);
- if (name.length == CENNAM(cen, pos)) {
- boolean matched = true;
- int nameoff = pos + CENHDR;
- for (int i = 0; i < name.length; i++) {
- if (name[i] != cen[nameoff++]) {
- matched = false;
- break;
- }
- }
- if (matched) {
- return pos;
+ while (idx != ZIP_ENDCHAIN) {
+ if (getEntryHash(idx) == hsh) {
+ // The CEN name must match the specfied one
+ int pos = getEntryPos(idx);
+ final int nlen = CENNAM(cen, pos);
+ final int nstart = pos + CENHDR;
+ final boolean allowSlash = addSlash // we should
match names with a trailing '/'
+ && nlen == name.length + 1 // nlen has
one extra byte
+ && cen[nstart + nlen - 1] == '/' // that byte
is a '/'
+ && name[name.length - 1] != '/'; // the name
does not already end with '/'
+ if (name.length == nlen || allowSlash) {
+ boolean matched = true;
+ int nameoff = pos + CENHDR;
+ for (int i = 0; i < name.length; i++) {
+ if (name[i] != cen[nameoff++]) {
+ matched = false;
+ break;
}
- }
+ }
+ if (matched) {
+ return pos;
+ }
}
- idx = getEntryNext(idx);
- }
- /* If not addSlash, or slash is already there, we are done
*/
- if (!addSlash || name.length == 0 || name[name.length -
1] == '/') {
- return -1;
}
- /* Add slash and try once more */
- name = Arrays.copyOf(name, name.length + 1);
- name[name.length - 1] = '/';
- hsh = hash_append(hsh, (byte)'/');
- //idx = table[hsh % tablelen];
- idx = table[(hsh & 0x7fffffff) % tablelen];
- addSlash = false;
+ idx = getEntryNext(idx);
}
+ return -1;
}
/**
More information about the core-libs-dev
mailing list