Improving ZipFile.getEntryPos double loop queries

Eirik Bjørsnøs eirbjo at gmail.com
Thu Apr 16 11:47:58 UTC 2020


On Thu, Apr 16, 2020 at 12:59 PM Claes Redestad <claes.redestad at oracle.com>
wrote:

>
> I think this is an interesting idea and a good optimization.


Looks like we get most of the performance win of Bloom filters while not
introducing regressions for hits, footprint or complexity.

We'll
> deliberately cause a few more hash collisions,


Will ZIP files commonly include entries for both "META-INF" and "META-INF/"?

Some issues:
>
> - Need to be made to work on empty strings (hashN([], 0, [].length) ->
> a[off + len - 1] -> a[-1] -> AIOOBE)
>

Oops. Should be safe to check for len > 0, right?

    if(len > 0 && a[off + len -1] == '/')
        len--;


> - If name actually ends with a / we shouldn't look for name + /,
> e.g.:
>
>     (addSlash && name.length == nlen - 1 && name.length > 0 &&
> name[name.length - 1] != '/' && cen[pos + CENHDR + nlen - 1] == '/')
>

Fixed and added some comments for readability.

Here's an updated patch:

diff --git a/src/java.base/share/classes/java/util/zip/ZipFile.java
b/src/java.base/share/classes/java/util/zip/ZipFile.java
index 274e8788d1..ba5101e0db 100644
--- a/src/java.base/share/classes/java/util/zip/ZipFile.java
+++ b/src/java.base/share/classes/java/util/zip/ZipFile.java
@@ -1266,6 +1266,11 @@ public class ZipFile implements ZipConstants,
Closeable {
         }

         private static final int hashN(byte[] a, int off, int len) {
+            // Performance optimization: getEntryPos assumes that the hash
+            // of a name remains unchanged when appending a trailing  '/'.
+            if(len > 0 && a[off + len -1] == '/')
+                len--;
+
             int h = 1;
             while (len-- > 0) {
                 h = 31 * h + a[off++];
@@ -1273,10 +1278,6 @@ public class ZipFile implements ZipConstants,
Closeable {
             return h;
         }

-        private static final int hash_append(int hash, byte b) {
-            return hash * 31 + b;
-        }
-
         private static class End {
             int  centot;     // 4 bytes
             long cenlen;     // 4 bytes
@@ -1493,48 +1494,36 @@ public class ZipFile implements ZipConstants,
Closeable {
             int hsh = hashN(name, 0, name.length);
             int idx = table[(hsh & 0x7fffffff) % tablelen];
             /*
-             * This while loop is an optimization where a double lookup
-             * for name and name+/ is being performed. The name char
-             * array has enough room at the end to try again with a
-             * slash appended if the first table lookup does not succeed.
+             * Search down the target hash chain for a entry whose
+             * 32 bit hash matches the hashed name.
              */
-            while (true) {
-                /*
-                 * Search down the target hash chain for a entry whose
-                 * 32 bit hash matches the hashed name.
-                 */
-                while (idx != ZIP_ENDCHAIN) {
-                    if (getEntryHash(idx) == hsh) {
-                        // The CEN name must match the specfied one
-                        int pos = getEntryPos(idx);
-                        if (name.length == CENNAM(cen, pos)) {
-                            boolean matched = true;
-                            int nameoff = pos + CENHDR;
-                            for (int i = 0; i < name.length; i++) {
-                                if (name[i] != cen[nameoff++]) {
-                                    matched = false;
-                                    break;
-                                }
-                            }
-                            if (matched) {
-                                return pos;
+            while (idx != ZIP_ENDCHAIN) {
+                if (getEntryHash(idx) == hsh) {
+                    // The CEN name must match the specfied one
+                    int pos = getEntryPos(idx);
+                    final int nlen = CENNAM(cen, pos);
+                    final int nstart = pos + CENHDR;
+                    final boolean allowSlash = addSlash       // we should
match names with a trailing '/'
+                            && nlen == name.length + 1        // nlen has
one extra byte
+                            && cen[nstart + nlen - 1] == '/'  // that byte
is a '/'
+                            && name[name.length - 1] != '/';  // the name
does not already end with '/'
+                    if (name.length == nlen || allowSlash) {
+                        boolean matched = true;
+                        int nameoff = pos + CENHDR;
+                        for (int i = 0; i < name.length; i++) {
+                            if (name[i] != cen[nameoff++]) {
+                                matched = false;
+                                break;
                             }
-                         }
+                        }
+                        if (matched) {
+                            return pos;
+                        }
                     }
-                    idx = getEntryNext(idx);
-                }
-                /* If not addSlash, or slash is already there, we are done
*/
-                if (!addSlash  || name.length == 0 || name[name.length -
1] == '/') {
-                     return -1;
                 }
-                /* Add slash and try once more */
-                name = Arrays.copyOf(name, name.length + 1);
-                name[name.length - 1] = '/';
-                hsh = hash_append(hsh, (byte)'/');
-                //idx = table[hsh % tablelen];
-                idx = table[(hsh & 0x7fffffff) % tablelen];
-                addSlash = false;
+                idx = getEntryNext(idx);
             }
+            return -1;
         }

         /**


More information about the core-libs-dev mailing list