RFR: 8322256: Define and document GZIPInputStream concatenated stream semantics [v5]

Mon Jul 15 13:14:56 UTC 2024

On Mon, 15 Jul 2024 10:06:20 GMT, Jaikiran Pai <jpai at openjdk.org> wrote:

>> Archie Cobbs has updated the pull request with a new target base due to a merge or a rebase. The pull request now contains 12 commits:
>> 
>>  - Bump @since from 23 to 24.
>>  - Merge branch 'master' into JDK-8322256
>>  - Relabel "trailing garbage" as "extra bytes" to sound less accusatory.
>>  - Simplify code by eliminating an impossible case.
>>  - Field name change and Javadoc wording tweaks.
>>  - Merge branch 'master' into JDK-8322256
>>  - Javadoc wording tweaks.
>>  - Merge branch 'master' into JDK-8322256
>>  - Clarify exceptions: sometimes ZipException, sometimes EOFException.
>>  - Merge branch 'master' into JDK-8322256
>>  - ... and 2 more: https://git.openjdk.org/jdk/compare/75dc2f85...f845a75b
>
> src/java.base/share/classes/java/util/zip/GZIPInputStream.java line 153:
> 
>> 151:      */
>> 152:     public GZIPInputStream(InputStream in, int size,
>> 153:             boolean allowConcatenation, boolean ignoreExtraBytes) throws IOException {
> 
> I haven't reviewed the javadoc changes. I will do that separately once we settle down on the API and implementation changes.
> As for this new constructor, I think the only new parameter we should introduce is whether or not the underlying input stream `in` is expected/allowed to have concatenated GZIP stream(s). So I think we should remove the "ignoreExtraBytes" new parameters from this constructor and. Additionally, I think the `allowConcatenation` should instead be named `allowConcatenatedGZIPStream`:
> 
> 
> public GZIPInputStream(InputStream in, int size, boolean allowConcatenatedGZIPStream) throws IOException {

Just to provide a more concrete input, here's a very minimal tested version of what I had in mind:

--- a/src/java.base/share/classes/java/util/zip/GZIPInputStream.java
+++ b/src/java.base/share/classes/java/util/zip/GZIPInputStream.java
@@ -55,6 +55,8 @@ public class GZIPInputStream extends InflaterInputStream {
 
     private boolean closed = false;
 
+    private final boolean allowConcatenatedGZIPStream;
+
     /**
      * Check to make sure that this stream has not been closed
      */
@@ -76,14 +78,7 @@ private void ensureOpen() throws IOException {
      * @throws    IllegalArgumentException if {@code size <= 0}
      */
     public GZIPInputStream(InputStream in, int size) throws IOException {
-        super(in, createInflater(in, size), size);
-        usesDefaultInflater = true;
-        try {
-            readHeader(in);
-        } catch (IOException ioe) {
-            this.inf.end();
-            throw ioe;
-        }
+        this(in, size, true);
     }
 
     /*
@@ -111,7 +106,28 @@ private static Inflater createInflater(InputStream in, int size) {
      * @throws    IOException if an I/O error has occurred
      */
     public GZIPInputStream(InputStream in) throws IOException {
-        this(in, 512);
+        this(in, 512, true);
+    }
+
+    /**
+     * WIP
+     * @param in the input stream
+     * @param size the input buffer size
+     * @param allowConcatenatedGZIPStream true if the input stream is allowed to contain
+     *                                    concatenated GZIP streams. false otherwise
+     * @throws IOException if an I/O error has occurred
+     */
+    public GZIPInputStream(InputStream in, int size, boolean allowConcatenatedGZIPStream)
+            throws IOException {
+        super(in, createInflater(in, size), size);
+        this.allowConcatenatedGZIPStream = allowConcatenatedGZIPStream;
+        usesDefaultInflater = true;
+        try {
+            readHeader(in);
+        } catch (IOException ioe) {
+            this.inf.end();
+            throw ioe;
+        }
     }
 
     /**
@@ -150,10 +166,45 @@ public int read(byte[] buf, int off, int len) throws IOException {
         }
         int n = super.read(buf, off, len);
         if (n == -1) {
-            if (readTrailer())
+            // reading of deflated data is now complete, we now read the GZIP stream trailer
+            readTrailer();
+            if (!allowConcatenatedGZIPStream) {
                 eos = true;
-            else
+            } else {
+                // This GZIPInputStream instance was created to allow potential
+                // concatenated GZIP stream, so we now try and read the next
+                // GZIP stream header, if any.
+
+                // use any un-inflated remaining bytes from the stream
+                InputStream headerIS = in;
+                int remainingUnInflated = inf.getRemaining();
+                if (remainingUnInflated > 0) {
+                    headerIS = new SequenceInputStream(
+                            new ByteArrayInputStream(this.buf, this.len - remainingUnInflated,
+                                    remainingUnInflated),
+                            new FilterInputStream(in) {
+                                public void close() throws IOException {}
+                            });
+                }
+                int numHeaderBytes;
+                try {
+                    numHeaderBytes = readHeader(headerIS); // next GZIP stream header, if any
+                } catch (EOFException _) {
+                    // TODO: somehow verify that this failed when reading the
+                    //  first byte of the header?
+                    eos = true;
+                    return n; // -1
+                }
+                // reset the inflater to read the deflated content of the next GZIP stream
+                inf.reset();
+                // adjust the input to the inflater correctly past the GZIP stream header
+                if (remainingUnInflated > numHeaderBytes) {
+                    inf.setInput(this.buf, this.len - remainingUnInflated + numHeaderBytes,
+                            remainingUnInflated - numHeaderBytes);
+                }
+                // read the next GZIP stream's deflated data
                 return this.read(buf, off, len);
+            }
         } else {
             crc.update(buf, off, n);
         }
@@ -242,12 +293,12 @@ private int readHeader(InputStream this_in) throws IOException {
      * reached, false if there are more (concatenated gzip
      * data set)
      */
-    private boolean readTrailer() throws IOException {
+    private void readTrailer() throws IOException {
         InputStream in = this.in;
-        int n = inf.getRemaining();
-        if (n > 0) {
+        int remainingUnInflated = inf.getRemaining();
+        if (remainingUnInflated > 0) {
             in = new SequenceInputStream(
-                        new ByteArrayInputStream(buf, len - n, n),
+                        new ByteArrayInputStream(buf, len - remainingUnInflated, remainingUnInflated),
                         new FilterInputStream(in) {
                             public void close() throws IOException {}
                         });
@@ -255,20 +306,15 @@ public void close() throws IOException {}
         // Uses left-to-right evaluation order
         if ((readUInt(in) != crc.getValue()) ||
             // rfc1952; ISIZE is the input size modulo 2^32
-            (readUInt(in) != (inf.getBytesWritten() & 0xffffffffL)))
+            (readUInt(in) != (inf.getBytesWritten() & 0xffffffffL))) {
             throw new ZipException("Corrupt GZIP trailer");
-
-        // try concatenated case
-        int m = 8;                  // this.trailer
-        try {
-            m += readHeader(in);    // next.header
-        } catch (IOException ze) {
-            return true;  // ignore any malformed, do nothing
         }
+        int numTrailerBytes = 8;
         inf.reset();
-        if (n > m)
-            inf.setInput(buf, len - n + m, n - m);
-        return false;
+        if (remainingUnInflated > numTrailerBytes) {
+            inf.setInput(buf, len - remainingUnInflated + numTrailerBytes,
+                    remainingUnInflated - numTrailerBytes);
+        }
     }
 
     /*

It still has one or two unanswered questions that I haven't fully thought of, but I thought this diff might be useful while discussing these changes.

-------------

PR Review Comment: https://git.openjdk.org/jdk/pull/18385#discussion_r1677807848