RFR: 8347712: IllegalStateException on multithreaded ZipFile access with non-UTF8 charset [v2]
Eirik Bjørsnøs
eirbjo at openjdk.org
Wed Mar 12 18:26:00 UTC 2025
On Wed, 12 Mar 2025 06:42:43 GMT, Jaikiran Pai <jpai at openjdk.org> wrote:
>> src/java.base/share/classes/java/util/zip/ZipFile.java line 376:
>>
>>> 374: * @param pos the entry position
>>> 375: */
>>> 376: private static boolean useUTF8Coder(final byte[] cen, final int pos) {
>>
>> This seems mostly used to determine which `ZipCoder` to pick. Could we fold this into the method, calling it `zipCoderForPos` and make it just return the `ZipCoder`, like we had in `Source`?
>>
>> If it needs to be static, then the non-UTF8/fallback ZipCoder could be passed as a parameter?
>>
>> If a method to determine whether a CEN entry uses UTF-8 is still needed, then I think it would be more appropriately named `isUTF8Entry`.
>
>> This seems mostly used to determine which ZipCoder to pick. Could we fold this into the method, calling it zipCoderForPos and make it just return the ZipCoder, like we had in Source?
>
> I intentionally removed the `zipCoderForPos()` method and instead introduced this static method to avoid the case where the code ends up calling this method and then stores the returned `ZipCoder` (like was being done in `Source`). Instead with this new method, it now allows the call sites to determine if a UTF8 `ZipCoder` is needed or a non-UTF8 one and then decide where to get the non-UTF8 `ZipCoder` from. If the call site is a instance method in `ZipFile`, then it was use the `ZipFile`'s `zipCoder` field and if the call site is within `Source`, when the `Source` is being instantiated then it constructs a non-UTF8 `ZipCoder` afresh. That level of detail is better dealt at the call site instead of a method like `zipCoderForPos()`.
Hello Jaikiran,
I'm not convinced dropping `zipCoderForPos` is a step forward:
* `zipCoderForPos` included a short-circuit optimization logic which skipped reading the CENFLG field in the case where the opening charset was UTF-8 (which it commonly is, and always for JarFile). This optimization seems lost in the currrent state of this PR and could impact lookup performance. It would be strange to repeat this optimization at all call sites.
* The only thing differing between callsites seems to be which ZipCoder to use as a "fallback" when it's not UTF-8. This ZipCoder instance can easilly be passed in as a parameter, and will de-duplicate logic at the call sites.
* The only "cumbersome" call site seems to be `initCEN`, caused by the lazy initialization of the non-UTF-8 ZipCoder. But that lazy initialization seems questionable: First, ZipCoder is already lazy in the way it initializes encoders and decoders. An unused ZipCoder is just a thin wrapper around a Charset and should not incur significant cost until it gets used. Second, the `ZipFile` constructor actually constructs a `ZipCoder` instance, would it not be better to just pass this down to initCEN as a parameter? That would avoid the cost of initializing encoders and decoders twice for the common case of single-threaded `ZipFile` access, once for initCEN, then again on the first lookup.
Here's a patch for a quick implementation of the above on top of your PR. (Code speeks better than words some times :-)
This retains the CENFLG optimization, simplifies logic at call sites and prevents duplicated initialization of ZipCoder beteween initCEN and lookups:
Index: src/java.base/share/classes/java/util/zip/ZipFile.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/java.base/share/classes/java/util/zip/ZipFile.java b/src/java.base/share/classes/java/util/zip/ZipFile.java
--- a/src/java.base/share/classes/java/util/zip/ZipFile.java (revision 935b04ed00522aa92105292baa0693c55b1356ae)
+++ b/src/java.base/share/classes/java/util/zip/ZipFile.java (date 1741800197915)
@@ -201,8 +201,8 @@
this.fileName = file.getName();
long t0 = System.nanoTime();
- this.res = new CleanableResource(this, charset, file, mode);
this.zipCoder = ZipCoder.get(charset);
+ this.res = new CleanableResource(this, charset, zipCoder, file, mode);
PerfCounter.getZipFileOpenTime().addElapsedTimeFrom(t0);
PerfCounter.getZipFileCount().increment();
@@ -368,13 +368,19 @@
}
/**
- * {@return true if the ZIP entry corresponding to the given {@code pos} uses UTF-8
- * for entry name and comment field encoding, false otherwise}
+ * {@return a ZipCoder for decoding name and comment fields for the given CEN entry position}
* @param cen the CEN
* @param pos the entry position
+ * @param fallback the ZipCoder to return when the entry is not flagged as UTF-8
*/
- private static boolean useUTF8Coder(final byte[] cen, final int pos) {
- return (CENFLG(cen, pos) & USE_UTF8) != 0;
+ private static ZipCoder zipCoderFor(final byte[] cen, final int pos, ZipCoder fallback) {
+ if (fallback.isUTF8()) {
+ return ZipCoder.UTF8;
+ }
+ if ((CENFLG(cen, pos) & USE_UTF8) != 0) {
+ return ZipCoder.UTF8;
+ }
+ return fallback;
}
private static class InflaterCleanupAction implements Runnable {
@@ -575,7 +581,7 @@
private String getEntryName(int pos) {
byte[] cen = res.zsrc.cen;
int nlen = CENNAM(cen, pos);
- ZipCoder zc = useUTF8Coder(cen, pos) ? ZipCoder.UTF8 : zipCoder;
+ ZipCoder zc = zipCoderFor(cen, pos, zipCoder);
return zc.toString(cen, pos + CENHDR, nlen);
}
@@ -646,7 +652,7 @@
}
if (clen != 0) {
int start = pos + CENHDR + nlen + elen;
- ZipCoder zc = useUTF8Coder(cen, pos) ? ZipCoder.UTF8 : zipCoder;
+ ZipCoder zc = zipCoderFor(cen, pos, zipCoder);
e.comment = zc.toString(cen, start, clen);
}
lastEntryName = e.name;
@@ -678,11 +684,11 @@
Source zsrc;
- CleanableResource(ZipFile zf, Charset charset, File file, int mode) throws IOException {
+ CleanableResource(ZipFile zf, Charset charset, ZipCoder zipCoder, File file, int mode) throws IOException {
this.cleanable = CleanerFactory.cleaner().register(zf, this);
this.istreams = Collections.newSetFromMap(new WeakHashMap<>());
this.inflaterCache = new ArrayDeque<>();
- this.zsrc = Source.get(file, (mode & OPEN_DELETE) != 0, charset);
+ this.zsrc = Source.get(file, (mode & OPEN_DELETE) != 0, charset, zipCoder);
}
void clean() {
@@ -1472,7 +1478,7 @@
private static final java.nio.file.FileSystem builtInFS =
DefaultFileSystemProvider.theFileSystem();
- static Source get(File file, boolean toDelete, Charset charset) throws IOException {
+ static Source get(File file, boolean toDelete, Charset charset, ZipCoder zipCoder) throws IOException {
final Key key;
try {
key = new Key(file,
@@ -1489,7 +1495,7 @@
return src;
}
}
- src = new Source(key, toDelete);
+ src = new Source(key, toDelete, zipCoder);
synchronized (files) {
Source prev = files.putIfAbsent(key, src);
@@ -1511,7 +1517,7 @@
}
}
- private Source(Key key, boolean toDelete) throws IOException {
+ private Source(Key key, boolean toDelete, ZipCoder zipCoder) throws IOException {
this.key = key;
if (toDelete) {
if (OperatingSystem.isWindows()) {
@@ -1525,7 +1531,7 @@
this.zfile = new RandomAccessFile(key.file, "r");
}
try {
- initCEN(-1);
+ initCEN(-1, zipCoder);
byte[] buf = new byte[4];
readFullyAt(buf, 0, 4, 0);
this.startsWithLoc = (LOCSIG(buf) == LOCSIG);
@@ -1680,7 +1686,7 @@
}
// Reads ZIP file central directory.
- private void initCEN(int knownTotal) throws IOException {
+ private void initCEN(int knownTotal, ZipCoder zipCoder) throws IOException {
// Prefer locals for better performance during startup
byte[] cen;
if (knownTotal == -1) {
@@ -1736,31 +1742,19 @@
int idx = 0; // Index into the entries array
int pos = 0;
manifestNum = 0;
- ZipCoder zipCoder = null;
int limit = cen.length - CENHDR;
while (pos <= limit) {
if (idx >= entriesLength) {
// This will only happen if the ZIP file has an incorrect
// ENDTOT field, which usually means it contains more than
// 65535 entries.
- initCEN(countCENHeaders(cen));
+ initCEN(countCENHeaders(cen), zipCoder);
return;
}
int entryPos = pos + CENHDR;
- // the ZipCoder for any non-UTF8 entries
- final ZipCoder entryZipCoder;
- if (useUTF8Coder(cen, pos)) {
- // entry name explicitly wants UTF-8 Charset
- entryZipCoder = ZipCoder.UTF8;
- } else {
- // use the ZipCoder corresponding to the Charset that
- // was provided when constructing the ZipFile
- if (zipCoder == null) {
- zipCoder = ZipCoder.get(key.charset);
- }
- entryZipCoder = zipCoder;
- }
+ // the ZipCoder for this entry
+ final ZipCoder entryZipCoder = zipCoderFor(cen, pos, zipCoder);
// Checks the entry and adds values to entries[idx ... idx+2]
int nlen = checkAndAddEntry(pos, idx, entryZipCoder);
idx += 3;
@@ -1853,15 +1847,7 @@
int noff = pos + CENHDR;
int nlen = CENNAM(cen, pos);
- final ZipCoder zc;
- if (useUTF8Coder(cen, pos)) {
- // entry name explicitly wants UTF-8 Charset
- zc = ZipCoder.UTF8;
- } else {
- // use the ZipCoder corresponding to the Charset that
- // was provided when constructing the ZipFile
- zc = zipCoder;
- }
+ final ZipCoder zc = zipCoderFor(cen, pos, zipCoder);
// Compare the lookup name with the name encoded in the CEN
switch (zc.compare(name, cen, noff, nlen, addSlash)) {
case ZipCoder.EXACT_MATCH:
-------------
PR Review Comment: https://git.openjdk.org/jdk/pull/23986#discussion_r1992069024
More information about the core-libs-dev
mailing list