Enhancements to the java.lang.StringCoding class

Sun Jul 2 19:22:49 UTC 2017

While looking at the OpenJDK 9 code, I noticed that fast paths for encoding to and decoding from UTF-16 could be added to the java.lang.StringCoding class.

Here is how the optimized UTF-16 encoding and decoding could be implemented in java.lang.StringCoding:
private static void byteSwapUTF16(byte[] arr, int start) {
    for(int i = start; i < arr.length; i += 2) {
        byte b1 = arr[i];
        byte b2 = arr[i + 1];

        arr[i] = b2;
        arr[i + 1] = b1;
    }
}

static byte[] encodeUTF16BE(byte coder, byte[] val, boolean includeBOM) {
    byte[] result;

    if(coder == LATIN1) {
        result = new byte[(val.length + (includeBOM ? 1 : 0)) << 1];
        int resultStartOffset = includeBOM ? 2 : 0;

        if(includeBOM) {
            result[0] = (byte)0xFE;
            result[1] = (byte)0xFF;
        }

        for(int i = 0; i < val.length; i++) {
            result[resultStartOffset + (i << 1) + 1] = val[i];
        }
    } else {
        result = new byte[val.length + (includeBOM ? 2 : 0)];
        int resultStartOffset = includeBOM ? 2 : 0;

        if(includeBOM) {
            result[0] = (byte)0xFE;
            result[1] = (byte)0xFF;
        }

        System.arraycopy(val, 0, result, resultStartOffset, val.length);

        if(StringUTF16.HI_BYTE_SHIFT == 0) {
            // val is encoded using little-endian UTF-16
            // Convert to big-endian UTF-16 from little-endian UTF-16
            byteSwapUTF16(result, resultStartOffset);
        }

        for(int i = resultStartOffset; i < result.length; i += 2) {
            int b1 = Byte.toUnsignedInt(result[i]);
            int b3 = result.length - i >= 4 ? Byte.toUnsignedInt(result[i + 2]) : -1;
            if(b1 >= 0xD8 && b1 <= 0xDF) {
                if(b1 <= 0xDB && b3 >= 0xDC && b3 <= 0xDF) {
                    // UTF-16 surrogate pair encountered

                    // Advance i to the position of the low surrogate
                    i += 2;

                    // Continue the loop past the low surrogate
                    continue;
                }

                // Unpaired surrogate character encountered
                // Replace unpaired surrogate character with U+FFFD
                result[i] = (byte)0xFF;
                result[i + 1] = (byte)0xFD;
            }
        }
    }

    return result;
}

static byte[] encodeUTF16LE(byte coder, byte[] val) {
    byte[] result;

    if(coder == LATIN1) {
        result = new byte[val.length << 1];

        for(int i = 0; i < val.length; i++) {
            result[i << 1] = val[i];
        }
    } else {
        result = val.clone();

        if(StringUTF16.LO_BYTE_SHIFT == 0) {
            // val is encoded using big-endian UTF-16

            // Convert result to little-endian UTF-16 from big-endian UTF-16 by byte swapping
            byteSwapUTF16(result, 0);
        }

        for(int i = 0; i < result.length; i += 2) {
            int b2 = Byte.toUnsignedInt(result[i + 1]);
            int b4 = result.length - i >= 4 ? Byte.toUnsignedInt(result[i + 3]) : -1;
            if(b2 >= 0xD8 && b2 <= 0xDF) {
                if(b2 <= 0xDB && b4 >= 0xDC && b4 <= 0xDF) {
                    // UTF-16 surrogate pair encountered

                    // Advance i to the position of the low surrogate
                    i += 2;

                    // Continue the loop past the low surrogate
                    continue;
                }

                // Unpaired surrogate character encountered
                // Replace unpaired surrogate character with U+FFFD
                result[i] = (byte)0xFD;
                result[i + 1] = (byte)0xFF;
            }
        }
    }

    return result;
}

static Result bomDetectDecodeUTF16(byte[] ba, int off, int len) {
    boolean bigEndian = true;

    if(len >= 2) {
        int b1 = Byte.toUnsignedInt(ba[off]);
        int b2 = Byte.toUnsignedInt(ba[off + 1]);
        if(b1 == 0xFE && b2 == 0xFF) {
            // Big-endian BOM detected
            off += 2;
            len -= 2;
        } else if(b1 == 0xFF && b2 == 0xFE) {
            // Little-endian BOM detected
            off += 2;
            len -= 2;
            bigEndian = false;
        }
    }

    return decodeUTF16(ba, off, len, bigEndian);
}

static Result decodeUTF16(byte[] ba, int off, int len, boolean bigEndian) {
    Result result = new Result();

    if(len == 0) {
        return result.with();
    }

    byte[] decodedArr;    
    if(COMPACT_STRINGS && (len & 1) == 0) {
        // Check for non-Latin1 characters
        boolean containsNonLatin1 = false;
        for(int i = 0; i < len; i += 2) {
            if(ba[off + i + (bigEndian ? 0 : 1)] != 0) {
                containsNonLatin1 = true;
                break;
            }
        }

        // If the input only contains Latin1 characters, copy the source characters
        // to a Latin1-encoded byte array, and return the decoded text.
        if(!containsNonLatin1) {
            decodedArr = new byte[len >> 1];

            for(int i = 0; i < decodedArr.length; i++) {
                decodedArr[i] = ba[off + (i << 1) + (bigEndian ? 1 : 0)];
            }

            return result.with(decodedArr, LATIN1);
        }
    }

    decodedArr = new byte[len + (len & 1)];
    System.arraycopy(ba, off, decodedArr, 0, len);

    if(StringUTF16.HI_BYTE_SHIFT != (bigEndian ? 8 : 0)) {
        // Input byte order does not match system byte order

        // Byte swap decodedArr so that decodedArr is in system byte order
        byteSwapUTF16(decodedArr, 0);
    }

    // decodedArr is now in system byte order

    if((len & 1) != 0) {
        // If len is odd, then there is a malformed character at the end.

        // Replace the last character in decodedArr with U+FFFD if this is the case.
        StringUTF16.putChar(decodedArr, (decodedArr.length >> 1) - 1, 0xFFFD);

        // Decrement len by 1 to make len even.
        len--;
    }

    // len is now even

    // charLen is equal to the number of UTF-16 characters in decodedArr
    int charLen = len >> 1;

    // replace the reversed BOM and unpaired surrogates with U+FFFD
    for(int i = 0; i < charLen; i++) {
        char ch = StringUTF16.getChar(decodedArr, i);

        if(charLen - i >= 2 &&
            Character.isSurrogatePair(ch, StringUTF16.getChar(decodedArr, i + 1)) {
            // Surrogate pair detected

            // Increment i to the position of the low surrogate
            i++;

            // Continue the loop
            continue;
        }

        if(ch == (char)0xFFFE || Character.isSurrogate(ch)) {
            // Reversed BOM or unpaired surrogate encountered

            // Replace ch with 0xFFFD
            StringUTF16.putChar(decodedArr, i, (char)0xFFFD);
        }
    }

    // If compact strings are enabled, return a Latin1-encoded result if the result
    // does not contain any non-Latin-1 characters.
    if(COMPACT_STRINGS) {
        byte[] compressedArr = StringUTF16.compress(decodedArr, 0, decodedArr.len);
        if(compressedArr != null) {
            return result.with(compressedArr, LATIN1);
        }
    }

    return result.with(decodedArr, UTF16);
}

/**
 * {@code StringDecoder} for the UTF-16 charset: delegates to
 * {@code bomDetectDecodeUTF16}, which picks the byte order from an optional
 * leading byte order mark.
 */
private static class StringDecoderUTF_16 extends StringDecoder {
    StringDecoderUTF_16(Charset charset, String requestedName) {
        super(charset, requestedName);
    }

    Result decode(byte[] bytes, int offset, int length) {
        return bomDetectDecodeUTF16(bytes, offset, length);
    }
}

private static class StringDecoderUTF_16LE extends StringDecoder {
        StringDecoderUTF_16(Charset cs, String rcn) {
            super(cs, rcn);
        }
        Result decode(byte[] ba, int off, int len) {
            return decodeUTF16(ba, off, len, false);
        }
}

private static class StringDecoderUTF_16BE extends StringDecoder {
        StringDecoderUTF_16(Charset cs, String rcn) {
            super(cs, rcn);
        }
        Result decode(byte[] ba, int off, int len) {
            return decodeUTF16(ba, off, len, true);
        }
}

static Result decode(String charsetName, byte[] ba, int off, int len)
        throws UnsupportedEncodingException
    {
        StringDecoder sd = deref(decoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
                              || csn.equals(sd.charsetName()))) {
            sd = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null) {
                    if (cs == UTF_8) {
                        sd = new StringDecoderUTF8(cs, csn);
                    } else if (cs == ISO_8859_1) {
                        sd = new StringDecoder8859_1(cs, csn);
                    } else if(cs == StandardCharsets.UTF_16) {
                        sd = new StringDecoderUTF_16(cs, csn);
                    } else if(cs == StandardCharsets.UTF_16LE) {
                        sd = new StringDecoderUTF_16LE(cs, csn);
                    } else if(cs == StandardCharsets.UTF_16BE) {
                        sd = new StringDecoderUTF_16BE(cs, csn);
                    } else {
                        sd = new StringDecoder(cs, csn);
                    }
                }
            } catch (IllegalCharsetNameException x) {}
            if (sd == null)
                throw new UnsupportedEncodingException(csn);
            set(decoder, sd);
        }
        return sd.decode(ba, off, len);
    }
}

/**
 * Encodes a string's backing bytes into the given charset.
 *
 * <p>UTF-8, ISO-8859-1, US-ASCII and the three UTF-16 variants are handled by
 * dedicated methods. Any other charset goes through its
 * {@code CharsetEncoder}, with malformed input and unmappable characters
 * replaced rather than reported.
 *
 * @param cs    the target charset
 * @param coder the coder of {@code val}
 * @param val   the string's backing byte array
 * @return the encoded bytes
 */
static byte[] encode(Charset cs, byte coder, byte[] val) {
    if (cs == UTF_8) {
        return encodeUTF8(coder, val);
    } else if (cs == ISO_8859_1) {
        return encode8859_1(coder, val);
    } else if (cs == US_ASCII) {
        return encodeASCII(coder, val);
    } else if (cs == StandardCharsets.UTF_16 || cs == StandardCharsets.UTF_16BE) {
        // Request the byte order mark only for non-empty input. The generic
        // path below returns an empty array when len == 0, so an empty string
        // must keep encoding to zero bytes rather than to a lone mark.
        boolean includeBOM = cs == StandardCharsets.UTF_16 && val.length != 0;
        return encodeUTF16BE(coder, val, includeBOM);
    } else if (cs == StandardCharsets.UTF_16LE) {
        return encodeUTF16LE(coder, val);
    }
    CharsetEncoder ce = cs.newEncoder();
    // fastpath for ascii compatible
    if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
                             ((ArrayEncoder)ce).isASCIICompatible() &&
                             !hasNegatives(val, 0, val.length)))) {
        return Arrays.copyOf(val, val.length);
    }
    int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
    int en = scale(len, ce.maxBytesPerChar());
    byte[] ba = new byte[en];
    if (len == 0) {
        return ba;
    }
    boolean isTrusted = System.getSecurityManager() == null ||
                        cs.getClass().getClassLoader0() == null;
    ce.onMalformedInput(CodingErrorAction.REPLACE)
      .onUnmappableCharacter(CodingErrorAction.REPLACE)
      .reset();
    if (ce instanceof ArrayEncoder) {
        if (!isTrusted) {
            // Hand an untrusted encoder a copy rather than the string's own array.
            val = Arrays.copyOf(val, val.length);
        }
        int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
                                      : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
        if (blen != -1) {
            return safeTrim(ba, blen, isTrusted);
        }
    }
    // General path: go through char[] and the buffer-based encoder API.
    char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
                                   : StringUTF16.toChars(val);
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca, 0, len);
    try {
        CoderResult cr = ce.encode(cb, bb, true);
        if (!cr.isUnderflow())
            cr.throwException();
        cr = ce.flush(bb);
        if (!cr.isUnderflow())
            cr.throwException();
    } catch (CharacterCodingException x) {
        throw new Error(x);
    }
    return safeTrim(ba, bb.position(), isTrusted);
}