Enhancements to the java.lang.StringCoding class

Martin Buchholz martinrb at google.com
Sun Jul 2 23:05:03 UTC 2017


Very high level:

UTF-16 is not expected to be a popular encoding for text outside the JDK.
Everyone is supposed to be migrating to UTF-8 from ISO-8859-1 and other
legacy encodings.

The fact that people (like you and me) are writing specialized
encoders/decoders outside of the "real" charset implementations for better
performance suggests that the nio charset API could be rethought.
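
For context, the general-purpose path that such hand-rolled code bypasses
goes through CharsetDecoder and intermediate buffers, roughly like the
sketch below (an illustration of the public API shape only, not the actual
StringCoding internals; the class and method names are made up):

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;

class GenericDecodeSketch {
    // Decode bytes via the public nio API, substituting the default
    // replacement (U+FFFD) for malformed or unmappable input, which is
    // approximately what String construction does when no specialized
    // decoder applies.
    static char[] decode(Charset cs, byte[] ba, int off, int len)
            throws CharacterCodingException {
        CharsetDecoder cd = cs.newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
        CharBuffer cb = cd.decode(ByteBuffer.wrap(ba, off, len));
        char[] ca = new char[cb.remaining()];
        cb.get(ca);
        return ca;
    }
}

The per-call decoder and buffer allocations, plus the extra copy from the
CharBuffer into the String's internal representation, are the costs the
specialized decoders try to avoid.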

On Sun, Jul 2, 2017 at 12:22 PM, John Platts <john_platts at hotmail.com>
wrote:

> I was looking at the OpenJDK 9 code, and I noticed that optimizations for
> encoding and decoding from UTF-16 text could be added to the
> java.lang.StringCoding class.
>
> Here is how the optimized UTF-16 encoding and decoding could be implemented
> in java.lang.StringCoding:
> private static void byteSwapUTF16(byte[] arr, int start) {
>     for(int i = start; i < arr.length; i += 2) {
>         byte b1 = arr[i];
>         byte b2 = arr[i + 1];
>
>
>         arr[i] = b2;
>         arr[i + 1] = b1;
>     }
> }
>
> static byte[] encodeUTF16BE(byte coder, byte[] val, boolean includeBOM) {
>     byte[] result;
>
>     if(coder == LATIN1) {
>         result = new byte[(val.length + (includeBOM ? 1 : 0)) << 1];
>         int resultStartOffset = includeBOM ? 2 : 0;
>
>         if(includeBOM) {
>             result[0] = (byte)0xFE;
>             result[1] = (byte)0xFF;
>         }
>
>         for(int i = 0; i < val.length; i++) {
>             result[resultStartOffset + (i << 1) + 1] = val[i];
>         }
>     } else {
>         result = new byte[val.length + (includeBOM ? 2 : 0)];
>         int resultStartOffset = includeBOM ? 2 : 0;
>
>         if(includeBOM) {
>             result[0] = (byte)0xFE;
>             result[1] = (byte)0xFF;
>         }
>
>         System.arraycopy(val, 0, result, resultStartOffset, val.length);
>
>         if(StringUTF16.HI_BYTE_SHIFT == 0) {
>             // val is encoded using little-endian UTF-16
>             // Convert to big-endian UTF-16 from little-endian UTF-16
>             byteSwapUTF16(result, resultStartOffset);
>         }
>
>         for(int i = resultStartOffset; i < result.length; i += 2) {
>             int b1 = Byte.toUnsignedInt(result[i]);
>             int b3 = result.length - i >= 4 ? Byte.toUnsignedInt(result[i + 2]) : -1;
>             if(b1 >= 0xD8 && b1 <= 0xDF) {
>                 if(b1 <= 0xDB && b3 >= 0xDC && b3 <= 0xDF) {
>                     // UTF-16 surrogate pair encountered
>
>                     // Advance i to the position of the low surrogate
>                     i += 2;
>
>                     // Continue the loop past the low surrogate
>                     continue;
>                 }
>
>                 // Unpaired surrogate character encountered
>                 // Replace unpaired surrogate character with U+FFFD
>                 result[i] = (byte)0xFF;
>                 result[i + 1] = (byte)0xFD;
>             }
>         }
>     }
>
>     return result;
> }
>
> static byte[] encodeUTF16LE(byte coder, byte[] val) {
>     byte[] result;
>
>     if(coder == LATIN1) {
>         result = new byte[val.length << 1];
>
>         for(int i = 0; i < val.length; i++) {
>             result[i << 1] = val[i];
>         }
>     } else {
>         result = val.clone();
>
>         if(StringUTF16.LO_BYTE_SHIFT == 0) {
>             // val is encoded using big-endian UTF-16
>
>             // Convert result to little-endian UTF-16 from big-endian UTF-16 by byte swapping
>             byteSwapUTF16(result, 0);
>         }
>
>         for(int i = 0; i < result.length; i += 2) {
>             int b2 = Byte.toUnsignedInt(result[i + 1]);
>             int b4 = result.length - i >= 4 ? Byte.toUnsignedInt(result[i + 3]) : -1;
>             if(b2 >= 0xD8 && b2 <= 0xDF) {
>                 if(b2 <= 0xDB && b4 >= 0xDC && b4 <= 0xDF) {
>                     // UTF-16 surrogate pair encountered
>
>                     // Advance i to the position of the low surrogate
>                     i += 2;
>
>                     // Continue the loop past the low surrogate
>                     continue;
>                 }
>
>                 // Unpaired surrogate character encountered
>                 // Replace unpaired surrogate character with U+FFFD
>                 result[i] = (byte)0xFD;
>                 result[i + 1] = (byte)0xFF;
>             }
>         }
>     }
>
>     return result;
> }
>
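> For illustration, here is a tiny throwaway harness (the class name and the
> test string are mine, not part of the patch) that prints what the existing
> encoders produce through the public API. Any fast path in StringCoding has
> to reproduce these bytes exactly, including the big-endian BOM that the
> UTF-16 charset writes on encoding (per the java.nio.charset.Charset docs)
> and the replacement of the unpaired surrogate:
>
> import java.nio.charset.StandardCharsets;
>
> public class EncodeBehaviorCheck {
>     static String hex(byte[] b) {
>         StringBuilder sb = new StringBuilder();
>         for (byte x : b) sb.append(String.format("%02X ", x));
>         return sb.toString().trim();
>     }
>
>     public static void main(String[] args) {
>         // 'A', an unpaired high surrogate, then 'B'
>         String s = "A\uD800B";
>         System.out.println("UTF-16:   " + hex(s.getBytes(StandardCharsets.UTF_16)));
>         System.out.println("UTF-16BE: " + hex(s.getBytes(StandardCharsets.UTF_16BE)));
>         System.out.println("UTF-16LE: " + hex(s.getBytes(StandardCharsets.UTF_16LE)));
>     }
> }
>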
> static Result bomDetectDecodeUTF16(byte[] ba, int off, int len) {
>     boolean bigEndian = true;
>
>     if(len >= 2) {
>         int b1 = Byte.toUnsignedInt(ba[off]);
>         int b2 = Byte.toUnsignedInt(ba[off + 1]);
>         if(b1 == 0xFE && b2 == 0xFF) {
>             // Big-endian BOM detected
>             off += 2;
>             len -= 2;
>         } else if(b1 == 0xFF && b2 == 0xFE) {
>             // Little-endian BOM detected
>             off += 2;
>             len -= 2;
>             bigEndian = false;
>         }
>     }
>
>     return decodeUTF16(ba, off, len, bigEndian);
> }
>
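> As a cross-check on the BOM handling above, the java.nio.charset.Charset
> docs specify that UTF-16 consumes a leading BOM and picks the byte order,
> defaulting to big-endian when no BOM is present, while UTF-16BE/UTF-16LE
> leave a leading U+FEFF in the result. A quick throwaway example through the
> public String constructor (class name and inputs are mine):
>
> import java.nio.charset.StandardCharsets;
>
> public class BomDecodeCheck {
>     public static void main(String[] args) {
>         byte[] be = { (byte) 0xFE, (byte) 0xFF, 0x00, 0x41 }; // BOM + 'A', big-endian
>         byte[] le = { (byte) 0xFF, (byte) 0xFE, 0x41, 0x00 }; // BOM + 'A', little-endian
>
>         // Print the code points so the presence or absence of U+FEFF is visible.
>         show(new String(be, StandardCharsets.UTF_16));
>         show(new String(le, StandardCharsets.UTF_16));
>         show(new String(be, StandardCharsets.UTF_16BE));
>         show(new String(le, StandardCharsets.UTF_16LE));
>     }
>
>     static void show(String s) {
>         s.codePoints().forEach(cp -> System.out.printf("U+%04X ", cp));
>         System.out.println();
>     }
> }
>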
>
> static Result decodeUTF16(byte[] ba, int off, int len, boolean bigEndian) {
>     Result result = new Result();
>
>     if(len == 0) {
>         return result.with();
>     }
>
>     byte[] decodedArr;
>     if(COMPACT_STRINGS && (len & 1) == 0) {
>         // Check for non-Latin1 characters
>         boolean containsNonLatin1 = false;
>         for(int i = 0; i < len; i += 2) {
>             if(ba[off + i + (bigEndian ? 0 : 1)] != 0) {
>                 containsNonLatin1 = true;
>                 break;
>             }
>         }
>
>         // If the input only contains Latin1 characters, copy the source characters
>         // to a Latin1-encoded byte array, and return the decoded text.
>         if(!containsNonLatin1) {
>             decodedArr = new byte[len >> 1];
>
>             for(int i = 0; i < decodedArr.length; i++) {
>                 decodedArr[i] = ba[off + (i << 1) + (bigEndian ? 1 : 0)];
>             }
>
>             return result.with(decodedArr, LATIN1);
>         }
>     }
>
>     decodedArr = new byte[len + (len & 1)];
>     System.arraycopy(ba, off, decodedArr, 0, len);
>
>     if(StringUTF16.HI_BYTE_SHIFT != (bigEndian ? 8 : 0)) {
>         // Input byte order does not match system byte order
>
>         // Byte swap decodedArr so that decodedArr is in system byte order
>         byteSwapUTF16(decodedArr, 0);
>     }
>
>     // decodedArr is now in system byte order
>
>     if((len & 1) != 0) {
>         // If len is odd, then there is a malformed character at the end.
>
>         // Replace the last character in decodedArr with U+FFFD if this is the case.
>         StringUTF16.putChar(decodedArr, (decodedArr.length >> 1) - 1, 0xFFFD);
>
>         // Decrement len by 1 to make len even.
>         len--;
>     }
>
>     // len is now even
>
>     // charLen is equal to the number of UTF-16 characters in decodedArr
>     int charLen = len >> 1;
>
>     // replace the reversed BOM and unpaired surrogates with U+FFFD
>     for(int i = 0; i < charLen; i++) {
>         char ch = StringUTF16.getChar(decodedArr, i);
>
>         if(charLen - i >= 2 &&
>             Character.isSurrogatePair(ch, StringUTF16.getChar(decodedArr, i + 1))) {
>             // Surrogate pair detected
>
>             // Increment i to the position of the low surrogate
>             i++;
>
>             // Continue the loop
>             continue;
>         }
>
>         if(ch == (char)0xFFFE || Character.isSurrogate(ch)) {
>             // Reversed BOM or unpaired surrogate encountered
>
>             // Replace ch with 0xFFFD
>             StringUTF16.putChar(decodedArr, i, (char)0xFFFD);
>         }
>     }
>
>     // If compact strings are enabled, return a Latin1-encoded result if the result
>     // does not contain any non-Latin-1 characters.
>     if(COMPACT_STRINGS) {
>         // Pass the char count (half the byte length) to compress
>         byte[] compressedArr = StringUTF16.compress(decodedArr, 0, decodedArr.length >> 1);
>         if(compressedArr != null) {
>             return result.with(compressedArr, LATIN1);
>         }
>     }
>
>     return result.with(decodedArr, UTF16);
> }
>
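> One behavioral detail worth verifying against decodeUTF16 above: the
> existing decoders, run with the REPLACE action that new String(byte[],
> Charset) always uses, substitute U+FFFD for malformed input, and the number
> of input bytes consumed by each replacement is decoder-specific. A small
> hypothetical check (class name and inputs are mine):
>
> import java.nio.charset.StandardCharsets;
>
> public class MalformedUtf16Check {
>     public static void main(String[] args) {
>         byte[] odd  = { 0x00, 0x41, 0x00 };               // "A" plus a dangling trailing byte
>         byte[] lone = { (byte) 0xD8, 0x00, 0x00, 0x42 };  // lone high surrogate, then "B"
>
>         // Print the resulting code points; compare against what the
>         // proposed fast path produces for the same inputs.
>         show(new String(odd, StandardCharsets.UTF_16BE));
>         show(new String(lone, StandardCharsets.UTF_16BE));
>     }
>
>     static void show(String s) {
>         s.codePoints().forEach(cp -> System.out.printf("U+%04X ", cp));
>         System.out.println();
>     }
> }
>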
> private static class StringDecoderUTF_16 extends StringDecoder {
>         StringDecoderUTF_16(Charset cs, String rcn) {
>             super(cs, rcn);
>         }
>         Result decode(byte[] ba, int off, int len) {
>             return bomDetectDecodeUTF16(ba, off, len);
>         }
> }
>
> private static class StringDecoderUTF_16LE extends StringDecoder {
>         StringDecoderUTF_16LE(Charset cs, String rcn) {
>             super(cs, rcn);
>         }
>         Result decode(byte[] ba, int off, int len) {
>             return decodeUTF16(ba, off, len, false);
>         }
> }
>
> private static class StringDecoderUTF_16BE extends StringDecoder {
>         StringDecoderUTF_16BE(Charset cs, String rcn) {
>             super(cs, rcn);
>         }
>         Result decode(byte[] ba, int off, int len) {
>             return decodeUTF16(ba, off, len, true);
>         }
> }
>
> static Result decode(String charsetName, byte[] ba, int off, int len)
>         throws UnsupportedEncodingException
>     {
>         StringDecoder sd = deref(decoder);
>         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
>         if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
>                               || csn.equals(sd.charsetName()))) {
>             sd = null;
>             try {
>                 Charset cs = lookupCharset(csn);
>                 if (cs != null) {
>                     if (cs == UTF_8) {
>                         sd = new StringDecoderUTF8(cs, csn);
>                     } else if (cs == ISO_8859_1) {
>                         sd = new StringDecoder8859_1(cs, csn);
>                     } else if(cs == StandardCharsets.UTF_16) {
>                         sd = new StringDecoderUTF_16(cs, csn);
>                     } else if(cs == StandardCharsets.UTF_16LE) {
>                         sd = new StringDecoderUTF_16LE(cs, csn);
>                     } else if(cs == StandardCharsets.UTF_16BE) {
>                         sd = new StringDecoderUTF_16BE(cs, csn);
>                     } else {
>                         sd = new StringDecoder(cs, csn);
>                     }
>                 }
>             } catch (IllegalCharsetNameException x) {}
>             if (sd == null)
>                 throw new UnsupportedEncodingException(csn);
>             set(decoder, sd);
>         }
>         return sd.decode(ba, off, len);
>     }
>
> static byte[] encode(Charset cs, byte coder, byte[] val) {
>         if (cs == UTF_8) {
>             return encodeUTF8(coder, val);
>         } else if (cs == ISO_8859_1) {
>             return encode8859_1(coder, val);
>         } else if (cs == US_ASCII) {
>             return encodeASCII(coder, val);
>         } else if (cs == StandardCharsets.UTF_16 || cs == StandardCharsets.UTF_16BE) {
>             return encodeUTF16BE(coder, val, cs == StandardCharsets.UTF_16);
>         } else if (cs == StandardCharsets.UTF_16LE) {
>             return encodeUTF16LE(coder, val);
>         }
>         CharsetEncoder ce = cs.newEncoder();
>         // fastpath for ascii compatible
>         if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
>                                  ((ArrayEncoder)ce).isASCIICompatible() &&
>                                  !hasNegatives(val, 0, val.length)))) {
>             return Arrays.copyOf(val, val.length);
>         }
>         int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
>         int en = scale(len, ce.maxBytesPerChar());
>         byte[] ba = new byte[en];
>         if (len == 0) {
>             return ba;
>         }
>         boolean isTrusted = System.getSecurityManager() == null ||
>                             cs.getClass().getClassLoader0() == null;
>         ce.onMalformedInput(CodingErrorAction.REPLACE)
>           .onUnmappableCharacter(CodingErrorAction.REPLACE)
>           .reset();
>         if (ce instanceof ArrayEncoder) {
>             if (!isTrusted) {
>                 val = Arrays.copyOf(val, val.length);
>             }
>             int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
>                                           : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
>             if (blen != -1) {
>                 return safeTrim(ba, blen, isTrusted);
>             }
>         }
>         char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
>                                        : StringUTF16.toChars(val);
>         ByteBuffer bb = ByteBuffer.wrap(ba);
>         CharBuffer cb = CharBuffer.wrap(ca, 0, len);
>         try {
>             CoderResult cr = ce.encode(cb, bb, true);
>             if (!cr.isUnderflow())
>                 cr.throwException();
>             cr = ce.flush(bb);
>             if (!cr.isUnderflow())
>                 cr.throwException();
>         } catch (CharacterCodingException x) {
>             throw new Error(x);
>         }
>         return safeTrim(ba, bb.position(), isTrusted);
> }
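>
> Since the motivation for these additions is performance, a JMH comparison of
> the UTF-16 charsets with and without the fast paths would help make the
> case. A rough sketch (benchmark names, sizes and parameters are mine):
>
> import java.nio.charset.StandardCharsets;
> import java.util.concurrent.TimeUnit;
> import org.openjdk.jmh.annotations.*;
>
> @BenchmarkMode(Mode.AverageTime)
> @OutputTimeUnit(TimeUnit.NANOSECONDS)
> @State(Scope.Benchmark)
> public class Utf16CodingBench {
>     @Param({"16", "1024"})
>     int size;
>
>     byte[] utf16beBytes;
>     String asciiString;
>
>     @Setup
>     public void setup() {
>         StringBuilder sb = new StringBuilder();
>         for (int i = 0; i < size; i++) sb.append((char) ('a' + (i % 26)));
>         asciiString = sb.toString();
>         utf16beBytes = asciiString.getBytes(StandardCharsets.UTF_16BE);
>     }
>
>     @Benchmark
>     public String decodeUTF16BE() {
>         return new String(utf16beBytes, StandardCharsets.UTF_16BE);
>     }
>
>     @Benchmark
>     public byte[] encodeUTF16() {
>         return asciiString.getBytes(StandardCharsets.UTF_16);
>     }
> }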

