Enhancements to the java.lang.StringCoding class
Martin Buchholz
martinrb at google.com
Sun Jul 2 23:05:03 UTC 2017
Very high level:
UTF-16 is not expected to be a popular encoding for text outside the JDK.
Everyone is supposed to be migrating to UTF-8 from ISO-8859-1 and other
legacy encodings.
The fact that people (like you and me) are writing specialized
encoders/decoders outside of the "real" charset implementations for better
performance suggests that the nio charset API could be rethought.
On Sun, Jul 2, 2017 at 12:22 PM, John Platts <john_platts at hotmail.com>
wrote:
> I was looking at the OpenJDK 9 code, and I noticed that optimizations for
> encoding to and decoding from UTF-16 text could be added to the
> java.lang.StringCoding class.
>
> Here is how the optimized UTF-16 encoding and decoding could be implemented in
> java.lang.StringCoding:
> /**
>  * Swaps the two bytes of every 16-bit unit of {@code arr} in place,
>  * starting at index {@code start} and continuing to the end of the array.
>  * A single trailing byte that does not form a complete two-byte unit is
>  * left untouched.
>  *
>  * @param arr   the byte array to modify in place
>  * @param start the index of the first byte of the first unit to swap
>  */
> private static void byteSwapUTF16(byte[] arr, int start) {
>     // Only swap while a full two-byte unit remains; this avoids an
>     // ArrayIndexOutOfBoundsException when an odd number of bytes
>     // follows start.
>     for (int i = start; i < arr.length - 1; i += 2) {
>         byte first = arr[i];
>         arr[i] = arr[i + 1];
>         arr[i + 1] = first;
>     }
> }
>
> /**
>  * Encodes a string's internal byte array as big-endian UTF-16.
>  *
>  * When {@code coder} is {@code LATIN1}, every input byte is widened to a
>  * two-byte unit whose high byte is zero. Otherwise {@code val} is treated
>  * as UTF-16 in the byte order indicated by {@code StringUTF16.HI_BYTE_SHIFT},
>  * copied, byte swapped if necessary, and every unpaired surrogate is
>  * replaced with U+FFFD.
>  *
>  * @param coder      the coder of {@code val}; compared against {@code LATIN1}
>  * @param val        the string's internal bytes
>  * @param includeBOM whether to prefix non-empty output with the big-endian
>  *                   byte-order mark {@code FE FF}
>  * @return a newly allocated array holding the encoded bytes
>  */
> static byte[] encodeUTF16BE(byte coder, byte[] val, boolean includeBOM) {
>     // An empty string encodes to an empty array, even when a BOM was
>     // requested. NOTE(review): this mirrors the generic CharsetEncoder path
>     // in encode(), which returns early when len == 0 - confirm that no
>     // caller relies on a BOM-only result.
>     if (val.length == 0) {
>         return new byte[0];
>     }
>
>     int resultStartOffset = includeBOM ? 2 : 0;
>     byte[] result;
>
>     if (coder == LATIN1) {
>         // Freshly allocated arrays are zero-filled, so only the low byte of
>         // each unit has to be written.
>         result = new byte[resultStartOffset + (val.length << 1)];
>         for (int i = 0; i < val.length; i++) {
>             result[resultStartOffset + (i << 1) + 1] = val[i];
>         }
>     } else {
>         result = new byte[resultStartOffset + val.length];
>         System.arraycopy(val, 0, result, resultStartOffset, val.length);
>
>         if (StringUTF16.HI_BYTE_SHIFT == 0) {
>             // val is encoded using little-endian UTF-16;
>             // convert the copy to big-endian.
>             byteSwapUTF16(result, resultStartOffset);
>         }
>
>         for (int i = resultStartOffset; i < result.length; i += 2) {
>             // In big-endian order the high byte of a unit comes first.
>             int hi = Byte.toUnsignedInt(result[i]);
>             if (hi < 0xD8 || hi > 0xDF) {
>                 continue; // not a surrogate
>             }
>
>             // High byte of the following unit, or -1 if there is none.
>             int nextHi = (result.length - i >= 4)
>                 ? Byte.toUnsignedInt(result[i + 2])
>                 : -1;
>
>             if (hi <= 0xDB && nextHi >= 0xDC && nextHi <= 0xDF) {
>                 // Well-formed surrogate pair: step onto the low surrogate so
>                 // that the loop increment moves past it.
>                 i += 2;
>             } else {
>                 // Unpaired surrogate: replace it with U+FFFD.
>                 result[i] = (byte)0xFF;
>                 result[i + 1] = (byte)0xFD;
>             }
>         }
>     }
>
>     if (includeBOM) {
>         result[0] = (byte)0xFE;
>         result[1] = (byte)0xFF;
>     }
>
>     return result;
> }
>
> /**
>  * Encodes a string's internal byte array as little-endian UTF-16 without a
>  * byte-order mark.
>  *
>  * When {@code coder} is {@code LATIN1}, every input byte is widened to a
>  * two-byte unit whose high byte is zero. Otherwise {@code val} is treated
>  * as UTF-16 in the byte order indicated by {@code StringUTF16.LO_BYTE_SHIFT},
>  * cloned, byte swapped if necessary, and every unpaired surrogate is
>  * replaced with U+FFFD.
>  *
>  * @param coder the coder of {@code val}; compared against {@code LATIN1}
>  * @param val   the string's internal bytes
>  * @return a newly allocated array holding the encoded bytes
>  */
> static byte[] encodeUTF16LE(byte coder, byte[] val) {
>     byte[] result;
>
>     if (coder == LATIN1) {
>         // Freshly allocated arrays are zero-filled, so only the low byte of
>         // each unit (which comes first in little-endian order) is written.
>         result = new byte[val.length << 1];
>         for (int i = 0; i < val.length; i++) {
>             result[i << 1] = val[i];
>         }
>     } else {
>         result = val.clone();
>
>         if (StringUTF16.LO_BYTE_SHIFT == 0) {
>             // val is encoded using big-endian UTF-16;
>             // convert result to little-endian UTF-16 by byte swapping.
>             byteSwapUTF16(result, 0);
>         }
>
>         for (int i = 0; i < result.length; i += 2) {
>             // In little-endian order the high byte of a unit comes second.
>             int hi = Byte.toUnsignedInt(result[i + 1]);
>             if (hi < 0xD8 || hi > 0xDF) {
>                 continue; // not a surrogate
>             }
>
>             // High byte of the following unit, or -1 if there is none.
>             int nextHi = (result.length - i >= 4)
>                 ? Byte.toUnsignedInt(result[i + 3])
>                 : -1;
>
>             if (hi <= 0xDB && nextHi >= 0xDC && nextHi <= 0xDF) {
>                 // Well-formed surrogate pair: step onto the low surrogate so
>                 // that the loop increment moves past it.
>                 i += 2;
>             } else {
>                 // Unpaired surrogate: replace it with U+FFFD.
>                 result[i] = (byte)0xFD;
>                 result[i + 1] = (byte)0xFF;
>             }
>         }
>     }
>
>     return result;
> }
>
> /**
>  * Decodes UTF-16 bytes, using a leading byte-order mark to choose the byte
>  * order. A recognized mark ({@code FE FF} or {@code FF FE}) is consumed and
>  * not passed on to the decoder; without one, big-endian order is used.
>  *
>  * @param ba  the source bytes
>  * @param off the index of the first byte to decode
>  * @param len the number of bytes to decode
>  * @return whatever {@code decodeUTF16} returns for the remaining bytes
>  */
> static Result bomDetectDecodeUTF16(byte[] ba, int off, int len) {
>     if (len < 2) {
>         // Too short to hold a byte-order mark.
>         return decodeUTF16(ba, off, len, true);
>     }
>
>     int first = Byte.toUnsignedInt(ba[off]);
>     int second = Byte.toUnsignedInt(ba[off + 1]);
>
>     if (first == 0xFE && second == 0xFF) {
>         // Big-endian mark: skip it.
>         return decodeUTF16(ba, off + 2, len - 2, true);
>     }
>     if (first == 0xFF && second == 0xFE) {
>         // Little-endian mark: skip it and switch byte order.
>         return decodeUTF16(ba, off + 2, len - 2, false);
>     }
>
>     // No mark present: decode everything as big-endian.
>     return decodeUTF16(ba, off, len, true);
> }
>
>
> /**
>  * Decodes {@code len} bytes of UTF-16 from {@code ba}, starting at
>  * {@code off}, in the given byte order. No byte-order mark is interpreted
>  * here.
>  *
>  * Malformed input is replaced with U+FFFD: a trailing odd byte, a reversed
>  * byte-order mark (U+FFFE) and every unpaired surrogate.
>  *
>  * @param ba        the source bytes
>  * @param off       the index of the first byte to decode
>  * @param len       the number of bytes to decode
>  * @param bigEndian {@code true} if the input is big-endian, {@code false}
>  *                  if it is little-endian
>  * @return the {@code Result} populated through {@code Result.with}
>  */
> static Result decodeUTF16(byte[] ba, int off, int len, boolean bigEndian) {
>     Result result = new Result();
>
>     if (len == 0) {
>         return result.with();
>     }
>
>     byte[] decodedArr;
>     if (COMPACT_STRINGS && (len & 1) == 0) {
>         // Positions of the high and low byte inside each two-byte unit.
>         int hiIndex = bigEndian ? 0 : 1;
>         int loIndex = bigEndian ? 1 : 0;
>
>         // Check for non-Latin1 characters (a unit with a non-zero high byte).
>         boolean containsNonLatin1 = false;
>         for (int i = 0; i < len; i += 2) {
>             if (ba[off + i + hiIndex] != 0) {
>                 containsNonLatin1 = true;
>                 break;
>             }
>         }
>
>         // If the input only contains Latin1 characters, copy the source
>         // characters to a Latin1-encoded byte array, and return the
>         // decoded text.
>         if (!containsNonLatin1) {
>             decodedArr = new byte[len >> 1];
>
>             for (int i = 0; i < decodedArr.length; i++) {
>                 decodedArr[i] = ba[off + (i << 1) + loIndex];
>             }
>
>             return result.with(decodedArr, LATIN1);
>         }
>     }
>
>     // Round the size up to an even number of bytes so that a trailing odd
>     // byte still occupies a whole (zero-padded) two-byte unit.
>     decodedArr = new byte[len + (len & 1)];
>     System.arraycopy(ba, off, decodedArr, 0, len);
>
>     if (StringUTF16.HI_BYTE_SHIFT != (bigEndian ? 8 : 0)) {
>         // Input byte order does not match system byte order.
>         // Byte swap decodedArr so that decodedArr is in system byte order.
>         byteSwapUTF16(decodedArr, 0);
>     }
>
>     // decodedArr is now in system byte order
>
>     if ((len & 1) != 0) {
>         // If len is odd, then there is a malformed character at the end.
>         // Replace the last character in decodedArr with U+FFFD if this is
>         // the case.
>         StringUTF16.putChar(decodedArr, (decodedArr.length >> 1) - 1, 0xFFFD);
>
>         // Decrement len by 1 to make len even.
>         len--;
>     }
>
>     // len is now even
>
>     // charLen is the number of complete UTF-16 characters that came from
>     // the input (the trailing U+FFFD written above is not counted).
>     int charLen = len >> 1;
>
>     // Replace the reversed BOM and unpaired surrogates with U+FFFD.
>     for (int i = 0; i < charLen; i++) {
>         char ch = StringUTF16.getChar(decodedArr, i);
>
>         if (charLen - i >= 2 &&
>             Character.isSurrogatePair(ch, StringUTF16.getChar(decodedArr, i + 1))) {
>             // Surrogate pair detected: step onto the low surrogate so that
>             // the loop increment moves past it.
>             i++;
>             continue;
>         }
>
>         if (ch == (char)0xFFFE || Character.isSurrogate(ch)) {
>             // Reversed BOM or unpaired surrogate encountered:
>             // replace ch with U+FFFD.
>             StringUTF16.putChar(decodedArr, i, (char)0xFFFD);
>         }
>     }
>
>     // If compact strings are enabled, return a Latin1-encoded result if
>     // the result does not contain any non-Latin-1 characters.
>     // NOTE(review): the length is passed as a char count, assuming
>     // StringUTF16.compress(byte[], int, int) takes its offset and length in
>     // chars like getChar/putChar above - confirm.
>     // NOTE(review): on every path that reaches this point decodedArr appears
>     // to hold at least one non-Latin-1 char, so this may never succeed -
>     // confirm before relying on it.
>     if (COMPACT_STRINGS) {
>         byte[] compressedArr = StringUTF16.compress(decodedArr, 0,
>                                                     decodedArr.length >> 1);
>         if (compressedArr != null) {
>             return result.with(compressedArr, LATIN1);
>         }
>     }
>
>     return result.with(decodedArr, UTF16);
> }
>
> /**
>  * Decoder selected for the UTF-16 charset; it delegates to
>  * {@code bomDetectDecodeUTF16}, which picks the byte order from a leading
>  * byte-order mark.
>  */
> private static class StringDecoderUTF_16 extends StringDecoder {
>
>     StringDecoderUTF_16(Charset cs, String rcn) {
>         super(cs, rcn);
>     }
>
>     /** Decodes {@code len} bytes of {@code ba} starting at {@code off}. */
>     Result decode(byte[] ba, int off, int len) {
>         return bomDetectDecodeUTF16(ba, off, len);
>     }
> }
>
> /**
>  * Decoder selected for the UTF-16LE charset; it delegates to
>  * {@code decodeUTF16} with little-endian byte order.
>  */
> private static class StringDecoderUTF_16LE extends StringDecoder {
>
>     // A constructor must carry the name of its own class; the previous
>     // name, StringDecoderUTF_16, did not compile here.
>     StringDecoderUTF_16LE(Charset cs, String rcn) {
>         super(cs, rcn);
>     }
>
>     /** Decodes {@code len} bytes of {@code ba} starting at {@code off}. */
>     Result decode(byte[] ba, int off, int len) {
>         return decodeUTF16(ba, off, len, false);
>     }
> }
>
> /**
>  * Decoder selected for the UTF-16BE charset; it delegates to
>  * {@code decodeUTF16} with big-endian byte order.
>  */
> private static class StringDecoderUTF_16BE extends StringDecoder {
>
>     // A constructor must carry the name of its own class; the previous
>     // name, StringDecoderUTF_16, did not compile here.
>     StringDecoderUTF_16BE(Charset cs, String rcn) {
>         super(cs, rcn);
>     }
>
>     /** Decodes {@code len} bytes of {@code ba} starting at {@code off}. */
>     Result decode(byte[] ba, int off, int len) {
>         return decodeUTF16(ba, off, len, true);
>     }
> }
>
> /**
>  * Decodes {@code len} bytes of {@code ba}, starting at {@code off}, using
>  * the charset named {@code charsetName} ("ISO-8859-1" when the name is
>  * {@code null}).
>  *
>  * The decoder obtained from {@code deref(decoder)} is reused when its
>  * requested or actual charset name equals the requested name; otherwise a
>  * new decoder is created for the charset and stored with
>  * {@code set(decoder, ...)}.
>  *
>  * @param charsetName the charset name, or {@code null}
>  * @param ba          the source bytes
>  * @param off         the index of the first byte to decode
>  * @param len         the number of bytes to decode
>  * @return the result produced by the selected decoder
>  * @throws UnsupportedEncodingException if no decoder could be created for
>  *         the requested name
>  */
> static Result decode(String charsetName, byte[] ba, int off, int len)
>     throws UnsupportedEncodingException
> {
>     StringDecoder cached = deref(decoder);
>     String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
>
>     boolean reusable = (cached != null)
>         && (csn.equals(cached.requestedCharsetName())
>             || csn.equals(cached.charsetName()));
>     if (reusable) {
>         return cached.decode(ba, off, len);
>     }
>
>     StringDecoder created = null;
>     try {
>         Charset cs = lookupCharset(csn);
>         if (cs == UTF_8) {
>             created = new StringDecoderUTF8(cs, csn);
>         } else if (cs == ISO_8859_1) {
>             created = new StringDecoder8859_1(cs, csn);
>         } else if (cs == StandardCharsets.UTF_16) {
>             created = new StringDecoderUTF_16(cs, csn);
>         } else if (cs == StandardCharsets.UTF_16LE) {
>             created = new StringDecoderUTF_16LE(cs, csn);
>         } else if (cs == StandardCharsets.UTF_16BE) {
>             created = new StringDecoderUTF_16BE(cs, csn);
>         } else if (cs != null) {
>             created = new StringDecoder(cs, csn);
>         }
>     } catch (IllegalCharsetNameException x) {}
>     // An illegal name leaves created as null and is reported just below.
>
>     if (created == null) {
>         throw new UnsupportedEncodingException(csn);
>     }
>     set(decoder, created);
>     return created.decode(ba, off, len);
> }
> }
>
> /**
>  * Encodes a string's internal byte array into the given charset.
>  *
>  * UTF-8, ISO-8859-1, US-ASCII and the three UTF-16 charsets are dispatched
>  * to dedicated encoders. Every other charset goes through a new
>  * {@code CharsetEncoder}, configured to replace malformed input and
>  * unmappable characters.
>  *
>  * @param cs    the target charset
>  * @param coder the coder of {@code val} ({@code LATIN1} or UTF16)
>  * @param val   the string's internal bytes
>  * @return the encoded bytes
>  */
> static byte[] encode(Charset cs, byte coder, byte[] val) {
>     // Dedicated encoders for the common charsets.
>     if (cs == UTF_8) {
>         return encodeUTF8(coder, val);
>     }
>     if (cs == ISO_8859_1) {
>         return encode8859_1(coder, val);
>     }
>     if (cs == US_ASCII) {
>         return encodeASCII(coder, val);
>     }
>     if (cs == StandardCharsets.UTF_16 || cs == StandardCharsets.UTF_16BE) {
>         // Only the plain UTF-16 charset asks for a byte-order mark.
>         boolean withBOM = (cs == StandardCharsets.UTF_16);
>         return encodeUTF16BE(coder, val, withBOM);
>     }
>     if (cs == StandardCharsets.UTF_16LE) {
>         return encodeUTF16LE(coder, val);
>     }
>
>     CharsetEncoder ce = cs.newEncoder();
>     boolean latin1 = (coder == LATIN1);
>     ArrayEncoder ae = (ce instanceof ArrayEncoder) ? (ArrayEncoder)ce : null;
>
>     // fastpath for ascii compatible
>     if (latin1 && ae != null && ae.isASCIICompatible() &&
>         !hasNegatives(val, 0, val.length)) {
>         return Arrays.copyOf(val, val.length);
>     }
>
>     int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
>     byte[] ba = new byte[scale(len, ce.maxBytesPerChar())];
>     if (len == 0) {
>         return ba;
>     }
>
>     boolean isTrusted = System.getSecurityManager() == null ||
>                         cs.getClass().getClassLoader0() == null;
>     ce.onMalformedInput(CodingErrorAction.REPLACE)
>       .onUnmappableCharacter(CodingErrorAction.REPLACE)
>       .reset();
>
>     if (ae != null) {
>         if (!isTrusted) {
>             // Hand a copy, not the original array, to an untrusted encoder.
>             val = Arrays.copyOf(val, val.length);
>         }
>         int blen;
>         if (latin1) {
>             blen = ae.encodeFromLatin1(val, 0, len, ba);
>         } else {
>             blen = ae.encodeFromUTF16(val, 0, len, ba);
>         }
>         if (blen != -1) {
>             return safeTrim(ba, blen, isTrusted);
>         }
>     }
>
>     // Otherwise go through the buffer-based CharsetEncoder API.
>     char[] ca;
>     if (latin1) {
>         ca = StringLatin1.toChars(val);
>     } else {
>         ca = StringUTF16.toChars(val);
>     }
>     ByteBuffer bb = ByteBuffer.wrap(ba);
>     CharBuffer cb = CharBuffer.wrap(ca, 0, len);
>     try {
>         CoderResult cr = ce.encode(cb, bb, true);
>         if (!cr.isUnderflow()) {
>             cr.throwException();
>         }
>         cr = ce.flush(bb);
>         if (!cr.isUnderflow()) {
>             cr.throwException();
>         }
>     } catch (CharacterCodingException x) {
>         throw new Error(x);
>     }
>     return safeTrim(ba, bb.position(), isTrusted);
> }
More information about the core-libs-dev
mailing list