Enhancements to the java.lang.StringCoding class

Mon Jul 3 22:54:25 UTC 2017

As Martin pointed out, the UTF-16 variants are probably not critical
enough to warrant a "special" fastpath implementation in StringCoding.
If better performance for these UTF-16 charsets is really desired, it
might be worth implementing the sun.nio.cs.ArrayDecoder/ArrayEncoder
interfaces in UnicodeDecoder/UnicodeEncoder to speed up the
String/char[]/byte[] de/encoding. Most of the frequently used charsets
in the JDK repository now have this fastpath enabled.

Martin, what is the real use case that forces you to rewrite the charset
implementation outside the JDK? Is it to add back convenient de/encoding
methods that work with byte[]/char[] directly (for example, by adding
public interfaces/methods similar to sun.nio.cs.ArrayDecoder/ArrayEncoder)?

Convenience methods that do de/encoding with ByteBuffer/CharBuffer
gathering/scattering might be something we want to do for JDK 10 ...

-Sherman

On 7/2/17, 4:05 PM, Martin Buchholz wrote:
> Very high level:
>
> UTF-16 is not expected to be a popular encoding for text outside the JDK.
> Everyone is supposed to be migrating to UTF-8 from ISO-8859-1 and other
> legacy encodings.
>
> The fact that people (like you and me) are writing specialized
> encoders/decoders outside of the "real" charset implementations for better
> performance suggests that the nio charset API could be rethought.
>
> On Sun, Jul 2, 2017 at 12:22 PM, John Platts<john_platts at hotmail.com>
> wrote:
>
>> I was looking at the OpenJDK 9 code, and I noticed that optimizations for
>> encoding text to UTF-16 and decoding text from UTF-16 could be added to
>> the java.lang.StringCoding class.
>>
>> Here is how the optimized UTF-16 encoding and decoding could be
>> implemented in java.lang.StringCoding:
>> private static void byteSwapUTF16(byte[] arr, int start) {
>>      for(int i = start; i<  arr.length; i += 2) {
>>          byte b1 = arr[i];
>>          byte b2 = arr[i + 1];
>>
>>
>>          arr[i] = b2;
>>          arr[i + 1] = b1;
>>      }
>> }
>>
>> static byte[] encodeUTF16BE(byte coder, byte[] val, boolean includeBOM) {
>>      byte[] result;
>>
>>      if(coder == LATIN1) {
>>          result = new byte[(val.length + (includeBOM ? 1 : 0))<<  1];
>>          int resultStartOffset = includeBOM ? 2 : 0;
>>
>>          if(includeBOM) {
>>              result[0] = (byte)0xFE;
>>              result[1] = (byte)0xFF;
>>          }
>>
>>          for(int i = 0; i<  val.length; i++) {
>>              result[resultStartOffset + (i<<  1) + 1] = val[i];
>>          }
>>      } else {
>>          result = new byte[val.length + (includeBOM ? 2 : 0)];
>>          int resultStartOffset = includeBOM ? 2 : 0;
>>
>>          if(includeBOM) {
>>              result[0] = (byte)0xFE;
>>              result[1] = (byte)0xFF;
>>          }
>>
>>          System.arraycopy(val, 0, result, resultStartOffset, val.length);
>>
>>          if(StringUTF16.HI_BYTE_SHIFT == 0) {
>>              // val is encoded using little-endian UTF-16
>>              // Convert to big-endian UTF-16 from little-endian UTF-16
>>              byteSwapUTF16(result, resultStartOffset);
>>          }
>>
>>          for(int i = resultStartOffset; i<  result.length; i += 2) {
>>              int b1 = Byte.toUnsignedInt(result[i]);
>>              int b3 = result.length - i>= 4 ? Byte.toUnsignedInt(result[i
>> + 2]) : -1;
>>              if(b1>= 0xD8&&  b1<= 0xDF) {
>>                  if(b1<= 0xDB&&  b3>= 0xDC&&  b3<= 0xDF) {
>>                      // UTF-16 surrogate pair encountered
>>
>>                      // Advance i to the position of the low surrogate
>>                      i += 2;
>>
>>                      // Continue the loop past the low surrogate
>>                      continue;
>>                  }
>>
>>                  // Unpaired surrogate character encountered
>>                  // Replace unpaired surrogate character with U+FFFD
>>                  result[i] = (byte)0xFF;
>>                  result[i + 1] = (byte)0xFD;
>>              }
>>          }
>>      }
>>
>>      return result;
>> }
>>
>> static byte[] encodeUTF16LE(byte coder, byte[] val) {
>>      byte[] result;
>>
>>      if(coder == LATIN1) {
>>          result = new byte[val.length<<  1];
>>
>>          for(int i = 0; i<  val.length; i++) {
>>              result[i<<  1] = val[i];
>>          }
>>      } else {
>>          result = val.clone();
>>
>>          if(StringUTF16.LO_BYTE_SHIFT == 0) {
>>              // val is encoded using big-endian UTF-16
>>
>>              // Convert result to little-endian UTF-16 from big-endian
>> UTF-16 by byte swapping
>>              byteSwapUTF16(result, 0);
>>          }
>>
>>          for(int i = 0; i<  result.length; i += 2) {
>>              int b2 = Byte.toUnsignedInt(result[i + 1]);
>>              int b4 = result.length - i>= 4 ? Byte.toUnsignedInt(result[i
>> + 3]) : -1;
>>              if(b2>= 0xD8&&  b2<= 0xDF) {
>>                  if(b2<= 0xDB&&  b4>= 0xDC&&  b4<= 0xDF) {
>>                      // UTF-16 surrogate pair encountered
>>
>>                      // Advance i to the position of the low surrogate
>>                      i += 2;
>>
>>                      // Continue the loop past the low surrogate
>>                      continue;
>>                  }
>>
>>                  // Unpaired surrogate character encountered
>>                  // Replace unpaired surrogate character with U+FFFD
>>                  result[i] = (byte)0xFD;
>>                  result[i + 1] = (byte)0xFF;
>>              }
>>          }
>>      }
>>
>>      return result;
>> }
>>
>> static Result bomDetectDecodeUTF16(byte[] ba, int off, int len) {
>>      boolean bigEndian = true;
>>
>>      if(len>= 2) {
>>          int b1 = Byte.toUnsignedInt(ba[off]);
>>          int b2 = Byte.toUnsignedInt(ba[off + 1]);
>>          if(b1 == 0xFE&&  b2 == 0xFF) {
>>              // Big-endian BOM detected
>>              off += 2;
>>              len -= 2;
>>          } else if(b1 == 0xFF&&  b2 == 0xFE) {
>>              // Little-endian BOM detected
>>              off += 2;
>>              len -= 2;
>>              bigEndian = false;
>>          }
>>      }
>>
>>      return decodeUTF16(ba, off, len, bigEndian);
>> }
>>
>>
>> static Result decodeUTF16(byte[] ba, int off, int len, boolean bigEndian) {
>>      Result result = new Result();
>>
>>      if(len == 0) {
>>          return result.with();
>>      }
>>
>>      byte[] decodedArr;
>>      if(COMPACT_STRINGS&&  (len&  1) == 0) {
>>          // Check for non-Latin1 characters
>>          boolean containsNonLatin1 = false;
>>          for(int i = 0; i<  len; i += 2) {
>>              if(ba[off + i + (bigEndian ? 0 : 1)] != 0) {
>>                  containsNonLatin1 = true;
>>                  break;
>>              }
>>          }
>>
>>          // If the input only contains Latin1 characters, copy the source
>> characters
>>          // to a Latin1-encoded byte array, and return the decoded text.
>>          if(!containsNonLatin1) {
>>              decodedArr = new byte[len>>  1];
>>
>>              for(int i = 0; i<  decodedArr.length; i++) {
>>                  decodedArr[i] = ba[off + (i<<  1) + (bigEndian ? 1 : 0)];
>>              }
>>
>>              return result.with(decodedArr, LATIN1);
>>          }
>>      }
>>
>>      decodedArr = new byte[len + (len&  1)];
>>      System.arraycopy(ba, off, decodedArr, 0, len);
>>
>>      if(StringUTF16.HI_BYTE_SHIFT != (bigEndian ? 8 : 0)) {
>>          // Input byte order does not match system byte order
>>
>>          // Byte swap decodedArr so that decodedArr is in system byte order
>>          byteSwapUTF16(decodedArr, 0);
>>      }
>>
>>      // decodedArr is now in system byte order
>>
>>      if((len&  1) != 0) {
>>          // If len is odd, then there is a malformed character at the end.
>>
>>          // Replace the last character in decodedArr with U+FFFD if this is
>> the case.
>>          StringUTF16.putChar(decodedArr, (decodedArr.length>>  1) - 1,
>> 0xFFFD);
>>
>>          // Decrement len by 1 to make len even.
>>          len--;
>>      }
>>
>>      // len is now even
>>
>>      // charLen is equal to the number of UTF-16 characters in decodedArr
>>      int charLen = len>>  1;
>>
>>      // replace the reversed BOM and unpaired surrogates with U+FFFD
>>      for(int i = 0; i<  charLen; i++) {
>>          char ch = StringUTF16.getChar(decodedArr, i);
>>
>>          if(charLen - i>= 2&&
>>              Character.isSurrogatePair(ch, StringUTF16.getChar(decodedArr,
>> i + 1)) {
>>              // Surrogate pair detected
>>
>>              // Increment i to the position of the low surrogate
>>              i++;
>>
>>              // Continue the loop
>>              continue;
>>          }
>>
>>          if(ch == (char)0xFFFE || Character.isSurrogate(ch)) {
>>              // Reversed BOM or unpaired surrogate encountered
>>
>>              // Replace ch with 0xFFFD
>>              StringUTF16.putChar(decodedArr, i, (char)0xFFFD);
>>          }
>>      }
>>
>>      // If compact strings are enabled, return a Latin1-encoded result if
>> the result
>>      // does not contain any non-Latin-1 characters.
>>      if(COMPACT_STRINGS) {
>>          byte[] compressedArr = StringUTF16.compress(decodedArr, 0,
>> decodedArr.len);
>>          if(compressedArr != null) {
>>              return result.with(compressedArr, LATIN1);
>>          }
>>      }
>>
>>      return result.with(decodedArr, UTF16);
>> }
>>
>> private static class StringDecoderUTF_16 extends StringDecoder {
>>          StringDecoderUTF_16(Charset cs, String rcn) {
>>              super(cs, rcn);
>>          }
>>          Result decode(byte[] ba, int off, int len) {
>>              return bomDetectDecodeUTF16(ba, off, len);
>>          }
>> }
>>
>> private static class StringDecoderUTF_16LE extends StringDecoder {
>>          StringDecoderUTF_16(Charset cs, String rcn) {
>>              super(cs, rcn);
>>          }
>>          Result decode(byte[] ba, int off, int len) {
>>              return decodeUTF16(ba, off, len, false);
>>          }
>> }
>>
>> private static class StringDecoderUTF_16BE extends StringDecoder {
>>          StringDecoderUTF_16(Charset cs, String rcn) {
>>              super(cs, rcn);
>>          }
>>          Result decode(byte[] ba, int off, int len) {
>>              return decodeUTF16(ba, off, len, true);
>>          }
>> }
>>
>> static Result decode(String charsetName, byte[] ba, int off, int len)
>>          throws UnsupportedEncodingException
>>      {
>>          StringDecoder sd = deref(decoder);
>>          String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
>>          if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
>>                                || csn.equals(sd.charsetName()))) {
>>              sd = null;
>>              try {
>>                  Charset cs = lookupCharset(csn);
>>                  if (cs != null) {
>>                      if (cs == UTF_8) {
>>                          sd = new StringDecoderUTF8(cs, csn);
>>                      } else if (cs == ISO_8859_1) {
>>                          sd = new StringDecoder8859_1(cs, csn);
>>                      } else if(cs == StandardCharsets.UTF_16) {
>>                          sd = new StringDecoderUTF_16(cs, csn);
>>                      } else if(cs == StandardCharsets.UTF_16LE) {
>>                          sd = new StringDecoderUTF_16LE(cs, csn);
>>                      } else if(cs == StandardCharsets.UTF_16BE) {
>>                          sd = new StringDecoderUTF_16BE(cs, csn);
>>                      } else {
>>                          sd = new StringDecoder(cs, csn);
>>                      }
>>                  }
>>              } catch (IllegalCharsetNameException x) {}
>>              if (sd == null)
>>                  throw new UnsupportedEncodingException(csn);
>>              set(decoder, sd);
>>          }
>>          return sd.decode(ba, off, len);
>>      }
>> }
>>
>> static byte[] encode(Charset cs, byte coder, byte[] val) {
>>          if (cs == UTF_8) {
>>              return encodeUTF8(coder, val);
>>          } else if (cs == ISO_8859_1) {
>>              return encode8859_1(coder, val);
>>          } else if (cs == US_ASCII) {
>>              return encodeASCII(coder, val);
>>          } else if (cs == StandardCharsets.UTF_16 || cs ==
>> StandardCharsets.UTF_16BE){
>>              return encodeUTF16BE(coder, val, cs ==
>> StandardCharsets.UTF_16);
>>          } else if (cs == StandardCharsets.UTF_16LE) {
>>              return encodeUTF16LE(coder, val);
>>          }
>>          CharsetEncoder ce = cs.newEncoder();
>>          // fastpath for ascii compatible
>>          if (coder == LATIN1&&  (((ce instanceof ArrayEncoder)&&
>>                                   ((ArrayEncoder)ce).isASCIICompatible()&&
>>                                   !hasNegatives(val, 0, val.length)))) {
>>              return Arrays.copyOf(val, val.length);
>>          }
>>          int len = val.length>>  coder;  // assume LATIN1=0/UTF16=1;
>>          int en = scale(len, ce.maxBytesPerChar());
>>          byte[] ba = new byte[en];
>>          if (len == 0) {
>>              return ba;
>>          }
>>          boolean isTrusted = System.getSecurityManager() == null ||
>>                              cs.getClass().getClassLoader0() == null;
>>          ce.onMalformedInput(CodingErrorAction.REPLACE)
>>            .onUnmappableCharacter(CodingErrorAction.REPLACE)
>>            .reset();
>>          if (ce instanceof ArrayEncoder) {
>>              if (!isTrusted) {
>>                  val = Arrays.copyOf(val, val.length);
>>              }
>>              int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val,
>> 0, len, ba)
>>                                            : ((ArrayEncoder)ce).encodeFromUTF16(val,
>> 0, len, ba);
>>              if (blen != -1) {
>>                  return safeTrim(ba, blen, isTrusted);
>>              }
>>          }
>>          char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
>>                                         : StringUTF16.toChars(val);
>>          ByteBuffer bb = ByteBuffer.wrap(ba);
>>          CharBuffer cb = CharBuffer.wrap(ca, 0, len);
>>          try {
>>              CoderResult cr = ce.encode(cb, bb, true);
>>              if (!cr.isUnderflow())
>>                  cr.throwException();
>>              cr = ce.flush(bb);
>>              if (!cr.isUnderflow())
>>                  cr.throwException();
>>          } catch (CharacterCodingException x) {
>>              throw new Error(x);
>>          }
>>          return safeTrim(ba, bb.position(), isTrusted);
>> }