Enhancements to the java.lang.StringCoding class
John Platts
john_platts at hotmail.com
Sun Jul 2 19:22:49 UTC 2017
I was looking at the OpenJDK 9 code, and I noticed that fast paths for encoding to and decoding from UTF-16 could be added to the java.lang.StringCoding class.
Here is how the optimized UTF-16 encoding and decoding could be implemented in java.lang.StringCoding:
/**
 * Swaps, in place, the two bytes of every 16-bit unit in {@code arr},
 * starting at index {@code start}.
 *
 * <p>The units are the index pairs {@code (start, start + 1)},
 * {@code (start + 2, start + 3)}, and so on. If an odd number of bytes
 * follows {@code start}, the final byte has no partner and is left as is
 * instead of triggering an {@code ArrayIndexOutOfBoundsException} after the
 * array has already been partially modified.
 *
 * @param arr   the array whose byte pairs are swapped; modified in place
 * @param start index of the first byte of the first pair to swap
 */
private static void byteSwapUTF16(byte[] arr, int start) {
    // Only visit positions that still have a partner byte at i + 1.
    for (int i = start; i + 1 < arr.length; i += 2) {
        byte first = arr[i];
        arr[i] = arr[i + 1];
        arr[i + 1] = first;
    }
}
/**
 * Encodes string storage {@code val} as big-endian UTF-16, optionally
 * preceded by the byte order mark FE FF.
 *
 * <p>For {@code LATIN1} storage each input byte becomes the low-order
 * (second) byte of a two-byte unit whose high-order byte is zero. For any
 * other coder the bytes are copied, byte-swapped when
 * {@code StringUTF16.HI_BYTE_SHIFT == 0}, and every surrogate that is not
 * part of a high/low pair is overwritten with U+FFFD (bytes FF FD).
 *
 * @param coder      the coder of {@code val}; compared against {@code LATIN1}
 * @param val        the string's byte storage
 * @param includeBOM whether to emit the two BOM bytes first
 * @return a newly allocated array holding the encoded bytes
 */
static byte[] encodeUTF16BE(byte coder, byte[] val, boolean includeBOM) {
    final int bomLen = includeBOM ? 2 : 0;
    final byte[] out;

    if (coder == LATIN1) {
        out = new byte[(val.length << 1) + bomLen];
        // Fill only the low-order byte of each unit; the high-order byte stays 0.
        int dst = bomLen + 1;
        for (byte b : val) {
            out[dst] = b;
            dst += 2;
        }
    } else {
        out = new byte[val.length + bomLen];
        System.arraycopy(val, 0, out, bomLen, val.length);
        if (StringUTF16.HI_BYTE_SHIFT == 0) {
            // Storage has the low-order byte first; flip each unit to big-endian.
            byteSwapUTF16(out, bomLen);
        }

        int pos = bomLen;
        while (pos < out.length) {
            int hi = out[pos] & 0xFF;
            if (hi < 0xD8 || hi > 0xDF) {
                // Not a surrogate code unit.
                pos += 2;
                continue;
            }
            // High-order byte of the following unit, or -1 if there is none.
            int nextHi = (out.length - pos >= 4) ? (out[pos + 2] & 0xFF) : -1;
            if (hi <= 0xDB && nextHi >= 0xDC && nextHi <= 0xDF) {
                // Well-formed pair: step over both units.
                pos += 4;
                continue;
            }
            // Unpaired surrogate: substitute U+FFFD in big-endian order.
            out[pos] = (byte) 0xFF;
            out[pos + 1] = (byte) 0xFD;
            pos += 2;
        }
    }

    if (includeBOM) {
        out[0] = (byte) 0xFE;
        out[1] = (byte) 0xFF;
    }
    return out;
}
/**
 * Encodes string storage {@code val} as little-endian UTF-16 without a byte
 * order mark.
 *
 * <p>For {@code LATIN1} storage each input byte becomes the low-order
 * (first) byte of a two-byte unit whose high-order byte is zero. For any
 * other coder the bytes are cloned, byte-swapped when
 * {@code StringUTF16.LO_BYTE_SHIFT == 0}, and every surrogate that is not
 * part of a high/low pair is overwritten with U+FFFD (bytes FD FF).
 *
 * @param coder the coder of {@code val}; compared against {@code LATIN1}
 * @param val   the string's byte storage
 * @return a newly allocated array holding the encoded bytes
 */
static byte[] encodeUTF16LE(byte coder, byte[] val) {
    if (coder == LATIN1) {
        byte[] widened = new byte[val.length << 1];
        int dst = 0;
        for (byte b : val) {
            // Low-order byte comes first; the high-order byte stays 0.
            widened[dst] = b;
            dst += 2;
        }
        return widened;
    }

    byte[] out = val.clone();
    if (StringUTF16.LO_BYTE_SHIFT == 0) {
        // Storage has the high-order byte first; flip each unit to little-endian.
        byteSwapUTF16(out, 0);
    }

    int pos = 0;
    while (pos < out.length) {
        int hi = out[pos + 1] & 0xFF;
        if (hi < 0xD8 || hi > 0xDF) {
            // Not a surrogate code unit.
            pos += 2;
            continue;
        }
        // High-order byte of the following unit, or -1 if there is none.
        int nextHi = (out.length - pos >= 4) ? (out[pos + 3] & 0xFF) : -1;
        if (hi <= 0xDB && nextHi >= 0xDC && nextHi <= 0xDF) {
            // Well-formed pair: step over both units.
            pos += 4;
            continue;
        }
        // Unpaired surrogate: substitute U+FFFD in little-endian order.
        out[pos] = (byte) 0xFD;
        out[pos + 1] = (byte) 0xFF;
        pos += 2;
    }
    return out;
}
/**
 * Decodes UTF-16 bytes whose byte order is announced by an optional leading
 * byte order mark.
 *
 * <p>FE FF selects big-endian and FF FE selects little-endian; in both cases
 * the two mark bytes are skipped. Without a mark (or with fewer than two
 * bytes of input) the data is decoded as big-endian from {@code off}.
 *
 * @param ba  the source bytes
 * @param off index of the first byte to decode
 * @param len number of bytes to decode
 * @return whatever {@code decodeUTF16} returns for the selected byte order
 */
static Result bomDetectDecodeUTF16(byte[] ba, int off, int len) {
    if (len < 2) {
        // Too short to carry a mark.
        return decodeUTF16(ba, off, len, true);
    }
    int mark = ((ba[off] & 0xFF) << 8) | (ba[off + 1] & 0xFF);
    switch (mark) {
        case 0xFEFF:
            // Big-endian BOM
            return decodeUTF16(ba, off + 2, len - 2, true);
        case 0xFFFE:
            // Little-endian BOM
            return decodeUTF16(ba, off + 2, len - 2, false);
        default:
            return decodeUTF16(ba, off, len, true);
    }
}
static Result decodeUTF16(byte[] ba, int off, int len, boolean bigEndian) {
Result result = new Result();
if(len == 0) {
return result.with();
}
byte[] decodedArr;
if(COMPACT_STRINGS && (len & 1) == 0) {
// Check for non-Latin1 characters
boolean containsNonLatin1 = false;
for(int i = 0; i < len; i += 2) {
if(ba[off + i + (bigEndian ? 0 : 1)] != 0) {
containsNonLatin1 = true;
break;
}
}
// If the input only contains Latin1 characters, copy the source characters
// to a Latin1-encoded byte array, and return the decoded text.
if(!containsNonLatin1) {
decodedArr = new byte[len >> 1];
for(int i = 0; i < decodedArr.length; i++) {
decodedArr[i] = ba[off + (i << 1) + (bigEndian ? 1 : 0)];
}
return result.with(decodedArr, LATIN1);
}
}
decodedArr = new byte[len + (len & 1)];
System.arraycopy(ba, off, decodedArr, 0, len);
if(StringUTF16.HI_BYTE_SHIFT != (bigEndian ? 8 : 0)) {
// Input byte order does not match system byte order
// Byte swap decodedArr so that decodedArr is in system byte order
byteSwapUTF16(decodedArr, 0);
}
// decodedArr is now in system byte order
if((len & 1) != 0) {
// If len is odd, then there is a malformed character at the end.
// Replace the last character in decodedArr with U+FFFD if this is the case.
StringUTF16.putChar(decodedArr, (decodedArr.length >> 1) - 1, 0xFFFD);
// Decrement len by 1 to make len even.
len--;
}
// len is now even
// charLen is equal to the number of UTF-16 characters in decodedArr
int charLen = len >> 1;
// replace the reversed BOM and unpaired surrogates with U+FFFD
for(int i = 0; i < charLen; i++) {
char ch = StringUTF16.getChar(decodedArr, i);
if(charLen - i >= 2 &&
Character.isSurrogatePair(ch, StringUTF16.getChar(decodedArr, i + 1)) {
// Surrogate pair detected
// Increment i to the position of the low surrogate
i++;
// Continue the loop
continue;
}
if(ch == (char)0xFFFE || Character.isSurrogate(ch)) {
// Reversed BOM or unpaired surrogate encountered
// Replace ch with 0xFFFD
StringUTF16.putChar(decodedArr, i, (char)0xFFFD);
}
}
// If compact strings are enabled, return a Latin1-encoded result if the result
// does not contain any non-Latin-1 characters.
if(COMPACT_STRINGS) {
byte[] compressedArr = StringUTF16.compress(decodedArr, 0, decodedArr.len);
if(compressedArr != null) {
return result.with(compressedArr, LATIN1);
}
}
return result.with(decodedArr, UTF16);
}
/**
 * Decoder selected for {@code StandardCharsets.UTF_16}: the byte order is
 * taken from a leading byte order mark when one is present.
 */
private static class StringDecoderUTF_16 extends StringDecoder {

    StringDecoderUTF_16(Charset charset, String requestedName) {
        super(charset, requestedName);
    }

    /** Delegates to {@code bomDetectDecodeUTF16}. */
    Result decode(byte[] bytes, int offset, int length) {
        return bomDetectDecodeUTF16(bytes, offset, length);
    }
}
private static class StringDecoderUTF_16LE extends StringDecoder {
StringDecoderUTF_16(Charset cs, String rcn) {
super(cs, rcn);
}
Result decode(byte[] ba, int off, int len) {
return decodeUTF16(ba, off, len, false);
}
}
private static class StringDecoderUTF_16BE extends StringDecoder {
StringDecoderUTF_16(Charset cs, String rcn) {
super(cs, rcn);
}
Result decode(byte[] ba, int off, int len) {
return decodeUTF16(ba, off, len, true);
}
}
static Result decode(String charsetName, byte[] ba, int off, int len)
throws UnsupportedEncodingException
{
StringDecoder sd = deref(decoder);
String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
|| csn.equals(sd.charsetName()))) {
sd = null;
try {
Charset cs = lookupCharset(csn);
if (cs != null) {
if (cs == UTF_8) {
sd = new StringDecoderUTF8(cs, csn);
} else if (cs == ISO_8859_1) {
sd = new StringDecoder8859_1(cs, csn);
} else if(cs == StandardCharsets.UTF_16) {
sd = new StringDecoderUTF_16(cs, csn);
} else if(cs == StandardCharsets.UTF_16LE) {
sd = new StringDecoderUTF_16LE(cs, csn);
} else if(cs == StandardCharsets.UTF_16BE) {
sd = new StringDecoderUTF_16BE(cs, csn);
} else {
sd = new StringDecoder(cs, csn);
}
}
} catch (IllegalCharsetNameException x) {}
if (sd == null)
throw new UnsupportedEncodingException(csn);
set(decoder, sd);
}
return sd.decode(ba, off, len);
}
}
/**
 * Encodes the string storage {@code val} (interpreted according to
 * {@code coder}) into bytes of charset {@code cs}.
 *
 * UTF-8, ISO-8859-1, US-ASCII and the three UTF-16 charsets are handed to
 * dedicated helpers. Every other charset goes through a CharsetEncoder that
 * is configured to REPLACE malformed input and unmappable characters.
 *
 * @param cs    the target charset
 * @param coder the coder of {@code val}; LATIN1 is 0 and UTF16 is 1 per the
 *              note on the length computation below
 * @param val   the string's byte storage
 * @return the encoded bytes
 */
static byte[] encode(Charset cs, byte coder, byte[] val) {
// Dedicated fast paths for the charsets handled inside this class.
if (cs == UTF_8) {
return encodeUTF8(coder, val);
} else if (cs == ISO_8859_1) {
return encode8859_1(coder, val);
} else if (cs == US_ASCII) {
return encodeASCII(coder, val);
} else if (cs == StandardCharsets.UTF_16 || cs == StandardCharsets.UTF_16BE){
// Only the plain "UTF-16" charset asks for a byte order mark (third argument).
return encodeUTF16BE(coder, val, cs == StandardCharsets.UTF_16);
} else if (cs == StandardCharsets.UTF_16LE) {
return encodeUTF16LE(coder, val);
}
CharsetEncoder ce = cs.newEncoder();
// fastpath for ascii compatible
// Latin1 storage with no negative bytes is returned as a plain copy.
if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
((ArrayEncoder)ce).isASCIICompatible() &&
!hasNegatives(val, 0, val.length)))) {
return Arrays.copyOf(val, val.length);
}
int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
// Output buffer size derived from the char count and the encoder's maxBytesPerChar.
int en = scale(len, ce.maxBytesPerChar());
byte[] ba = new byte[en];
if (len == 0) {
return ba;
}
// Trusted when no security manager is installed or the charset class has a
// null class loader (presumably the bootstrap loader - TODO confirm).
boolean isTrusted = System.getSecurityManager() == null ||
cs.getClass().getClassLoader0() == null;
ce.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
if (ce instanceof ArrayEncoder) {
if (!isTrusted) {
// NOTE(review): presumably a defensive copy so an untrusted encoder never
// sees the string's own backing array - confirm.
val = Arrays.copyOf(val, val.length);
}
int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
: ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
// NOTE(review): -1 appears to mean the array encoder did not handle the
// input; in that case fall through to the generic buffer-based path.
if (blen != -1) {
return safeTrim(ba, blen, isTrusted);
}
}
// Generic path: convert the storage to chars and run the encoder over buffers.
char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
: StringUTF16.toChars(val);
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca, 0, len);
try {
CoderResult cr = ce.encode(cb, bb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = ce.flush(bb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
// Any coding failure is rethrown as an Error; presumably it is not expected
// once both error actions are set to REPLACE - confirm.
throw new Error(x);
}
return safeTrim(ba, bb.position(), isTrusted);
}
More information about the core-libs-dev
mailing list