Request for review: 6896617: Optimize sun.nio.cs.ISO_8859_1$Encode.encodeArrayLoop() on x86

Fri Jan 11 18:19:04 UTC 2013

Hi Sherman,

Am 11.01.2013 06:47, schrieb Xueming Shen:
> (1) sp++ at Ln#159 probably can be moved into ln#155? the local sp here should
>      not change the real "sp" after "break+return".
> (2) maybe the "overflowflag" can just be replaced by "CoderResult cr", with init value
>      of CoderResult.UNDERFLOW;",  then "cr = CoderResult.OVERFLOW" at ln#182, and
>      simply "return cr" at ln#193, without another "if".

I've enhanced your suggestions, see more below...
Additionally, some part of encodeArrayLoop(...) maybe could be moved into a separate method, to be 
reused by encode(char[] src, int sp, int len, byte[] dst).
Some of the re-engineering could be adapted to the Decoder methods.

> I'm surprised we only get 1.6% boost.  Maybe it is worth measuring the performance
> of some "small" buf/array encoding? I'm a little concerned that the overhead may
> slow down the small size buf/array encoding. There is a simple benchmark for String
> en/decoding at test/sun/nio/cs/StrCodingBenchmark.

I think we should balance 4 cases in rating the performance:
a) few loops, small buf/array
b) few loops, big buf/array
c) many loops, small buf/array
d) many loops, big buf/array
In a), b) the loop surrounding code will not be JIT-compiled, so should be optimized for interpreter.
In c) d) the loop surrounding code *may be* JIT-compiled and consequtively inline the inner loop, 
should be verified.
In b), d) the small inner loop, as demonstrated below, will be JIT-compiled, regardless if moved 
into separate method.

-Ulf

1) Check for (sp >= sl) is superfluous.
======================================
private static int copyISOs(
         char[] sa, int sp, byte[] da, int dp, int len) {
     int i = 0;
     for (; i < len; i++) {
         char c = sa[sp++];
         // if (c & '\uFF00' != 0) // needs bug 6935994 to be fixed, would be fastest
         if ((byte)(c >> 8) != 0) // temporary replacement, fast byte-length operation on x86
             break;
         da[dp++] = (byte)c;
     }
     return i;
}

private CoderResult encodeArrayLoop(
         CharBuffer src, ByteBuffer dst) {
     char[] sa = src.array();
     int soff = src.arrayOffset();
     int sp = soff + src.position();
     int sr = src.remaining();
     int sl = sp + sr;
     byte[] da = dst.array();
     int doff = dst.arrayOffset();
     int dp = doff + dst.position();
     int dr = dst.remaining();
     CoderResult cr;
     if (dr < sr) {
         sr = dr;
         cr = CoderResult.OVERFLOW;
     } else
         cr = CoderResult.UNDERFLOW;
     try {
         int ret = copyISOs(sa, sp, da, dp, sr);
         sp = sp + ret;
         dp = dp + ret;
         if (ret != sr) {
             if (sgp.parse(sa[sp], sa, sp, sl) < 0)
                 return sgp.error();
             return sgp.unmappableResult();
         }
         return cr;
     } finally {
         src.position(sp - soff);
         dst.position(dp - doff);
     }
}

2) Avoids sp, dp to be recalculated; shorter surrounding code -> best chance to become JIT-compiled.
======================================
// while inlinig, JIT will erase the surrounding int[] p
private static boolean copyISOs(
         char[] sa, byte[] da, final int[] p, int sl) {
     for (int sp = p[0], dp = p[1]; sp < sl; sp++) {
         char c = sa[sp];
         // if (c & '\uFF00' != 0) // needs bug 6935994 to be fixed, would be fastest
         if ((byte)(c >> 8) != 0) // temporary replacement, fast byte-length operation on x86
             return false;
         da[dp++] = (byte)c;
     }
     return true;
}

private CoderResult encodeArrayLoop(
         CharBuffer src, ByteBuffer dst) {
     char[] sa = src.array();
     int soff = src.arrayOffset();
     int sp = soff + src.position();
     int sr = src.remaining();
     byte[] da = dst.array();
     int doff = dst.arrayOffset();
     int dp = doff + dst.position();
     int dr = dst.remaining();
     CoderResult cr;
     if (dr < sr) {
         sr = dr;
         cr = CoderResult.OVERFLOW;
     } else
         cr = CoderResult.UNDERFLOW;
     try {
         int sl = sp + sr;
         final int[] p = { sp, dp };
         if (!copyISOs(sa, da, p, sl)) {
             if (sgp.parse(sa[sp], sa, sp, sl) < 0)
                 return sgp.error();
             return sgp.unmappableResult();
         }
         return cr;
     } finally {
         src.position(sp - soff);
         dst.position(dp - doff);
     }
}

3) No more needs try...finally block.
======================================
private CoderResult encodeArrayLoop(
         CharBuffer src, ByteBuffer dst) {
     char[] sa = src.array();
     int soff = src.arrayOffset();
     int sp = soff + src.position();
     int sr = src.remaining();
     byte[] da = dst.array();
     int doff = dst.arrayOffset();
     int dp = doff + dst.position();
     int dr = dst.remaining();
     CoderResult cr;
     if (dr < sr) {
         sr = dr;
         cr = CoderResult.OVERFLOW;
     } else
         cr = CoderResult.UNDERFLOW;
     int sl = sp + sr;
     for (; sp < sl; sp++) {
         char c = sa[sp];
         // if (c & '\uFF00' != 0) { // needs bug 6935994 to be fixed, would be fastest
         if ((byte)(c >> 8) != 0) { // temporary replacement, fast byte-length operation on x86
             cr = null;
             break;
         }
         da[dp++] = (byte)c;
     }
     src.position(sp - soff);
     dst.position(dp - doff);
     return result(cr, sa, sp, sl);
}

// if adapted, maybe could also be reused in encodeBufferLoop()
private static CoderResult result(CoderResult cr, byte[] sa, int sp, int sl) {
     return cr != null ? cr :
         sgp.parse(sa[sp], sa, sp, sl) < 0
             ? sgp.error();
             : sgp.unmappableResult();
}