Request for review: 6896617: Optimize sun.nio.cs.ISO_8859_1$Encode.encodeArrayLoop() on x86
Ulf Zibis
Ulf.Zibis at CoSoCo.de
Fri Jan 11 18:19:04 UTC 2013
Hi Sherman,
Am 11.01.2013 06:47, schrieb Xueming Shen:
> (1) sp++ at Ln#159 probably can be moved into ln#155? the local sp here should
> not change the real "sp" after "break+return".
> (2) maybe the "overflowflag" can just be replaced by "CoderResult cr", with init value
> of CoderResult.UNDERFLOW;", then "cr = CoderResult.OVERFLOW" at ln#182, and
> simply "return cr" at ln#193, without another "if".
I've enhanced your suggestions, see more below...
Additionally, some part of encodeArrayLoop(...) maybe could be moved into a separate method, to be
reused by encode(char[] src, int sp, int len, byte[] dst).
Some of the re-engineering could be adapted to the Decoder methods.
> I'm surprised we only get 1.6% boost. Maybe it is worth measuring the performance
> of some "small" buf/array encoding? I'm a little concerned that the overhead may
> slow down the small size buf/array encoding. There is a simple benchmark for String
> en/decoding at test/sun/nio/cs/StrCodingBenchmark.
I think we should balance 4 cases in rating the performance:
a) few loops, small buf/array
b) few loops, big buf/array
c) many loops, small buf/array
d) many loops, big buf/array
In a), b) the loop surrounding code will not be JIT-compiled, so should be optimized for interpreter.
In c) d) the loop surrounding code *may be* JIT-compiled and consequtively inline the inner loop,
should be verified.
In b), d) the small inner loop, as demonstrated below, will be JIT-compiled, regardless if moved
into separate method.
-Ulf
1) Check for (sp >= sl) is superfluous.
======================================
private static int copyISOs(
char[] sa, int sp, byte[] da, int dp, int len) {
int i = 0;
for (; i < len; i++) {
char c = sa[sp++];
// if (c & '\uFF00' != 0) // needs bug 6935994 to be fixed, would be fastest
if ((byte)(c >> 8) != 0) // temporary replacement, fast byte-length operation on x86
break;
da[dp++] = (byte)c;
}
return i;
}
private CoderResult encodeArrayLoop(
CharBuffer src, ByteBuffer dst) {
char[] sa = src.array();
int soff = src.arrayOffset();
int sp = soff + src.position();
int sr = src.remaining();
int sl = sp + sr;
byte[] da = dst.array();
int doff = dst.arrayOffset();
int dp = doff + dst.position();
int dr = dst.remaining();
CoderResult cr;
if (dr < sr) {
sr = dr;
cr = CoderResult.OVERFLOW;
} else
cr = CoderResult.UNDERFLOW;
try {
int ret = copyISOs(sa, sp, da, dp, sr);
sp = sp + ret;
dp = dp + ret;
if (ret != sr) {
if (sgp.parse(sa[sp], sa, sp, sl) < 0)
return sgp.error();
return sgp.unmappableResult();
}
return cr;
} finally {
src.position(sp - soff);
dst.position(dp - doff);
}
}
2) Avoids sp, dp to be recalculated; shorter surrounding code -> best chance to become JIT-compiled.
======================================
// while inlinig, JIT will erase the surrounding int[] p
private static boolean copyISOs(
char[] sa, byte[] da, final int[] p, int sl) {
for (int sp = p[0], dp = p[1]; sp < sl; sp++) {
char c = sa[sp];
// if (c & '\uFF00' != 0) // needs bug 6935994 to be fixed, would be fastest
if ((byte)(c >> 8) != 0) // temporary replacement, fast byte-length operation on x86
return false;
da[dp++] = (byte)c;
}
return true;
}
private CoderResult encodeArrayLoop(
CharBuffer src, ByteBuffer dst) {
char[] sa = src.array();
int soff = src.arrayOffset();
int sp = soff + src.position();
int sr = src.remaining();
byte[] da = dst.array();
int doff = dst.arrayOffset();
int dp = doff + dst.position();
int dr = dst.remaining();
CoderResult cr;
if (dr < sr) {
sr = dr;
cr = CoderResult.OVERFLOW;
} else
cr = CoderResult.UNDERFLOW;
try {
int sl = sp + sr;
final int[] p = { sp, dp };
if (!copyISOs(sa, da, p, sl)) {
if (sgp.parse(sa[sp], sa, sp, sl) < 0)
return sgp.error();
return sgp.unmappableResult();
}
return cr;
} finally {
src.position(sp - soff);
dst.position(dp - doff);
}
}
3) No more needs try...finally block.
======================================
private CoderResult encodeArrayLoop(
CharBuffer src, ByteBuffer dst) {
char[] sa = src.array();
int soff = src.arrayOffset();
int sp = soff + src.position();
int sr = src.remaining();
byte[] da = dst.array();
int doff = dst.arrayOffset();
int dp = doff + dst.position();
int dr = dst.remaining();
CoderResult cr;
if (dr < sr) {
sr = dr;
cr = CoderResult.OVERFLOW;
} else
cr = CoderResult.UNDERFLOW;
int sl = sp + sr;
for (; sp < sl; sp++) {
char c = sa[sp];
// if (c & '\uFF00' != 0) { // needs bug 6935994 to be fixed, would be fastest
if ((byte)(c >> 8) != 0) { // temporary replacement, fast byte-length operation on x86
cr = null;
break;
}
da[dp++] = (byte)c;
}
src.position(sp - soff);
dst.position(dp - doff);
return result(cr, sa, sp, sl);
}
// if adapted, maybe could also be reused in encodeBufferLoop()
private static CoderResult result(CoderResult cr, byte[] sa, int sp, int sl) {
return cr != null ? cr :
sgp.parse(sa[sp], sa, sp, sl) < 0
? sgp.error();
: sgp.unmappableResult();
}
More information about the core-libs-dev
mailing list