[aarch64-port-dev ] RFR: JDK8: Add support for CRC32 intrinsic
Andrew Haley
aph at redhat.com
Sat May 24 10:39:35 UTC 2014
Hi,
On 05/23/2014 11:02 AM, Edward Nevill wrote:
> The following patch adds support for CRC32 intrinsic.
>
> The patch is a non-NEON patch, i.e. it uses only the base AArch64 instruction set. A patch for NEON will follow.
>
> Even without NEON it gets a 4.5x improvement on my test case
>
> http://people.linaro.org/~edward.nevill/crc32/CRCTest.java
>
> As the patch is quite big (38K) I have put a copy of the patch at
>
> http://people.linaro.org/~edward.nevill/crc32/crc32.patch
>
> which may be easier to apply if anyone wishes to try this out.
>
> The algorithm uses 4 tables and handles 16 bytes (one ldp's worth) per iteration. I experimented with doing 32 bytes per loop but could not measure any difference, so I left it at 16. There are also algorithms that use 8 tables (Google's slicing-by-8), but I think the returns over the simpler by-4 algorithm are minimal.
Basically OK, some comments inline.
I wonder about the decision to generate interpreter and C1 versions of
intrinsics like this. I would have thought that the advantage of C1
code is small, but if we had a client-only VM it would make sense.
It's still OK to commit, though.
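For anyone who wants to poke at the by-4 scheme Ed describes above, here
is a rough portable C++ equivalent; a sketch only, with the tables
generated on the fly and names of my own choosing, not the code from the
patch. The stub works on the same principle, hand-unrolled to the 16
bytes fetched by each ldp.

#include <cstddef>
#include <cstdint>

// Four 256-entry tables.  T[0] is the classic byte-at-a-time table;
// T[k][b] is the CRC of byte b followed by k zero bytes.
static uint32_t T[4][256];

static void init_crc_tables() {
  for (uint32_t b = 0; b < 256; b++) {
    uint32_t c = b;
    for (int k = 0; k < 8; k++)
      c = (c & 1) ? (c >> 1) ^ 0xEDB88320u : (c >> 1);
    T[0][b] = c;
  }
  for (uint32_t b = 0; b < 256; b++) {
    T[1][b] = (T[0][b] >> 8) ^ T[0][T[0][b] & 0xffu];
    T[2][b] = (T[1][b] >> 8) ^ T[0][T[1][b] & 0xffu];
    T[3][b] = (T[2][b] >> 8) ^ T[0][T[2][b] & 0xffu];
  }
}

// Each step consumes 4 bytes with 4 independent table lookups.
uint32_t crc32_by4(uint32_t crc, const uint8_t *p, size_t len) {
  crc = ~crc;
  for (; len >= 4; p += 4, len -= 4) {
    crc ^= (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    crc = T[3][crc & 0xffu] ^ T[2][(crc >> 8) & 0xffu] ^
          T[1][(crc >> 16) & 0xffu] ^ T[0][crc >> 24];
  }
  while (len--)   // byte-at-a-time tail
    crc = (crc >> 8) ^ T[0][(crc ^ *p++) & 0xffu];
  return ~crc;
}

The attraction over the plain one-table loop is that the four lookups are
independent, so the loads can overlap; slicing-by-8 doubles the table
footprint for, as Ed says, rather small additional returns.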
Andrew.
> +#endif // CPU_AARCH64_VM_STUBROUTINES_AARCH64_HPP
> diff -r 9d3bc0f40cce -r 60fac40265fc src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp
> --- a/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp Wed May 14 15:43:50 2014 +0100
> +++ b/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp Fri May 23 10:47:15 2014 +0100
> @@ -673,6 +673,126 @@
> return NULL;
> }
>
> +/**
> + * Method entry for static native methods:
> + * int java.util.zip.CRC32.update(int crc, int b)
> + */
> +address InterpreterGenerator::generate_CRC32_update_entry() {
> + if (UseCRC32Intrinsics) {
> + address entry = __ pc();
> +
> + // rmethod: Method*
> + // r13: senderSP must preserved for slow path, set SP to it on fast path
This comment is wrong: SP does not get set to r13 on the fast path. It
does have to be preserved for the slow path, though.
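A correct version would read something like:

  // r13: senderSP, must be preserved for the slow path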
> + // esp: args
> +
> + Label slow_path;
> + // If we need a safepoint check, generate full interpreter entry.
> + ExternalAddress state(SafepointSynchronize::address_of_state());
> + unsigned long offset;
> + __ adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
> + __ ldrw(rscratch1, Address(rscratch1, offset));
> + assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
> + __ cbnz(rscratch1, slow_path);
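(To spell out the reasoning: the fast path below contains no safepoint
poll, so if a safepoint is pending the stub must not be entered at all;
the cbnz diverts to the vanilla native entry instead.)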
> +
> + // We don't generate local frame and don't align stack because
> + // we call stub code and there is no safepoint on this path.
> +
> + // Load parameters
> + const Register crc = c_rarg0; // crc
> + const Register val = c_rarg1; // source java byte value
> + const Register tbl = c_rarg2; // scratch
> +
> + // Arguments are reversed on java expression stack
> + __ ldrw(val, Address(esp, 0)); // byte value
> + __ ldrw(crc, Address(esp, wordSize)); // Initial CRC
> +
> + __ adrp(tbl, ExternalAddress(StubRoutines::crc_table_addr()), offset);
> + __ add(tbl, tbl, offset);
> +
> + __ ornw(crc, zr, crc); // ~crc
> + __ update_byte_crc32(crc, val, tbl);
> + __ ornw(crc, zr, crc); // ~crc
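(Aside: java.util.zip.CRC32 keeps its CRC state non-inverted, while the
table algorithm works on the inverted value, hence the two ornw
instructions. In scalar terms the fast path for a single byte is just

  crc = ~crc;
  crc = (crc >> 8) ^ table[(crc ^ val) & 0xff];
  crc = ~crc;

where the middle line is, roughly, what update_byte_crc32 expands to.)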
> +
> + // result in c_rarg0
> +
> + // _areturn
> + // __ mov(sp, r13); // set sp to sender sp
Please remove these two commented-out lines.
> + __ ret(lr);
> +
> + // generate a vanilla native entry as the slow path
> + __ bind(slow_path);
> +
> + (void) generate_native_entry(false);
> +
> + return entry;
> + }
> + return generate_native_entry(false);
> +}
> +
> +/**
> + * Method entry for static native methods:
> + * int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
> + * int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
> + */
> +address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
> + if (UseCRC32Intrinsics) {
> + address entry = __ pc();
> +
> + // rbx,: Method*
> + // r13: senderSP must preserved for slow path, set SP to it on fast path
These comments are wrong: rbx is an x86 register; on AArch64 the Method*
arrives in rmethod, and, as above, SP does not get set to r13 on the fast
path.
> +
> + Label slow_path;
> + // If we need a safepoint check, generate full interpreter entry.
> + ExternalAddress state(SafepointSynchronize::address_of_state());
> + unsigned long offset;
> + __ adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
> + __ ldrw(rscratch1, Address(rscratch1, offset));
> + assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
> + __ cbnz(rscratch1, slow_path);
> +
> + // We don't generate local frame and don't align stack because
> + // we call stub code and there is no safepoint on this path.
> +
> + // Load parameters
> + const Register crc = c_rarg0; // crc
> + const Register buf = c_rarg1; // source java byte array address
> + const Register len = c_rarg2; // length
> + const Register off = len; // offset (never overlaps with 'len')
> +
> + // Arguments are reversed on java expression stack
> + // Calculate address of start element
> + if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
> + __ ldr(buf, Address(esp, 2*wordSize)); // long buf
> + __ ldrw(off, Address(esp, wordSize)); // offset
> + __ add(buf, buf, off); // + offset
> + __ ldrw(crc, Address(esp, 4*wordSize)); // Initial CRC
> + } else {
> + __ ldr(buf, Address(esp, 2*wordSize)); // byte[] array
> + __ add(buf, buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
> + __ ldrw(off, Address(esp, wordSize)); // offset
> + __ add(buf, buf, off); // + offset
> + __ ldrw(crc, Address(esp, 3*wordSize)); // Initial CRC
> + }
> + // Can now load 'len' since we're finished with 'off'
> + __ ldrw(len, Address(esp, 0x0)); // Length
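(For reference, the expression-stack layouts being decoded here, with
esp[0] the top of stack and one slot per wordSize, are

  updateBytes:      esp[0]=len, esp[1]=off, esp[2]=b (oop),       esp[3]=crc
  updateByteBuffer: esp[0]=len, esp[1]=off, esp[2..3]=buf (long), esp[4]=crc

which is why the initial CRC sits at 3*wordSize in one case and
4*wordSize in the other.)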
> +
> + __ mov(rscratch1, lr); // saved by call_VM_leaf
> + __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32()), crc, buf, len);
> +
> + // _areturn
> + // __ mov(sp, r13); // set sp to sender sp
> + __ ret(rscratch1);
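(The lr shuffle is needed because the leaf-call sequence makes a call of
its own and so clobbers lr; stashing it in rscratch1 first lets the entry
return to its caller with ret(rscratch1). The comment "saved by
call_VM_leaf" could say so more plainly.)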
> +
> + // generate a vanilla native entry as the slow path
> + __ bind(slow_path);
> +
> + (void) generate_native_entry(false);
> +
> + return entry;
> + }
> + return generate_native_entry(false);
> +}
> +
> void InterpreterGenerator::bang_stack_shadow_pages(bool native_call) {
> // Bang each page in the shadow zone. We can't assume it's been done for
> // an interpreter frame with greater than a page of locals, so each page
> @@ -1373,6 +1493,12 @@
> case Interpreter::java_lang_math_exp : entry_point = ((InterpreterGenerator*) this)->generate_math_entry(kind); break;
> case Interpreter::java_lang_ref_reference_get
> : entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break;
> + case Interpreter::java_util_zip_CRC32_update
> + : entry_point = ((InterpreterGenerator*)this)->generate_CRC32_update_entry(); break;
> + case Interpreter::java_util_zip_CRC32_updateBytes
> + : // fall thru
> + case Interpreter::java_util_zip_CRC32_updateByteBuffer
> + : entry_point = ((InterpreterGenerator*)this)->generate_CRC32_updateBytes_entry(kind); break;
> default : ShouldNotReachHere(); break;
> }
>
> diff -r 9d3bc0f40cce -r 60fac40265fc src/cpu/aarch64/vm/vm_version_aarch64.cpp
> --- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp Wed May 14 15:43:50 2014 +0100
> +++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp Fri May 23 10:47:15 2014 +0100
> @@ -91,6 +91,10 @@
> FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
> FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
> FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
> +
> + if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
> + UseCRC32Intrinsics = true;
> + }
> }
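(A note for anyone reproducing the 4.5x figure: with this default in
place the stub can still be disabled from the command line, e.g.

  java CRCTest
  java -XX:-UseCRC32Intrinsics CRCTest

to compare the intrinsic against the ordinary path.)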
>
> void VM_Version::initialize() {
> --- CUT HERE ---
>
>