[aarch64-port-dev ] Whether or not to revert the checkcast_arraycopy stub to generate the return value expected by generic C2 code
Andrew Dinn
adinn at redhat.com
Tue Oct 15 07:12:56 PDT 2013
The AArch64 checkcast_arraycopy stub has been implemented to employ
different semantics for its return value from those employed by x86,
and this is running afoul of the expectations built into the generic
layer of C2.
Specifically, on x86 the return value is

  0    : if all values were copied
  -1^K : if a partial copy of K values occurred -- equivalently, -(K+1)

On AArch64 the return value is

  0 : if all values were copied
  K : if there are K values uncopied

(yes, that first rule is really redundant -- it is just the K == 0 case)
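A quick worked example with made-up numbers (length == 10, type check
failure after 7 oops copied), just to show the two encodings side by
side:

    #include <cstdio>

    int main() {
      int length = 10;
      int copied = 7;                     // K, the partial transfer count
      int x86_res = -1 ^ copied;          // == ~7 == -(7+1) == -8
      int aarch64_res = length - copied;  // == 3 oops still uncopied
      std::printf("x86: %d, aarch64: %d\n", x86_res, aarch64_res);
      return 0;
    }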
The problem is that the generic code in the C2 compiler implements the
call to the stub by generating ideal code which assumes the former
return value semantics, i.e. it constructs a test node
(If (Cmp res zero) cr)
to test the return value and then in the false branch generates
(Set copied (XorI res -1))
and uses the result to feed some arithmetic nodes
(SubI length copied)
(AddI srcpos copied)
(AddI dstpos copied)
which it then passes into a runtime call
(CallRuntime "OptoRuntime::slow_arraycopy_Type")
So, we have two options here: revert the stub so it returns 0 or -1^K,
or edit the generic layer code to include an AArch64-specific
compilation path which expects the semantics the stub currently
implements (and which the C1 back end already relies on).
I have implemented the changes needed to revert the behaviour of the
stub and the diff is attached below.
Changing the generic code is not complex. It requires planting the same
branch
(If (Cmp res zero) cr)
replacing the XorINode with
(Set copied (SubI length res))
and feeding this into the same 3 arithmetic nodes
(SubI length copied)
(AddI srcpos copied)
(AddI dstpos copied)
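In the same C-like sketch as above, the only difference is how the
copied count is recovered:

    int res = checkcast_arraycopy(src, src_pos, dst, dst_pos, length);
    if (res != 0) {                 // same (If (Cmp res zero) cr) branch
      int copied = length - res;    // (Set copied (SubI length res))
      length  -= copied;            // then the same three arithmetic nodes
      src_pos += copied;
      dst_pos += copied;
      slow_arraycopy(src, src_pos, dst, dst_pos, length);
    }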
The issue is whether we want to introduce an AARCH64 conditional
compilation path into the generic layer. If we avoid that and revert to
the original x86 semantics then that adds a tad more overhead when
running the stub (but not much). The difference in the calling code is
trivial and zero-cost in both the C1 and C2 cases (swap an eonw for
either a mov or a sub).
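To make that concrete, recovering the copied count at the C1 call site
is one instruction under either semantics (macro-assembler sketch
only):

    // current semantics: r0 holds the count of oops remaining
    __ subw(rscratch1, length, r0);   // copied = length - remaining
    // reverted semantics: r0 holds -1^K, so invert to recover K
    __ eonw(rscratch1, r0, zr);       // copied = ~r0 == K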
regards,
Andrew Dinn
-----------
diff -r 75997cf311bb src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Tue Oct 15 14:16:04 2013 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Tue Oct 15 14:30:27 2013 +0100
@@ -2223,10 +2223,12 @@
__ ldr(src, Address(sp, 4*BytesPerWord));
if (copyfunc_addr != NULL) {
- __ subw(rscratch1, length, r0); // Number of oops actually copied
+ // r0 is -1^K where K == partial copied count
+ __ eonw(rscratch1, r0, zr);
+ // adjust length down and src/dst pos up by partial copied count
+ __ subw(length, length, rscratch1);
__ addw(src_pos, src_pos, rscratch1);
__ addw(dst_pos, dst_pos, rscratch1);
- __ mov(length, r0); // Number of oops left to copy
}
__ b(*stub->entry());
@@ -2401,10 +2403,12 @@
__ ldp(length, src_pos, Address(sp, 2*BytesPerWord));
__ ldr(src, Address(sp, 4*BytesPerWord));
- __ subw(rscratch1, length, r0); // Number of oops actually copied
+ // return value is -1^K where K is partial copied count
+ __ eonw(rscratch1, r0, zr);
+ // adjust length down and src/dst pos up by partial copied count
+ __ subw(length, length, rscratch1);
__ addw(src_pos, src_pos, rscratch1);
__ addw(dst_pos, dst_pos, rscratch1);
- __ mov(length, r0); // Number of oops left to copy
}
__ b(*stub->entry());
diff -r 75997cf311bb src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Tue Oct 15 14:16:04 2013 +0100
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Tue Oct 15 14:30:27 2013 +0100
@@ -1447,21 +1447,23 @@
// c_rarg4 - oop ckval (super_klass)
//
// Output:
- // r0 - count of oops remaining to copy
+ // r0 == 0 - success
+ // r0 == -1^K - failure, where K is partial transfer count
//
address generate_checkcast_copy(const char *name, address *entry,
bool dest_uninitialized = false) {
- Label L_load_element, L_store_element, L_do_card_marks, L_done;
+ Label L_load_element, L_store_element, L_do_card_marks, L_setup, L_cleanup, L_done;
// Input registers (after setup_arg_regs)
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
- const Register count = c_rarg2; // elements count
+ const Register count_orig = c_rarg2; // orig elements count
const Register ckoff = c_rarg3; // super_check_offset
const Register ckval = c_rarg4; // super_klass
// Registers used as temps (r18, r19, r20 are save-on-entry)
+ const Register count = r21; // loop counter
const Register start_to = r20; // destination array start address
const Register copied_oop = r18; // actual oop copied
const Register r19_klass = r19; // oop._klass
@@ -1473,8 +1475,8 @@
// of the source type. Each element must be separately
// checked.
- assert_different_registers(from, to, count, ckoff, ckval, start_to,
- copied_oop, r19_klass);
+ assert_different_registers(from, to, count_orig, ckoff, ckval, start_to,
+ copied_oop, r19_klass, count);
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
@@ -1498,10 +1500,13 @@
BLOCK_COMMENT("Entry:");
}
- // Empty array: Nothing to do.
- __ cbz(count, L_done);
+ __ cbnz(count_orig, L_setup);
- __ push(r18->bit() | r19->bit() | r20->bit(), sp);
+ __ mov(r0, zr);
+ __ b(L_done);
+
+ __ bind(L_setup);
+ __ push(r18->bit() | r19->bit() | r20->bit() | r21->bit(), sp);
#ifdef ASSERT
BLOCK_COMMENT("assert consistent ckoff/ckval");
@@ -1517,6 +1522,9 @@
}
#endif //ASSERT
+ // the loop counts up from -count_orig to zero
+ __ sub(count, zr, count_orig);
+
// Copy from low to high addresses
__ mov(start_to, to); // Save destination array start address
__ b(L_load_element);
@@ -1524,7 +1532,7 @@
// ======== begin loop ========
// (Loop is rotated; its entry is L_load_element.)
// Loop control:
- // for (; count != 0; count--) {
+ // for (count=-length; count != 0; count++) {
// copied_oop = load_heap_oop(from++);
// ... generate_type_check ...;
// store_heap_oop(to++, copied_oop);
@@ -1533,7 +1541,7 @@
__ BIND(L_store_element);
__ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop
- __ sub(count, count, 1);
+ __ add(count, count, 1);
__ cbz(count, L_do_card_marks);
// ======== loop entry is here ========
@@ -1545,23 +1553,30 @@
generate_type_check(r19_klass, ckoff, ckval, L_store_element);
// ======== end loop ========
+ // exit point for failure
// It was a real error; we must depend on the caller to finish the job.
- // Register r0 = number of *remaining* oops
+ // Register count = -1 * remaining oops, count_orig = total oops.
// Emit GC store barriers for the oops we have copied and report
// their number to the caller.
- DEBUG_ONLY(__ nop());
+ __ add(to, to, -heapOopSize); // make an inclusive end pointer
+ gen_write_ref_array_post_barrier(start_to, to, rscratch1);
+ __ add(r0, count_orig, count); // K = partially copied oop count
+ __ eon(r0, r0, zr); // report (-1^K) to caller
+ __ b(L_cleanup);
- // Common exit point (success or failure).
+ // exit point for success.
__ BIND(L_do_card_marks);
__ add(to, to, -heapOopSize); // make an inclusive end pointer
gen_write_ref_array_post_barrier(start_to, to, rscratch1);
- __ pop(r18->bit() | r19->bit() | r20->bit(), sp);
+ __ mov(r0, 0);
+ // exit cleanup for success and failure
+ __ bind(L_cleanup);
+ __ pop(r18->bit() | r19->bit() | r20->bit() | r21->bit(), sp);
inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
__ bind(L_done);
- __ mov(r0, count); // report count remaining to caller
__ leave();
__ ret(lr);