[aarch64-port-dev ] Rewrite arraycopy code
Andrew Haley
aph at redhat.com
Fri Nov 22 09:25:19 PST 2013
I've probably spent far too much time on this.
The stub is allowed to clobber all the integer registers, so I make
use of them by double-buffering the copy in 64-byte blocks: the next
8 words are always being read while the previous block of 8 words is
being written, which avoids memory stalls. I've tried not to increase
the overhead for the common case of short blocks of memory.
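In rough C++ terms the bulk loop in generate_copy_longs works like the
sketch below (a hypothetical copy_words_pipelined, just to illustrate
the idea; in the stub the eight temporaries are real integer registers,
each pair of assignments is a single ldp or stp, and the same loop is
generated both forwards and backwards):

#include <cstddef>
#include <cstdint>

// Sketch only: eight temporaries stand in for the scratch registers.
// Assumes count >= 8; the leftover 0..7 words go to the small-copy tail.
static size_t copy_words_pipelined(uint64_t *d, const uint64_t *s, size_t count) {
  uint64_t t0, t1, t2, t3, t4, t5, t6, t7;

  // Prologue: fill the eight temporaries from the first 64-byte block.
  t0 = s[0]; t1 = s[1]; t2 = s[2]; t3 = s[3];
  t4 = s[4]; t5 = s[5]; t6 = s[6]; t7 = s[7];
  s += 8; count -= 8;

  // Main loop: write the previous block of 8 words while reading the
  // next block of 8, so the loads and stores overlap.
  while (count >= 8) {
    d[0] = t0; t0 = s[0];
    d[1] = t1; t1 = s[1];
    d[2] = t2; t2 = s[2];
    d[3] = t3; t3 = s[3];
    d[4] = t4; t4 = s[4];
    d[5] = t5; t5 = s[5];
    d[6] = t6; t6 = s[6];
    d[7] = t7; t7 = s[7];
    s += 8; d += 8; count -= 8;
  }

  // Drain: the block read on the final iteration is still in the
  // temporaries.  (The real stub also leaves s and d pointing at the
  // remaining words so the tail copy can pick up from there.)
  d[0] = t0; d[1] = t1; d[2] = t2; d[3] = t3;
  d[4] = t4; d[5] = t5; d[6] = t6; d[7] = t7;

  return count;   // 0..7 words left for the short-copy tail
}

The drain after the loop stores the block that was read on the last
iteration, and the low bits of count are left for the small-copy tail,
matching the postconditions documented in the patch.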
I've also fixed the prefetch code.
Next week I'll optimize the memory zeroing code used by the allocator.
Andrew.
diff -r 65546f5b752a src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Fri Nov 15 07:44:18 2013 -0500
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Fri Nov 22 17:19:52 2013 +0000
@@ -45,6 +45,10 @@
#include "opto/runtime.hpp"
#endif
+#ifdef BUILTIN_SIM
+#include "../../../../../../simulator/simulator.hpp"
+#endif
+
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp
@@ -925,70 +929,182 @@
copy_backwards = -1
} copy_direction;
- void copy_longs_small(Register s, Register d, Register count, copy_direction direction) {
- Label again, around;
- __ cbz(count, around);
+ // Bulk copy of blocks of 8 words.
+ //
+ // count is a count of words.
+ //
+ // Precondition: count >= 2
+ //
+ // Postconditions:
+ //
+ // The least significant bit of count contains the remaining count
+ // of words to copy. The rest of count is trash.
+ //
+ // s and d are adjusted to point to the remaining words to copy
+ //
+ void generate_copy_longs(Label &start, Register s, Register d, Register count,
+ copy_direction direction) {
+ int unit = wordSize * direction;
+
+ int offset;
+ const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
+ t4 = r7, t5 = r10, t6 = r11, t7 = r12;
+
+ assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
+ assert_different_registers(s, d, count, rscratch1);
+
+ Label again, large, small;
+ __ align(6);
+ __ bind(start);
+ __ cmp(count, 8);
+ __ br(Assembler::LO, small);
+ if (direction == copy_forwards) {
+ __ sub(s, s, 2 * wordSize);
+ __ sub(d, d, 2 * wordSize);
+ }
+ __ subs(count, count, 16);
+ __ br(Assembler::GE, large);
+
+ // 8 <= count < 16 words. Copy 8.
+ __ ldp(t0, t1, Address(s, 2 * unit));
+ __ ldp(t2, t3, Address(s, 4 * unit));
+ __ ldp(t4, t5, Address(s, 6 * unit));
+ __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+
+ __ stp(t0, t1, Address(d, 2 * unit));
+ __ stp(t2, t3, Address(d, 4 * unit));
+ __ stp(t4, t5, Address(d, 6 * unit));
+ __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
+
+ if (direction == copy_forwards) {
+ __ add(s, s, 2 * wordSize);
+ __ add(d, d, 2 * wordSize);
+ }
+
+ {
+ Label L1, L2;
+ __ bind(small);
+ __ tbz(count, exact_log2(4), L1);
+ __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ bind(L1);
+
+ __ tbz(count, 1, L2);
+ __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ bind(L2);
+ }
+
+ __ ret(lr);
+
+ __ align(6);
+ __ bind(large);
+
+ // Fill 8 registers
+ __ ldp(t0, t1, Address(s, 2 * unit));
+ __ ldp(t2, t3, Address(s, 4 * unit));
+ __ ldp(t4, t5, Address(s, 6 * unit));
+ __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+
__ bind(again);
- __ ldr(r16, Address(__ adjust(s, wordSize * direction, direction == copy_backwards)));
- __ str(r16, Address(__ adjust(d, wordSize * direction, direction == copy_backwards)));
- __ sub(count, count, 1);
- __ cbnz(count, again);
- __ bind(around);
+
+ if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0)
+ __ prfm(PLDL1KEEP, Address(s, PrefetchCopyIntervalInBytes));
+
+ __ stp(t0, t1, Address(d, 2 * unit));
+ __ ldp(t0, t1, Address(s, 2 * unit));
+ __ stp(t2, t3, Address(d, 4 * unit));
+ __ ldp(t2, t3, Address(s, 4 * unit));
+ __ stp(t4, t5, Address(d, 6 * unit));
+ __ ldp(t4, t5, Address(s, 6 * unit));
+ __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
+ __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+
+ __ subs(count, count, 8);
+ __ br(Assembler::HS, again);
+
+ // Drain
+ __ stp(t0, t1, Address(d, 2 * unit));
+ __ stp(t2, t3, Address(d, 4 * unit));
+ __ stp(t4, t5, Address(d, 6 * unit));
+ __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
+
+ if (direction == copy_forwards) {
+ __ add(s, s, 2 * wordSize);
+ __ add(d, d, 2 * wordSize);
+ }
+
+ {
+ Label L1, L2;
+ __ tbz(count, exact_log2(4), L1);
+ __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ bind(L1);
+
+ __ tbz(count, 1, L2);
+ __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ bind(L2);
+ }
+
+ __ ret(lr);
}
- void copy_longs(Register s, Register d, Register count, Register tmp, copy_direction direction) {
- __ andr(tmp, count, 3);
- copy_longs_small(s, d, tmp, direction);
- __ andr(count, count, -4);
+ // Small copy: less than 16 bytes.
+ //
+ // NB: Ignores all of the bits of count which represent more than 15
+ // bytes, so a caller doesn't have to mask them.
- Label again, around;
- __ cbz(count, around);
- __ push(r18->bit() | r19->bit(), sp);
- __ bind(again);
- if (direction != copy_backwards) {
- __ prfm(Address(s, direction == copy_forwards ? 4 * wordSize : -6 * wordSize));
- __ prfm(Address(s, direction == copy_forwards ? 6 * wordSize : -8 * wordSize));
+ void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
+ bool is_backwards = step < 0;
+ size_t granularity = abs(step);
+ int direction = is_backwards ? -1 : 1;
+ int unit = wordSize * direction;
+
+ Label Lpair, Lword, Lint, Lshort, Lbyte;
+
+ assert(granularity
+ && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
+
+ const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
+
+ // ??? I don't know if this bit-test-and-branch is the right thing
+ // to do. It does a lot of jumping, resulting in several
+ // mispredicted branches. It might make more sense to do this
+ // with something like Duff's device with a single computed branch.
+
+ __ tbz(count, 3 - exact_log2(granularity), Lword);
+ __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
+ __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
+ __ bind(Lword);
+
+ if (granularity <= sizeof (jint)) {
+ __ tbz(count, 2 - exact_log2(granularity), Lint);
+ __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
+ __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
+ __ bind(Lint);
}
- __ ldp(r16, r17, Address(__ adjust(s, 2 * wordSize * direction, direction == copy_backwards)));
- __ ldp(r18, r19, Address(__ adjust(s, 2 * wordSize * direction, direction == copy_backwards)));
- __ stp(r16, r17, Address(__ adjust(d, 2 * wordSize * direction, direction == copy_backwards)));
- __ stp(r18, r19, Address(__ adjust(d, 2 * wordSize * direction, direction == copy_backwards)));
- __ subs(count, count, 4);
- __ cbnz(count, again);
- __ pop(r18->bit() | r19->bit(), sp);
- __ bind(around);
+
+ if (granularity <= sizeof (jshort)) {
+ __ tbz(count, 1 - exact_log2(granularity), Lshort);
+ __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
+ __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
+ __ bind(Lshort);
+ }
+
+ if (granularity <= sizeof (jbyte)) {
+ __ tbz(count, 0, Lbyte);
+ __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
+ __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
+ __ bind(Lbyte);
+ }
}
- void copy_memory_small(Register s, Register d, Register count, Register tmp, int step, Label &done) {
- // Small copy: less than one word.
- bool is_backwards = step < 0;
- int granularity = abs(step);
-
- __ cbz(count, done);
- {
- Label loop;
- __ bind(loop);
- switch (granularity) {
- case 1:
- __ ldrb(tmp, Address(__ adjust(s, step, is_backwards)));
- __ strb(tmp, Address(__ adjust(d, step, is_backwards)));
- break;
- case 2:
- __ ldrh(tmp, Address(__ adjust(s, step, is_backwards)));
- __ strh(tmp, Address(__ adjust(d, step, is_backwards)));
- break;
- case 4:
- __ ldrw(tmp, Address(__ adjust(s, step, is_backwards)));
- __ strw(tmp, Address(__ adjust(d, step, is_backwards)));
- break;
- default:
- assert(false, "copy_memory called with impossible step");
- }
- __ sub(count, count, 1);
- __ cbnz(count, loop);
- __ b(done);
- }
- }
+ Label copy_f, copy_b;
// All-singing all-dancing memory copy.
//
@@ -1002,72 +1118,90 @@
copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
bool is_backwards = step < 0;
int granularity = abs(step);
+ const Register t0 = r3, t1 = r4;
if (is_backwards) {
__ lea(s, Address(s, count, Address::uxtw(exact_log2(-step))));
__ lea(d, Address(d, count, Address::uxtw(exact_log2(-step))));
}
- if (granularity == wordSize) {
- copy_longs(c_rarg0, c_rarg1, c_rarg2, rscratch1, direction);
- return;
+ Label done, tail;
+
+ __ cmp(count, 16/granularity);
+ __ br(Assembler::LO, tail);
+
+ // Now we've got the small case out of the way we can align the
+ // source address on a 2-word boundary.
+
+ Label aligned;
+
+ if (is_aligned) {
+ // We may have to adjust by 1 word to get s 2-word-aligned.
+ __ tbz(s, exact_log2(wordSize), aligned);
+ __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
+ __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
+ __ sub(count, count, wordSize/granularity);
+ } else {
+ if (is_backwards) {
+ __ andr(rscratch2, s, 2 * wordSize - 1);
+ } else {
+ __ neg(rscratch2, s);
+ __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
+ }
+ // rscratch2 is the byte adjustment needed to align s.
+ __ cbz(rscratch2, aligned);
+ __ lsr(rscratch2, rscratch2, exact_log2(granularity));
+ __ sub(count, count, rscratch2);
+
+#if 0
+ // ?? This code is only correct for a disjoint copy. It may or
+ // may not make sense to use it in that case.
+
+ // Copy the first pair; s and d may not be aligned.
+ __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
+ __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
+
+ // Align s and d, adjust count
+ if (is_backwards) {
+ __ sub(s, s, rscratch2);
+ __ sub(d, d, rscratch2);
+ } else {
+ __ add(s, s, rscratch2);
+ __ add(d, d, rscratch2);
+ }
+#else
+ copy_memory_small(s, d, rscratch2, rscratch1, step);
+#endif
}
- Label done, large;
+ __ cmp(count, 16/granularity);
+ __ br(Assembler::LT, tail);
+ __ bind(aligned);
- if (! is_aligned) {
- __ cmp(count, wordSize/granularity);
- __ br(Assembler::HS, large);
- copy_memory_small(s, d, count, tmp, step, done);
- __ bind(large);
-
- // Now we've got the small case out of the way we can align the
- // source address.
- {
- Label skip1, skip2, skip4;
-
- switch (granularity) {
- case 1:
- __ tst(s, 1);
- __ br(Assembler::EQ, skip1);
- __ ldrb(tmp, Address(__ adjust(s, direction, is_backwards)));
- __ strb(tmp, Address(__ adjust(d, direction, is_backwards)));
- __ sub(count, count, 1);
- __ bind(skip1);
- // fall through
- case 2:
- __ tst(s, 2/granularity);
- __ br(Assembler::EQ, skip2);
- __ ldrh(tmp, Address(__ adjust(s, 2 * direction, is_backwards)));
- __ strh(tmp, Address(__ adjust(d, 2 * direction, is_backwards)));
- __ sub(count, count, 2/granularity);
- __ bind(skip2);
- // fall through
- case 4:
- __ tst(s, 4/granularity);
- __ br(Assembler::EQ, skip4);
- __ ldrw(tmp, Address(__ adjust(s, 4 * direction, is_backwards)));
- __ strw(tmp, Address(__ adjust(d, 4 * direction, is_backwards)));
- __ sub(count, count, 4/granularity);
- __ bind(skip4);
- }
- }
- }
-
- // s is now word-aligned.
+ // s is now 2-word-aligned.
// We have a count of units and some trailing bytes. Adjust the
// count and do a bulk copy of words.
__ lsr(rscratch2, count, exact_log2(wordSize/granularity));
- __ sub(count, count, rscratch2, Assembler::LSL, exact_log2(wordSize/granularity));
-
- copy_longs(s, d, rscratch2, rscratch1, direction);
+ if (direction == copy_forwards)
+ __ bl(copy_f);
+ else
+ __ bl(copy_b);
// And the tail.
- copy_memory_small(s, d, count, tmp, step, done);
+ __ bind(tail);
+ copy_memory_small(s, d, count, tmp, step);
+ }
- __ bind(done);
+
+ void clobber_registers() {
+#ifdef ASSERT
+ __ mov(rscratch1, (uint64_t)0xdeadbeef);
+ __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
+ for (Register r = r3; r <= r18; r++)
+ if (r != rscratch1) __ mov(r, rscratch1);
+#endif
}
// Scan over array at a for count oops, verifying each one.
@@ -1122,7 +1256,6 @@
BLOCK_COMMENT("Entry:");
}
__ enter();
- __ push(r16->bit() | r17->bit(), sp);
if (is_oop) {
__ push(d->bit() | count->bit(), sp);
// no registers are destroyed by this call
@@ -1137,9 +1270,14 @@
__ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
gen_write_ref_array_post_barrier(d, count, rscratch1);
}
- __ pop(r16->bit() | r17->bit(), sp);
__ leave();
__ ret(lr);
+#ifdef BUILTIN_SIM
+ {
+ AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
+ sim->notifyCompile(const_cast<char*>(name), start);
+ }
+#endif
return start;
}
@@ -1170,7 +1308,6 @@
__ br(Assembler::LS, nooverlap_target);
__ enter();
- __ push(r16->bit() | r17->bit(), sp);
if (is_oop) {
__ push(d->bit() | count->bit(), sp);
// no registers are destroyed by this call
@@ -1185,10 +1322,14 @@
__ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
gen_write_ref_array_post_barrier(d, count, rscratch1);
}
- __ pop(r16->bit() | r17->bit(), sp);
__ leave();
__ ret(lr);
-
+#ifdef BUILTIN_SIM
+ {
+ AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
+ sim->notifyCompile(const_cast<char*>(name), start);
+ }
+#endif
return start;
}
@@ -1637,6 +1778,9 @@
address entry_jlong_arraycopy;
address entry_checkcast_arraycopy;
+ generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
+ generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
+
//*** jbyte
// Always need aligned and unaligned versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
diff -r 65546f5b752a src/cpu/aarch64/vm/vm_version_aarch64.cpp
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp Fri Nov 15 07:44:18 2013 -0500
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp Fri Nov 22 17:19:52 2013 +0000
@@ -90,6 +90,7 @@
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
+ FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
}
void VM_Version::initialize() {
diff -r 65546f5b752a src/os_cpu/linux_aarch64/vm/prefetch_linux_aarch64.inline.hpp
--- a/src/os_cpu/linux_aarch64/vm/prefetch_linux_aarch64.inline.hpp Fri Nov 15 07:44:18 2013 -0500
+++ b/src/os_cpu/linux_aarch64/vm/prefetch_linux_aarch64.inline.hpp Fri Nov 22 17:19:52 2013 +0000
@@ -30,13 +30,15 @@
inline void Prefetch::read (void *loc, intx interval) {
#ifndef BUILTIN_SIM
- asm("prfm PLDL1KEEP, [%0, %1]" : : "r"(loc), "r"(interval));
+ if (interval >= 0)
+ asm("prfm PLDL1KEEP, [%0, %1]" : : "r"(loc), "r"(interval));
#endif
}
inline void Prefetch::write(void *loc, intx interval) {
#ifndef BUILTIN_SIM
- asm("prfm PSTL1KEEP, [%0, %1]" : : "r"(loc), "r"(interval));
+ if (interval >= 0)
+ asm("prfm PSTL1KEEP, [%0, %1]" : : "r"(loc), "r"(interval));
#endif
}