[aarch64-port-dev ] RFR: 8159063: aarch64: optimise unaligned array copy long

Andrew Dinn adinn at redhat.com
Wed Jun 22 12:18:05 UTC 2016

On 21/06/16 15:48, Andrew Haley wrote:
> On 21/06/16 15:25, Andrew Dinn wrote:
>> Andrew Haley, are you ok to go with this patch for JDK9 (once it is
>> agreed that we can proceed)?
> Yes.


Would it be possible to push this to JDK9 now? Andrew agrees that my
revised version of Ed's patch is preferable (it's provided below as an
export -- if you prefer I can provide a webrev).


Andrew Dinn
Senior Principal Software Engineer
Red Hat UK Ltd
Registered in England and Wales under Company Registration No. 03798903
Directors: Michael Cunningham, Michael ("Mike") O'Neill, Eric Shander
----- 8< -------- 8< -------- 8< -------- 8< -------- 8< -------- 8< ---
# HG changeset patch
# User enevill
# Date 1466073728 14400
#      Thu Jun 16 06:42:08 2016 -0400
# Node ID 9570d342c207e43ce262a9466eb862e68e790c54
# Parent  385eae4265489a6b9f92b6cc6ab37467c8454563
8159063: aarch64: optimise unaligned array copy long
Reviewed-by: aph, adinn

diff -r 385eae426548 -r 9570d342c207 src/cpu/aarch64/vm/globals_aarch64.hpp
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp	Wed Jun 15 12:44:20 2016 +0200
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Jun 16 06:42:08 2016 -0400
@@ -118,6 +118,7 @@
 // Don't attempt to use Neon on builtin sim until builtin sim supports it
 #define UseCRC32 false
 #define UseSIMDForMemoryOps    false
+#define AvoidUnalignedAccesses false

 #define UseBuiltinSim           false
@@ -144,6 +145,8 @@
           "Use CRC32 instructions for CRC32 computation")               \
   product(bool, UseSIMDForMemoryOps, false,                             \
           "Use SIMD instructions in generated memory move code")        \
+  product(bool, AvoidUnalignedAccesses, false,                          \
+          "Avoid generating unaligned memory accesses")                 \
   product(bool, UseLSE, false,                                          \
           "Use LSE instructions")                                       \
   product(bool, UseBlockZeroing, true,                                  \
diff -r 385eae426548 -r 9570d342c207 src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Wed Jun 15 12:44:20 2016 +0200
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Jun 16 06:42:08 2016 -0400
@@ -801,6 +801,12 @@
     StubCodeMark mark(this, "StubRoutines", stub_name);
     __ align(CodeEntryAlignment);
     __ bind(start);
+    Label unaligned_copy_long;
+    if (AvoidUnalignedAccesses) {
+      __ tbnz(d, 3, unaligned_copy_long);
+    }
     if (direction == copy_forwards) {
       __ sub(s, s, bias);
       __ sub(d, d, bias);
@@ -901,6 +907,198 @@

     __ ret(lr);
+    if (AvoidUnalignedAccesses) {
+      Label drain, again;
+      // Register order for storing. Order is different for backward copy.
+      __ bind(unaligned_copy_long);
+      // source address is even aligned, target odd aligned
+      //
+      // when forward copying word pairs we read long pairs at offsets
+      // {0, 2, 4, 6} (in long words). when backwards copying we read
+      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
+      // address by -2 in the forwards case so we can compute the
+      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
+      // or -1.
+      //
+      // when forward copying we need to store 1 word, 3 pairs and
+      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
+      // zero offset we adjust the destination by -1 which means we
+      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
+      //
+      // When backwards copying we need to store 1 word, 3 pairs and
+      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
+      // offsets {1, 3, 5, 7, 8} * unit.
+      if (direction == copy_forwards) {
+        __ sub(s, s, 16);
+        __ sub(d, d, 8);
+      }
+      // Fill 8 registers
+      //
+      // for forwards copy s was offset by -16 from the original input
+      // value of s so the register contents are at these offsets
+      // relative to the 64 byte block addressed by that original input
+      // and so on for each successive 64 byte block when s is updated
+      //
+      // t0 at offset 0,  t1 at offset 8
+      // t2 at offset 16, t3 at offset 24
+      // t4 at offset 32, t5 at offset 40
+      // t6 at offset 48, t7 at offset 56
+      // for backwards copy s was not offset so the register contents
+      // are at these offsets into the preceding 64 byte block
+      // relative to that original input and so on for each successive
+      // preceding 64 byte block when s is updated. this explains the
+      // slightly counter-intuitive looking pattern of register usage
+      // in the stp instructions for backwards copy.
+      //
+      // t0 at offset -16, t1 at offset -8
+      // t2 at offset -32, t3 at offset -24
+      // t4 at offset -48, t5 at offset -40
+      // t6 at offset -64, t7 at offset -56
+      __ ldp(t0, t1, Address(s, 2 * unit));
+      __ ldp(t2, t3, Address(s, 4 * unit));
+      __ ldp(t4, t5, Address(s, 6 * unit));
+      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+      __ subs(count, count, 16);
+      __ br(Assembler::LO, drain);
+      int prefetch = PrefetchCopyIntervalInBytes;
+      bool use_stride = false;
+      if (direction == copy_backwards) {
+         use_stride = prefetch > 256;
+         prefetch = -prefetch;
+         if (use_stride) __ mov(stride, prefetch);
+      }
+      __ bind(again);
+      if (PrefetchCopyIntervalInBytes > 0)
+        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
+      if (direction == copy_forwards) {
+	// allowing for the offset of -8 the store instructions place
+	// registers into the target 64 byte block at the following
+	// offsets
+	//
+	// t0 at offset 0
+	// t1 at offset 8,  t2 at offset 16
+	// t3 at offset 24, t4 at offset 32
+	// t5 at offset 40, t6 at offset 48
+	// t7 at offset 56
+        __ str(t0, Address(d, 1 * unit));
+        __ stp(t1, t2, Address(d, 2 * unit));
+        __ ldp(t0, t1, Address(s, 2 * unit));
+        __ stp(t3, t4, Address(d, 4 * unit));
+        __ ldp(t2, t3, Address(s, 4 * unit));
+        __ stp(t5, t6, Address(d, 6 * unit));
+        __ ldp(t4, t5, Address(s, 6 * unit));
+        __ str(t7, Address(__ pre(d, 8 * unit)));
+        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+      } else {
+	// d was not offset when we started so the registers are
+	// written into the 64 byte block preceding d with the following
+	// offsets
+	//
+	// t1 at offset -8
+	// t3 at offset -24, t0 at offset -16
+	// t5 at offset -40, t2 at offset -32
+	// t7 at offset -56, t4 at offset -48
+	//                   t6 at offset -64
+	//
+	// note that this matches the offsets previously noted for the
+	// loads
+        __ str(t1, Address(d, 1 * unit));
+        __ stp(t3, t0, Address(d, 3 * unit));
+        __ ldp(t0, t1, Address(s, 2 * unit));
+        __ stp(t5, t2, Address(d, 5 * unit));
+        __ ldp(t2, t3, Address(s, 4 * unit));
+        __ stp(t7, t4, Address(d, 7 * unit));
+        __ ldp(t4, t5, Address(s, 6 * unit));
+        __ str(t6, Address(__ pre(d, 8 * unit)));
+        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+      }
+      __ subs(count, count, 8);
+      __ br(Assembler::HS, again);
+      // Drain
+      //
+      // this uses the same pattern of offsets and register arguments
+      // as above
+      __ bind(drain);
+      if (direction == copy_forwards) {
+        __ str(t0, Address(d, 1 * unit));
+        __ stp(t1, t2, Address(d, 2 * unit));
+        __ stp(t3, t4, Address(d, 4 * unit));
+        __ stp(t5, t6, Address(d, 6 * unit));
+        __ str(t7, Address(__ pre(d, 8 * unit)));
+      } else {
+        __ str(t1, Address(d, 1 * unit));
+        __ stp(t3, t0, Address(d, 3 * unit));
+        __ stp(t5, t2, Address(d, 5 * unit));
+        __ stp(t7, t4, Address(d, 7 * unit));
+        __ str(t6, Address(__ pre(d, 8 * unit)));
+      }
+      // now we need to copy any remaining part block which may
+      // include a 4 word subblock and/or a 2 word subblock.
+      // bits 2 and 1 in the count are the tell-tale for whether we
+      // have each such subblock
+      {
+        Label L1, L2;
+        __ tbz(count, exact_log2(4), L1);
+	// this is the same as above but copying only 4 longs hence
+	// with only one intervening stp between the str instructions
+	// but note that the offsets and registers still follow the
+	// same pattern
+        __ ldp(t0, t1, Address(s, 2 * unit));
+        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
+        if (direction == copy_forwards) {
+          __ str(t0, Address(d, 1 * unit));
+          __ stp(t1, t2, Address(d, 2 * unit));
+          __ str(t3, Address(__ pre(d, 4 * unit)));
+        } else {
+          __ str(t1, Address(d, 1 * unit));
+          __ stp(t3, t0, Address(d, 3 * unit));
+          __ str(t2, Address(__ pre(d, 4 * unit)));
+        }
+        __ bind(L1);
+        __ tbz(count, 1, L2);
+	// this is the same as above but copying only 2 longs hence
+	// there is no intervening stp between the str instructions
+	// but note that the offset and register patterns are still
+	// the same
+        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
+        if (direction == copy_forwards) {
+          __ str(t0, Address(d, 1 * unit));
+          __ str(t1, Address(__ pre(d, 2 * unit)));
+        } else {
+          __ str(t1, Address(d, 1 * unit));
+          __ str(t0, Address(__ pre(d, 2 * unit)));
+        }
+        __ bind(L2);
+	// for forwards copy we need to re-adjust the offsets we
+	// applied so that s and d follow the last words written
+	if (direction == copy_forwards) {
+	  __ add(s, s, 16);
+	  __ add(d, d, 8);
+	}
+      }
+      __ ret(lr);
+      }

   // Small copy: less than 16 bytes.
@@ -1024,11 +1222,9 @@
     // (96 bytes if SIMD because we do 32 byes per instruction)
     __ bind(copy80);
     if (UseSIMDForMemoryOps) {
-      __ ldpq(v0, v1, Address(s, 0));
-      __ ldpq(v2, v3, Address(s, 32));
+      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
       __ ldpq(v4, v5, Address(send, -32));
-      __ stpq(v0, v1, Address(d, 0));
-      __ stpq(v2, v3, Address(d, 32));
+      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
       __ stpq(v4, v5, Address(dend, -32));
     } else {
       __ ldp(t0, t1, Address(s, 0));
diff -r 385eae426548 -r 9570d342c207 src/cpu/aarch64/vm/vm_version_aarch64.cpp
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Wed Jun 15 12:44:20 2016 +0200
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Jun 16 06:42:08 2016 -0400
@@ -175,7 +175,15 @@

   // Enable vendor specific features
-  if (_cpu == CPU_CAVIUM && _variant == 0) _features |= CPU_DMB_ATOMICS;
+  if (_cpu == CPU_CAVIUM) {
+    if (_variant == 0) _features |= CPU_DMB_ATOMICS;
+    if (FLAG_IS_DEFAULT(AvoidUnalignedAccesses)) {
+      FLAG_SET_DEFAULT(AvoidUnalignedAccesses, true);
+    }
+    if (FLAG_IS_DEFAULT(UseSIMDForMemoryOps)) {
+      FLAG_SET_DEFAULT(UseSIMDForMemoryOps, (_variant > 0));
+    }
+  }
  if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) _features |= CPU_A53MAC;
  if (_cpu == CPU_ARM && (_model == 0xd07 || _model2 == 0xd07)) _features |= CPU_STXR_PREFETCH;
  // If an olde style /proc/cpuinfo (cpu_lines == 1) then if _model is an A57 (0xd07)
----- 8< -------- 8< -------- 8< -------- 8< -------- 8< -------- 8< ---

More information about the hotspot-compiler-dev mailing list