[aarch64-port-dev ] Improve code generation for volatile operations and other barriers
Andrew Haley
aph at redhat.com
Thu May 22 13:41:47 UTC 2014
Memory barriers have been by far the hardest thing to get right on
this port, and have been changed several times. The problem is that
we want to use load acquire and store release instructions, but the
compilers prefer barriers. We cannot simply emit acq/rel instructions
and elide the barriers, because some barriers have no associated load
or store (e.g. Unsafe fences); those we must still emit.
I have solved this problem differently in C1 and C2. In C1 I have
given up trying to use ld.acq and st.rel, and instead emit
conventional loads and stores along with separate barrier
instructions, roughly as in the sketch below.
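To illustrate, here is a minimal sketch of that pattern (hand-written
for this note with assumed helper names, not code from the patch; the
real C1 changes are in the changeset below):

#define __ masm->

// Hypothetical sketch only: a volatile load becomes an ordinary load
// followed by an explicit acquire barrier, instead of an ldar.
static void volatile_load_sketch(MacroAssembler *masm,
                                 Register dst, Address src) {
  __ ldr(dst, src);                                      // plain load
  __ membar(Assembler::LoadLoad | Assembler::LoadStore); // acquire
}

// Likewise a volatile store: an explicit release barrier, then an
// ordinary store, instead of an stlr.
static void volatile_store_sketch(MacroAssembler *masm,
                                  Register val, Address dst) {
  __ membar(Assembler::LoadStore | Assembler::StoreStore); // release
  __ str(val, dst);                                        // plain store
}

#undef __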
In C2 I've written some logic to walk the ideal graph from the
barrier (forwards or backwards) and find the associated load or
store. If that access is ordered (i.e. it will generate an acq/rel
instruction) I elide the barrier.
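The gist of the backward walk for the acquire case, condensed from
the patch below:

// Starting at a MemBarAcquire, find the access it orders; if that
// access is an ordered load (one that will emit ldar), the barrier
// is redundant and can be elided.
bool preceded_by_ordered_load(const Node *barrier) {
  Node *x = barrier->lookup(TypeFunc::Parms); // the guarded access
  if (!x) return false;
  if (x->is_DecodeNarrowPtr())
    x = x->in(1);                             // look through narrow-oop decode
  return x->is_Load() && !x->as_Load()->is_unordered();
}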
I also fixed a couple of thinkos.
Andrew.
# HG changeset patch
# User aph
# Date 1400757693 14400
# Thu May 22 07:21:33 2014 -0400
# Node ID 0be4629243a868f0d4375b5cb8aff77b25b134b3
# Parent 5f4d7f52afc875fab4af0c68c3b657a2e8bd7283
Improve code generation for volatile operations and other barriers.
diff -r 5f4d7f52afc8 -r 0be4629243a8 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Wed May 21 13:00:24 2014 -0400
+++ b/src/cpu/aarch64/vm/aarch64.ad Thu May 22 07:21:33 2014 -0400
@@ -753,53 +753,43 @@
}
};
- // Returns true if Node n is followed by a MemBar node that
- // will do an acquire. If so, this node must not do the acquire
- // operation.
- bool followed_by_acquire(const Node *n);
+ bool followed_by_ordered_store(const Node *barrier);
+ bool preceded_by_ordered_load(const Node *barrier);
+
%}
source %{
-// Optimize load-acquire.
-//
-// Check if acquire is unnecessary due to following operation that does
-// acquire anyways.
-// Walk the pattern:
-//
-// n: Load.acq
-// |
-// MemBarAcquire
-// | |
-// Proj(ctrl) Proj(mem)
-// | |
-// MemBarRelease/Volatile
-//
-bool followed_by_acquire(const Node *load) {
- if (!load->is_Load())
+ // AArch64 has load acquire and store release instructions which we
+ // use for ordered memory accesses, e.g. for volatiles. The ideal
+ // graph generator also inserts memory barriers around volatile
+ // accesses, and we don't want to generate both barriers and acq/rel
+ // instructions. So, when we emit a MemBarAcquire we look back in
+ // the ideal graph for an ordered load and only emit the barrier if
+ // we don't find one.
+
+bool preceded_by_ordered_load(const Node *barrier) {
+ Node *x = barrier->lookup(TypeFunc::Parms);
+
+ if (! x)
return false;
- // Find MemBarAcquire.
- const Node *mba = NULL;
- for (DUIterator_Fast imax, i = load->fast_outs(imax); i < imax; i++) {
- const Node *out = load->fast_out(i);
- if (out->Opcode() == Op_MemBarAcquire) {
- if (out->in(0) == load) continue; // Skip control edge, membar should be found via precedence edge.
- mba = out;
- break;
- }
- }
- if (!mba) return false;
-
- // Find following MemBar node.
+ if (x->is_DecodeNarrowPtr())
+ x = x->in(1);
+
+ if (x->is_Load())
+ return ! x->as_Load()->is_unordered();
+
+ return false;
+}
+
+bool followed_by_ordered_store(const Node *barrier) {
+
+ // Find following mem node.
//
- // The following node must be reachable by control AND memory
- // edge to assure no other operations are in between the two nodes.
- //
- // So first get the Proj node, mem_proj, to use it to iterate forward.
Node *mem_proj = NULL;
- for (DUIterator_Fast imax, i = mba->fast_outs(imax); i < imax; i++) {
- mem_proj = mba->fast_out(i); // Throw out-of-bounds if proj not found
+ for (DUIterator_Fast imax, i = barrier->fast_outs(imax); i < imax; i++) {
+ mem_proj = barrier->fast_out(i); // Throw out-of-bounds if proj not found
assert(mem_proj->is_Proj(), "only projections here");
ProjNode *proj = mem_proj->as_Proj();
if (proj->_con == TypeFunc::Memory &&
@@ -808,22 +798,11 @@
}
assert(mem_proj->as_Proj()->_con == TypeFunc::Memory, "Graph broken");
- // Search MemBar behind Proj. If there are other memory operations
- // behind the Proj we lost.
+ // Search behind Proj.
for (DUIterator_Fast jmax, j = mem_proj->fast_outs(jmax); j < jmax; j++) {
Node *x = mem_proj->fast_out(j);
- // Proj might have an edge to a store or load node which precedes the membar.
- if (x->is_Mem()) return false;
-
- int xop = x->Opcode();
- if (xop == Op_MemBarVolatile) {
- // Make sure we're not missing Call/Phi/MergeMem by checking
- // control edges. The control edge must directly lead back
- // to the MemBarAcquire
- Node *ctrl_proj = x->in(0);
- if (ctrl_proj->is_Proj() && ctrl_proj->in(0) == mba) {
- return true;
- }
+ if (x->is_Store() && ! x->as_Store()->is_unordered()) {
+ return true;
}
}
@@ -2352,13 +2331,12 @@
}
Label retry_load, done;
__ bind(retry_load);
- __ ldaxr(rscratch1, addr_reg);
+ __ ldar(rscratch1, addr_reg);
__ cmp(rscratch1, old_reg);
__ br(Assembler::NE, done);
__ stlxr(rscratch1, new_reg, addr_reg);
__ cbnzw(rscratch1, retry_load);
__ bind(done);
- __ membar(__ AnyAny);
%}
enc_class aarch64_enc_cmpxchgw(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
@@ -2392,13 +2370,12 @@
}
Label retry_load, done;
__ bind(retry_load);
- __ ldaxrw(rscratch1, addr_reg);
+ __ ldarw(rscratch1, addr_reg);
__ cmpw(rscratch1, old_reg);
__ br(Assembler::NE, done);
__ stlxrw(rscratch1, new_reg, addr_reg);
__ cbnzw(rscratch1, retry_load);
__ bind(done);
- __ membar(__ AnyAny);
%}
// auxiliary used for CompareAndSwapX to set result register
@@ -4748,7 +4725,7 @@
instruct loadB(iRegINoSp dst, memory mem)
%{
match(Set dst (LoadB mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrsbw $dst, $mem\t# byte" %}
@@ -4762,7 +4739,7 @@
instruct loadB2L(iRegLNoSp dst, memory mem)
%{
match(Set dst (ConvI2L (LoadB mem)));
- predicate(n->in(1)->as_Load()->is_unordered() || followed_by_acquire(n->in(1)));
+ predicate(n->in(1)->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrsb $dst, $mem\t# byte" %}
@@ -4776,7 +4753,7 @@
instruct loadUB(iRegINoSp dst, memory mem)
%{
match(Set dst (LoadUB mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrbw $dst, $mem\t# byte" %}
@@ -4790,7 +4767,7 @@
instruct loadUB2L(iRegLNoSp dst, memory mem)
%{
match(Set dst (ConvI2L (LoadUB mem)));
- predicate(n->in(1)->as_Load()->is_unordered() || followed_by_acquire(n->in(1)));
+ predicate(n->in(1)->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrb $dst, $mem\t# byte" %}
@@ -4804,7 +4781,7 @@
instruct loadS(iRegINoSp dst, memory mem)
%{
match(Set dst (LoadS mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrshw $dst, $mem\t# short" %}
@@ -4818,7 +4795,7 @@
instruct loadS2L(iRegLNoSp dst, memory mem)
%{
match(Set dst (ConvI2L (LoadS mem)));
- predicate(n->in(1)->as_Load()->is_unordered() || followed_by_acquire(n->in(1)));
+ predicate(n->in(1)->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrsh $dst, $mem\t# short" %}
@@ -4832,7 +4809,7 @@
instruct loadUS(iRegINoSp dst, memory mem)
%{
match(Set dst (LoadUS mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrh $dst, $mem\t# short" %}
@@ -4846,7 +4823,7 @@
instruct loadUS2L(iRegLNoSp dst, memory mem)
%{
match(Set dst (ConvI2L (LoadUS mem)));
- predicate(n->in(1)->as_Load()->is_unordered() || followed_by_acquire(n->in(1)));
+ predicate(n->in(1)->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrh $dst, $mem\t# short" %}
@@ -4860,7 +4837,7 @@
instruct loadI(iRegINoSp dst, memory mem)
%{
match(Set dst (LoadI mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrw $dst, $mem\t# int" %}
@@ -4874,7 +4851,7 @@
instruct loadI2L(iRegLNoSp dst, memory mem)
%{
match(Set dst (ConvI2L (LoadI mem)));
- predicate(n->in(1)->as_Load()->is_unordered() || followed_by_acquire(n->in(1)));
+ predicate(n->in(1)->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrsw $dst, $mem\t# int" %}
@@ -4888,8 +4865,7 @@
instruct loadUI2L(iRegLNoSp dst, memory mem, immL_32bits mask)
%{
match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
- predicate(n->in(1)->in(1)->as_Load()->is_unordered()
- || followed_by_acquire(n->in(1)->in(1)));
+ predicate(n->in(1)->in(1)->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrw $dst, $mem\t# int" %}
@@ -4903,7 +4879,7 @@
instruct loadL(iRegLNoSp dst, memory mem)
%{
match(Set dst (LoadL mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldr $dst, $mem\t# int" %}
@@ -4930,7 +4906,7 @@
instruct loadP(iRegPNoSp dst, memory mem)
%{
match(Set dst (LoadP mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldr $dst, $mem\t# ptr" %}
@@ -4944,7 +4920,7 @@
instruct loadN(iRegNNoSp dst, memory mem)
%{
match(Set dst (LoadN mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrw $dst, $mem\t# compressed ptr" %}
@@ -4958,7 +4934,7 @@
instruct loadKlass(iRegPNoSp dst, memory mem)
%{
match(Set dst (LoadKlass mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldr $dst, $mem\t# class" %}
@@ -4972,7 +4948,7 @@
instruct loadNKlass(iRegNNoSp dst, memory mem)
%{
match(Set dst (LoadNKlass mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrw $dst, $mem\t# compressed class ptr" %}
@@ -4986,7 +4962,7 @@
instruct loadF(vRegF dst, memory mem)
%{
match(Set dst (LoadF mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrs $dst, $mem\t# float" %}
@@ -5000,7 +4976,7 @@
instruct loadD(vRegD dst, memory mem)
%{
match(Set dst (LoadD mem));
- predicate(n->as_Load()->is_unordered() || followed_by_acquire(n));
+ predicate(n->as_Load()->is_unordered());
ins_cost(4 * INSN_COST);
format %{ "ldrd $dst, $mem\t# double" %}
@@ -5821,7 +5797,7 @@
match(LoadFence);
ins_cost(VOLATILE_REF_COST);
- format %{ "membar_acquire" %}
+ format %{ "load_fence" %}
ins_encode %{
__ membar(Assembler::LoadLoad|Assembler::LoadStore);
@@ -5830,6 +5806,7 @@
%}
instruct unnecessary_membar_acquire() %{
+ predicate(preceded_by_ordered_load(n));
match(MemBarAcquire);
ins_cost(0);
@@ -5842,6 +5819,20 @@
ins_pipe(pipe_class_memory);
%}
+instruct membar_acquire() %{
+ match(MemBarAcquire);
+ ins_cost(VOLATILE_REF_COST);
+
+ format %{ "membar_acquire" %}
+
+ ins_encode %{
+ __ membar(Assembler::LoadLoad|Assembler::LoadStore);
+ %}
+
+ ins_pipe(pipe_class_memory);
+%}
+
+
instruct membar_acquire_lock() %{
match(MemBarAcquireLock);
ins_cost(VOLATILE_REF_COST);
@@ -5862,19 +5853,32 @@
format %{ "store_fence" %}
ins_encode %{
- __ membar(Assembler::StoreLoad|Assembler::StoreStore);
+ __ membar(Assembler::LoadStore|Assembler::StoreStore);
+ %}
+ ins_pipe(pipe_class_memory);
+%}
+
+instruct unnecessary_membar_release() %{
+ match(MemBarRelease);
+ predicate(followed_by_ordered_store(n));
+ ins_cost(0);
+
+ format %{ "membar_release (elided)" %}
+
+ ins_encode %{
+ __ block_comment("membar_release (elided)");
%}
ins_pipe(pipe_class_memory);
%}
instruct membar_release() %{
match(MemBarRelease);
- ins_cost(0);
-
- format %{ "membar_release (elided)" %}
-
- ins_encode %{
- __ block_comment("membar_release (elided)");
+ ins_cost(VOLATILE_REF_COST);
+
+ format %{ "membar_release" %}
+
+ ins_encode %{
+ __ membar(Assembler::LoadStore|Assembler::StoreStore);
%}
ins_pipe(pipe_class_memory);
%}
@@ -5898,7 +5902,7 @@
format %{ "membar_release_lock" %}
ins_encode %{
- __ membar(Assembler::StoreLoad|Assembler::StoreStore);
+ __ membar(Assembler::LoadStore|Assembler::StoreStore);
%}
ins_pipe(pipe_class_memory);