[aarch64-port-dev ] RFR: Add support for G1GC

Edward Nevill ed at camswl.com
Sat Mar 22 19:26:45 UTC 2014


On Fri, 2014-03-21 at 15:44 +0000, Andrew Haley wrote:
> On 03/21/2014 01:59 PM, Edward Nevill wrote:
> >> What is the commented-out code for?  Why is this particular set of registers
> >> > pushed?
> > It should be push(r0, r1), the push of rscratch1, rscratch2 is
> > unnecessary.
> > 
> > r0 needs to be saved, I also push r1 because it is free and because
> > I need to save r0..r7 around the call to g1_wb_post later and by
> > saving r1 here I only have to save r2..r7 later.
> 
> But why do you not need to save all registers?  This is in the middle
> of C1-generated code, isn't it?  So anything may be live.  And the
> native code you're calling may trash these registers.  I think you need
> everything up to r18, excluding rscratch1 and rscratch2.

You are correct. I got confused between the argument registers and the caller-saved registers, which are not the same set on AArch64.

Thanks for this, you saved me an 'interesting' debugging session!
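As a sanity check on the new save set, here is a minimal standalone sketch (not HotSpot code) that builds the same mask as the G1_SAVE_REGS macro in the revised patch. It assumes the usual aliases rscratch1 == r8 and rscratch2 == r9; r0/r1 are pushed separately earlier in the stub, so the mask covers the remaining caller-saved registers r2..r7 and r10..r17:

#include <stdint.h>
#include <stdio.h>

// Standalone sketch only, not HotSpot code: builds the same register mask
// as the G1_SAVE_REGS macro in the patch below. Assumes rscratch1 == r8 and
// rscratch2 == r9; r0/r1 are pushed separately, so they are omitted here.
static uint32_t reg_bit(int r) { return 1u << r; }

int main() {
  uint32_t mask = 0;
  for (int r = 2; r <= 7; r++)   mask |= reg_bit(r);  // remaining argument regs
  for (int r = 10; r <= 17; r++) mask |= reg_bit(r);  // temporaries + IP0/IP1
  printf("G1 save mask: 0x%08x\n", mask);             // prints 0x0003fcfc
  return 0;
}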

Revised patch below. Tested with jtreg and SPECjbb2013.

OK to push now?

Ed.

--- CUT HERE ---
exporting patch:
# HG changeset patch
# User Edward Nevill edward.nevill at linaro.org
# Date 1395515628 0
#      Sat Mar 22 19:13:48 2014 +0000
# Node ID 39075bc8624fa8edd6394c68312e039db359c299
# Parent  9393c177ac9b9407f1f4e58bd662b719b40ded54
Add support for G1GC

diff -r 9393c177ac9b -r 39075bc8624f src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad     Wed Mar 19 16:15:50 2014 +0000
+++ b/src/cpu/aarch64/vm/aarch64.ad     Sat Mar 22 19:13:48 2014 +0000
@@ -5112,6 +5112,19 @@

 // Store Instructions

+// Store CMS card-mark Immediate
+instruct storeimmCM0(immI0 zero, memory mem)
+%{
+  match(Set mem (StoreCM mem zero));
+
+  ins_cost(MEMORY_REF_COST);
+  format %{ "strb zr, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_strb0(mem));
+
+  ins_pipe(pipe_class_memory);
+%}
+
 // Store Byte
 instruct storeB(iRegI src, memory mem)
 %{
@@ -5126,6 +5139,7 @@
   ins_pipe(pipe_class_memory);
 %}

+
 instruct storeimmB0(immI0 zero, memory mem)
 %{
   match(Set mem (StoreB mem zero));
diff -r 9393c177ac9b -r 39075bc8624f src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp       Wed Mar 19 16:15:50 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp       Sat Mar 22 19:13:48 2014 +0000
@@ -542,14 +542,46 @@
 /////////////////////////////////////////////////////////////////////////////
 #if INCLUDE_ALL_GCS

-void G1PreBarrierStub::emit_code(LIR_Assembler* ce) { Unimplemented(); }
+void G1PreBarrierStub::emit_code(LIR_Assembler* ce) {
+  // At this point we know that marking is in progress.
+  // If do_load() is true then we have to emit the
+  // load of the previous value; otherwise it has already
+  // been loaded into _pre_val.
+
+  __ bind(_entry);
+  assert(pre_val()->is_register(), "Precondition.");
+
+  Register pre_val_reg = pre_val()->as_register();
+
+  if (do_load()) {
+    ce->mem2reg(addr(), pre_val(), T_OBJECT, patch_code(), info(), false /*wide*/, false /*unaligned*/);
+  }
+  __ cbz(pre_val_reg, _continuation);
+  ce->store_parameter(pre_val()->as_register(), 0);
+  __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::g1_pre_barrier_slow_id)));
+  __ b(_continuation);
+}

 jbyte* G1PostBarrierStub::_byte_map_base = NULL;

-jbyte* G1PostBarrierStub::byte_map_base_slow() { Unimplemented(); return 0; }
+jbyte* G1PostBarrierStub::byte_map_base_slow() {
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  assert(bs->is_a(BarrierSet::G1SATBCTLogging),
+         "Must be if we're using this.");
+  return ((G1SATBCardTableModRefBS*)bs)->byte_map_base;
+}


-void G1PostBarrierStub::emit_code(LIR_Assembler* ce) { Unimplemented(); }
+void G1PostBarrierStub::emit_code(LIR_Assembler* ce) {
+  __ bind(_entry);
+  assert(addr()->is_register(), "Precondition.");
+  assert(new_val()->is_register(), "Precondition.");
+  Register new_val_reg = new_val()->as_register();
+  __ cbz(new_val_reg, _continuation);
+  ce->store_parameter(addr()->as_pointer_register(), 0);
+  __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::g1_post_barrier_slow_id)));
+  __ b(_continuation);
+}

 #endif // INCLUDE_ALL_GCS
 /////////////////////////////////////////////////////////////////////////////
diff -r 9393c177ac9b -r 39075bc8624f src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp    Wed Mar 19 16:15:50 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp    Sat Mar 22 19:13:48 2014 +0000
@@ -1866,47 +1866,47 @@
 void LIR_Assembler::logic_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst) {
   
   assert(left->is_single_cpu() || left->is_double_cpu(), "expect single or double register");
-  if (left->is_single_cpu()) {
-    assert (right->is_single_cpu() || right->is_constant(), "single register or constant expected");
-    if (right->is_constant()
-       && Assembler::operand_valid_for_logical_immediate(true, right->as_jint())) {
-
-      switch (code) {
-      case lir_logic_and: __ andw (dst->as_register(), left->as_register(), right->as_jint()); break;
-      case lir_logic_or:  __ orrw (dst->as_register(), left->as_register(), right->as_jint()); break;
-      case lir_logic_xor: __ eorw (dst->as_register(), left->as_register(), right->as_jint()); break;
-      default: ShouldNotReachHere(); break;
-      }
-    } else {
-      switch (code) {
-      case lir_logic_and: __ andw (dst->as_register(), left->as_register(), right->as_register()); break;
-      case lir_logic_or:  __ orrw (dst->as_register(), left->as_register(), right->as_register()); break;
-      case lir_logic_xor: __ eorw (dst->as_register(), left->as_register(), right->as_register()); break;
-      default: ShouldNotReachHere(); break;
-      }
-    }
-  } else {
-    assert (right->is_double_cpu() || right->is_constant(), "single register or constant expected");
-    if (right->is_double_cpu()) {
-      switch (code) {
-      case lir_logic_and: __ andr(dst->as_register_lo(), left->as_register_lo(), right->as_register_lo()); break;
-      case lir_logic_or:  __ orr (dst->as_register_lo(), left->as_register_lo(), right->as_register_lo()); break;
-      case lir_logic_xor: __ eor (dst->as_register_lo(), left->as_register_lo(), right->as_register_lo()); break;
-      default:
-       ShouldNotReachHere();
-       break;
-      }
-    }
-  }
+  Register Rleft = left->is_single_cpu() ? left->as_register() :
+                                           left->as_register_lo();
+  if (dst->is_single_cpu()) {
+    Register Rdst = dst->as_register();
+    if (right->is_constant()) {
+      switch (code) {
+        case lir_logic_and: __ andw (Rdst, Rleft, right->as_jint()); break;
+        case lir_logic_or:  __ orrw (Rdst, Rleft, right->as_jint()); break;
+        case lir_logic_xor: __ eorw (Rdst, Rleft, right->as_jint()); break;
+        default: ShouldNotReachHere(); break;
+      }
+    } else {
+      Register Rright = right->is_single_cpu() ? right->as_register() :
+                                                 right->as_register_lo();
+      switch (code) {
+        case lir_logic_and: __ andw (Rdst, Rleft, Rright); break;
+        case lir_logic_or:  __ orrw (Rdst, Rleft, Rright); break;
+        case lir_logic_xor: __ eorw (Rdst, Rleft, Rright); break;
+        default: ShouldNotReachHere(); break;
+      }
+    }
+  } else {
+    Register Rdst = dst->as_register_lo();
+    if (right->is_constant()) {
+      switch (code) {
+        case lir_logic_and: __ andr (Rdst, Rleft, right->as_jlong()); break;
+        case lir_logic_or:  __ orr (Rdst, Rleft, right->as_jlong()); break;
+        case lir_logic_xor: __ eor (Rdst, Rleft, right->as_jlong()); break;
+        default: ShouldNotReachHere(); break;
+      }
+    } else {
+      Register Rright = right->is_single_cpu() ? right->as_register() :
+                                                 right->as_register_lo();
+      switch (code) {
+        case lir_logic_and: __ andr (Rdst, Rleft, Rright); break;
+        case lir_logic_or:  __ orr (Rdst, Rleft, Rright); break;
+        case lir_logic_xor: __ eor (Rdst, Rleft, Rright); break;
+        default: ShouldNotReachHere(); break;
+      }
+    }
+  }
 }


diff -r 9393c177ac9b -r 39075bc8624f src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp        Wed Mar 19 16:15:50 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp        Sat Mar 22 19:13:48 2014 +0000
@@ -42,6 +42,9 @@
 #include "runtime/vframe.hpp"
 #include "runtime/vframeArray.hpp"
 #include "vmreg_aarch64.inline.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
+#endif


 // Implementation of StubAssembler
@@ -1148,6 +1151,137 @@
       }
       break;
 
+#if INCLUDE_ALL_GCS
+
+// Registers to be saved around calls to g1_wb_pre or g1_wb_post
+// R0 & R1 have already been saved earlier. R8 & R9 are rscratch1 & rscratch2
+#define G1_SAVE_REGS   r2->bit(1)|r3->bit(1)|r4->bit(1)|r5->bit(1)| \
+                       r6->bit(1)|r7->bit(1)| \
+                       r10->bit(1)|r11->bit(1)|r12->bit(1)|r13->bit(1)| \
+                       r14->bit(1)|r15->bit(1)|r16->bit(1)|r17->bit(1)
+
+    case g1_pre_barrier_slow_id:
+      {
+        StubFrame f(sasm, "g1_pre_barrier", dont_gc_arguments);
+        // arg0 : previous value of memory
+
+        BarrierSet* bs = Universe::heap()->barrier_set();
+        if (bs->kind() != BarrierSet::G1SATBCTLogging) {
+          __ mov(r0, (int)id);
+          __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, unimplemented_entry), r0);
+          __ should_not_reach_here();
+          break;
+        }
+
+        const Register pre_val = r0;
+        const Register thread = rthread;
+        const Register tmp = rscratch1;
+
+        Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                             PtrQueue::byte_offset_of_active()));
+
+        Address queue_index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                             PtrQueue::byte_offset_of_index()));
+        Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                        PtrQueue::byte_offset_of_buf()));
+
+        Label done;
+        Label runtime;
+
+        __ push(r0->bit(1) | r1->bit(1), sp);
+        // Can we store original value in the thread's buffer?
+        f.load_argument(0, pre_val);
+        __ ldr(tmp, queue_index);
+        __ cbz(tmp, runtime);
+
+        __ sub(tmp, tmp, wordSize);
+        __ str(tmp, queue_index);
+        __ ldr(rscratch2, buffer);
+        __ add(tmp, tmp, rscratch2);
+        __ str(pre_val, Address(tmp, 0));
+        __ b(done);
+
+        __ bind(runtime);
+        __ push(G1_SAVE_REGS, sp);
+        __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
+        __ pop(G1_SAVE_REGS, sp);
+        __ bind(done);
+        __ pop(r0->bit(1) | r1->bit(1), sp);
+      }
+      break;
+    case g1_post_barrier_slow_id:
+      {
+        StubFrame f(sasm, "g1_post_barrier", dont_gc_arguments);
+
+        // arg0: store_address
+        Address store_addr(rfp, 2*BytesPerWord);
+
+        BarrierSet* bs = Universe::heap()->barrier_set();
+        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+        Label done;
+        Label runtime;
+
+        // At this point we know new_val is non-NULL and the store crosses regions.
+        // Must check to see if the card is already dirty.
+
+        const Register thread = rthread;
+
+        Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                             PtrQueue::byte_offset_of_index()));
+        Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                        PtrQueue::byte_offset_of_buf()));
+
+        const Register card_addr = rscratch2;
+
+        __ push(r0->bit(1) | r1->bit(1), sp);
+        f.load_argument(0, card_addr);
+        __ lsr(card_addr, card_addr, CardTableModRefBS::card_shift);
+        // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
+        // a valid address and therefore is not properly handled by the relocation code.
+        __ mov(rscratch1, (intptr_t)ct->byte_map_base);
+        __ add(card_addr, card_addr, rscratch1);
+        __ ldrb(rscratch1, Address(card_addr, 0));
+        __ cmpw(rscratch1, (int)G1SATBCardTableModRefBS::g1_young_card_val());
+        __ br(Assembler::EQ, done);
+
+        __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
+        __ ldrb(rscratch1, Address(card_addr, 0));
+        __ cmpw(rscratch1, (int)CardTableModRefBS::dirty_card_val());
+        __ br(Assembler::EQ, done);
+
+        // storing region crossing non-NULL, card is clean.
+        // dirty card and log.
+
+        __ mov(rscratch1, (int)CardTableModRefBS::dirty_card_val());
+        __ strb(rscratch1, Address(card_addr, 0));
+
+        __ ldr(rscratch1, queue_index);
+        __ cbz(rscratch1, runtime);
+        __ sub(rscratch1, rscratch1, wordSize);
+        __ str(rscratch1, queue_index);
+
+        const Register buffer_addr = rscratch2;
+
+        __ push(card_addr->bit(1), sp);
+        __ ldr(buffer_addr, buffer);
+        __ add(rscratch1, buffer_addr, rscratch1);
+        __ pop(card_addr->bit(1), sp);
+        __ str(card_addr, Address(rscratch1, 0));
+        __ b(done);
+
+        __ bind(runtime);
+        __ push(G1_SAVE_REGS, sp);
+        __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
+        __ pop(G1_SAVE_REGS, sp);
+        __ bind(done);
+        __ pop(r0->bit(1) | r1->bit(1), sp);
+
+      }
+      break;
+#endif
+
     case predicate_failed_trap_id:
       {
         StubFrame f(sasm, "predicate_failed_trap", dont_gc_arguments);
diff -r 9393c177ac9b -r 39075bc8624f src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp     Wed Mar 19 16:15:50 2014 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp     Sat Mar 22 19:13:48 2014 +0000
@@ -47,11 +47,12 @@
 // #include "runtime/os.hpp"
 // #include "runtime/sharedRuntime.hpp"
 // #include "runtime/stubRoutines.hpp"
-// #if INCLUDE_ALL_GCS
-// #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
-// #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
-// #include "gc_implementation/g1/heapRegion.hpp"
-// #endif
+
+#if INCLUDE_ALL_GCS
+#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
+#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc_implementation/g1/heapRegion.hpp"
+#endif

 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) /* nothing */
@@ -2409,13 +2410,174 @@
                                           Register thread,
                                           Register tmp,
                                           bool tosca_live,
-                                          bool expand_call) { Unimplemented(); }
+                                          bool expand_call) {
+  // If expand_call is true then we expand the call_VM_leaf macro
+  // directly to skip generating the check by
+  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
+
+#ifdef _LP64
+  assert(thread == rthread, "must be");
+#endif // _LP64
+
+  Label done;
+  Label runtime;
+
+  assert(pre_val != noreg, "check this code");
+
+  if (obj != noreg)
+    assert_different_registers(obj, pre_val, tmp);
+
+  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                       PtrQueue::byte_offset_of_active()));
+  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                       PtrQueue::byte_offset_of_index()));
+  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                       PtrQueue::byte_offset_of_buf()));
+
+
+  // Is marking active?
+  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
+    ldrw(tmp, in_progress);
+  } else {
+    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
+    ldrb(tmp, in_progress);
+  }
+  cbzw(tmp, done);
+
+  // Do we need to load the previous value?
+  if (obj != noreg) {
+    load_heap_oop(pre_val, Address(obj, 0));
+  }
+
+  // Is the previous value null?
+  cbz(pre_val, done);
+
+  // Can we store original value in the thread's buffer?
+  // Is index == 0?
+  // (The index field is typed as size_t.)
+
+  ldr(tmp, index);                      // tmp := *index_adr
+  cbz(tmp, runtime);                    // tmp == 0?
+                                        // If yes, goto runtime
+
+  sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
+  str(tmp, index);                      // *index_adr := tmp
+  ldr(rscratch1, buffer);
+  add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
+
+  // Record the previous value
+  str(pre_val, Address(tmp, 0));
+  b(done);
+
+  bind(runtime);
+  // save the live input values
+  push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
+
+  // Calling the runtime using the regular call_VM_leaf mechanism generates
+  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
+  // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
+  //
+  // If we are generating the pre-barrier without a frame (e.g. in the
+  // intrinsified Reference.get() routine) then rfp might be pointing to
+  // the caller frame and so this check will most likely fail at runtime.
+  //
+  // Expanding the call directly bypasses the generation of the check.
+  // So when we do not have a full interpreter frame on the stack,
+  // expand_call should be passed true.
+
+  if (expand_call) {
+    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
+    pass_arg1(this, thread);
+    pass_arg0(this, pre_val);
+    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
+  } else {
+    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
+  }
+
+  pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
+
+  bind(done);
+}

 void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                            Register new_val,
                                            Register thread,
                                            Register tmp,
-                                           Register tmp2) { Unimplemented(); }
+                                           Register tmp2) {
+#ifdef _LP64
+  assert(thread == rthread, "must be");
+#endif // _LP64
+
+  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                       PtrQueue::byte_offset_of_index()));
+  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                       PtrQueue::byte_offset_of_buf()));
+
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+  Label done;
+  Label runtime;
+
+  // Does store cross heap regions?
+
+  mov(tmp, store_addr);
+  eor(tmp, tmp, new_val);
+  lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
+  cbz(tmp, done);
+
+  // crosses regions, storing NULL?
+
+  cbz(new_val, done);
+
+  // storing region crossing non-NULL, is card already dirty?
+
+  ExternalAddress cardtable((address) ct->byte_map_base);
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+  const Register card_addr = tmp;
+
+  mov(card_addr, store_addr);
+  lsr(card_addr, card_addr, CardTableModRefBS::card_shift);
+
+  unsigned long offset;
+  adrp(tmp2, cardtable, offset);
+
+  // get the address of the card
+  add(card_addr, card_addr, tmp2);
+  ldrb(tmp2, Address(card_addr, offset));
+  cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
+  br(Assembler::EQ, done);
+
+  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
+  ldrb(tmp2, Address(card_addr, offset));
+  cmpw(tmp2, (int)CardTableModRefBS::dirty_card_val());
+  br(Assembler::EQ, done);
+
+  // storing a region crossing, non-NULL oop, card is clean.
+  // dirty card and log.
+
+  mov(tmp2, (int)CardTableModRefBS::dirty_card_val());
+  strb(tmp2, Address(card_addr, offset));
+
+  ldr(rscratch1, queue_index);
+  cbz(rscratch1, runtime);
+  sub(rscratch1, rscratch1, wordSize);
+  str(rscratch1, queue_index);
+
+  ldr(tmp2, buffer);
+  add(tmp2, tmp2, rscratch1);
+  str(card_addr, Address(tmp2, 0));
+  b(done);
+
+  bind(runtime);
+  // save the live input values
+  push(store_addr->bit(true) | new_val->bit(true), sp);
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
+  pop(store_addr->bit(true) | new_val->bit(true), sp);
+
+  bind(done);
+}

 #endif // INCLUDE_ALL_GCS

--- CUT HERE ---
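
P.S. For anyone reading the post-barrier code for the first time, here is a compact standalone model (illustrative constants and names only, not HotSpot code) of the filtering g1_write_barrier_post performs before dirtying a card and logging it to the thread's dirty card queue:

#include <stdint.h>
#include <stddef.h>

// Illustrative constants; the real values come from HeapRegion and
// CardTableModRefBS at runtime.
static const int     LOG_HR_GRAIN_BYTES = 20;  // assume 1M heap regions
static const int     CARD_SHIFT         = 9;   // assume 512-byte cards
static const uint8_t YOUNG_CARD         = 8;
static const uint8_t DIRTY_CARD         = 0;

struct ThreadModel {
  uint8_t* byte_map_base;  // biased card table base
  size_t   queue_index;    // bytes left in the dirty card queue buffer
  void**   queue_buffer;
};

// Stand-in for the SharedRuntime::g1_wb_post slow path; modeled here as a
// no-op that simply resets the buffer.
static void runtime_enqueue(uint8_t* card, ThreadModel* t) {
  (void)card;
  t->queue_index = 256 * sizeof(void*);
}

static void g1_post_barrier(void* store_addr, void* new_val, ThreadModel* t) {
  // Does the store cross heap regions? If not, nothing to do.
  if ((((uintptr_t)store_addr ^ (uintptr_t)new_val) >> LOG_HR_GRAIN_BYTES) == 0)
    return;
  // Storing NULL never needs a card mark.
  if (new_val == NULL)
    return;
  uint8_t* card = t->byte_map_base + ((uintptr_t)store_addr >> CARD_SHIFT);
  // Young cards are filtered first; then (after a StoreLoad barrier in the
  // generated code) re-check whether another thread already dirtied the card.
  if (*card == YOUNG_CARD)
    return;
  if (*card == DIRTY_CARD)
    return;
  *card = DIRTY_CARD;          // dirty the card...
  if (t->queue_index == 0) {   // ...and log it; a full buffer goes to the VM
    runtime_enqueue(card, t);
    return;
  }
  t->queue_index -= sizeof(void*);
  *(void**)((char*)t->queue_buffer + t->queue_index) = card;
}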