[aarch64-port-dev ] Very large code caches

Andrew Haley aph at redhat.com
Fri Jan 3 04:53:57 PST 2014


The AArch64 immediate call instructions span +/- 128 Mbytes.  This is
a good match for us: the default ReservedCodeCacheSize is 48M, and it
takes a lot to fill 128M.  Nonetheless, it is possible.  I can
envisage a multi-tenant system that generates huge amounts of code,
for example.  So, the question for us is: what should we do?

I've been kicking around a solution, attached here.  We can have a
long call instruction, and for the sake of the exercise I've been
trying lea(r16, dest); blr(r16).  The back end has to be changed in
quite a few places, but the real problem is that the resulting call
site is not MT-safe: it can't be patched atomically.  To make that
work we'd have to move the destination address into the constant pool.
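
(Roughly, the two call shapes look like this -- a sketch of the idea
in MacroAssembler terms, not code lifted from the patch:)

  // Near (what we emit today): a single 4-byte instruction that can
  // be retargeted with one aligned word store, hence MT-safe.
  __ bl(dest);                       // bl <imm26>, reach +/- 128M

  // Far: lea materialises the 64-bit target as a movz/movk sequence
  // (up to 4 instructions) before the indirect call, so the call
  // site spans several words and a concurrently executing thread
  // can observe it half patched.
  __ lea(r16, RuntimeAddress(dest)); // movz/movk x16, ...
  __ blr(r16);                       // indirect call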

So, I'm envisaging a solution where we wait until patching first
encounters a branch whose target is out of range.  We then patch the
site with a trap that triggers deoptimization, and we also set a flag
in the assembler.  When
the method is recompiled after deoptimization it'll have long
branches, and from that time onwards all (inter-method) branches will
be long.
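
(In outline, the patch-time logic I have in mind is something like
the following.  The helper names are invented for the sketch, and
note that the attached patch doesn't yet do this dynamically -- it
simply keys MacroAssembler::_far_branches off ReservedCodeCacheSize:)

  // Hypothetical sketch of retargeting a compiled call site.
  void retarget_call(address call_site, address new_dest) {
    if (branch_reachable(call_site, new_dest)) {
      // Normal case: rewrite the bl's 26-bit offset, MT-safe.
      nativeCall_at(call_site)->set_destination(new_dest);
    } else {
      // First out-of-range target: replace the call with a trap that
      // sends the caller through the deoptimization machinery, and
      // flag the assembler so recompiled methods use far branches.
      patch_with_deopt_trap(call_site);        // invented helper
      MacroAssembler::request_far_branches();  // invented helper
    }
  }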

A gnarly problem is nmethod::make_not_entrant_or_zombie().  At present
we place a single NOP at the entry point of every method, and when we
deoptimize a method we patch that NOP with a branch to
handle_wrong_method.  We
can do this iff handle_wrong_method is reachable, i.e. less than
128Mbytes away.  We could deposit a copy of the handle_wrong_method
stub every 128Mbytes or so, but a better plan is to replace our NOP
with a trap of some kind.  DCPS1 generates an illegal instruction
trap, so I've used that.  It seems to work.
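
(Condensed from the attached patch, the zombie-entry handling then
amounts to marking the verified entry with DCPS1 and teaching the
SIGILL handler to recognise that encoding:)

  // nativeInst_aarch64.hpp: a not-entrant/zombie entry point is
  // identified by the DCPS1 encoding written over the entry NOP.
  bool is_zombie() { return int_at(0) == (int)0xd4a00001; } // DCPS1

  // os_linux_aarch64.cpp: on SIGILL at such an instruction, resume
  // in the handle_wrong_method stub rather than aborting.
  if (sig == SIGILL && nativeInstruction_at(pc)->is_zombie())
    stub = SharedRuntime::get_handle_wrong_method_stub();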

None of this is very nice, but it all works.  In most cases it
shouldn't affect anything significant: branch overflow will never
happen, so we'll not do anything different.

Thoughts?

Andrew.
-------------- next part --------------
diff -r 970ff006b665 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad	Tue Dec 31 12:48:20 2013 +0000
+++ b/src/cpu/aarch64/vm/aarch64.ad	Fri Jan 03 12:50:14 2014 +0000
@@ -749,7 +749,7 @@
   // call should be a simple bl
   // unless this is a method handle invoke in which case it is
   // mov(rfp, sp), bl, mov(sp, rfp)
-  int off = 4;
+  int off = MacroAssembler::ret_addr_offset();
   if (_method_handle_invoke) {
     off += 4;
   }
@@ -766,7 +766,7 @@
   // or
   //   adrp // if !NearCPool
   //   ldr
-  int off = 8;
+  int off = MacroAssembler::ret_addr_offset() + 4;
   if (!NearCpool) {
     off += 4;
   }
@@ -781,7 +781,7 @@
   //   blrt rscratch1
   CodeBlob *cb = CodeCache::find_blob(_entry_point);
   if (cb) {
-    return 4;
+    return MacroAssembler::ret_addr_offset();
   } else {
     return 20;
   }
@@ -1434,7 +1434,8 @@
   // TODO
   // can we avoid this skip and still use a reloc?
   __ br(Assembler::EQ, skip);
-  __ b(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
+  __ lea(rscratch1, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
+  __ br(rscratch1);
   __ bind(skip);
 }
 
@@ -1465,7 +1466,8 @@
   __ start_a_stub(size_exception_handler());
   if (base == NULL)  return 0;  // CodeBuffer::expand failed
   int offset = __ offset();
-  __ b(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
+  __ lea(rscratch1, RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
+  __ br(rscratch1);
   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
   __ end_a_stub();
   return offset;
@@ -1473,8 +1475,8 @@
 
 uint size_deopt_handler()
 {
-  // count one adr and one branch instruction
-  return 2 * NativeInstruction::instruction_size;
+  // count one adr, one lea, and one branch instruction
+  return 6 * NativeInstruction::instruction_size;
 }
 
 // Emit deopt handler code.
@@ -1489,8 +1491,8 @@
   int offset = __ offset();
 
   __ adr(lr, __ pc());
-  // should we load this into rscratch1 and use a br?
-  __ b(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
+  __ lea(rscratch1, RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
+  __ br(rscratch1);
 
   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
   __ end_a_stub();
@@ -2880,6 +2882,13 @@
       // Emit stub for static call
       CompiledStaticCall::emit_to_interp_stub(cbuf, mark);
     }
+    {
+      Label over;
+      __ b(over);
+      for (int i = 0; i < 512; i++)
+    	__ nop();
+      __ bind(over);
+    }
   %}
 
   enc_class aarch64_enc_java_handle_call(method meth) %{
@@ -2952,12 +2961,15 @@
 
   enc_class aarch64_enc_rethrow() %{
     MacroAssembler _masm(&cbuf);
-    __ b(RuntimeAddress(OptoRuntime::rethrow_stub()));
+    __ lea(rscratch1, RuntimeAddress(OptoRuntime::rethrow_stub()));
+    __ br(rscratch1);
   %}
 
   enc_class aarch64_enc_ret() %{
     MacroAssembler _masm(&cbuf);
     __ ret(lr);
+  for (int i = 0; i < 512; i++)
+    __ nop();
   %}
 
   enc_class aarch64_enc_tail_call(iRegP jump_target) %{
diff -r 970ff006b665 src/cpu/aarch64/vm/assembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Dec 31 12:48:20 2013 +0000
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Fri Jan 03 12:50:14 2014 +0000
@@ -116,7 +116,7 @@
 // current method -- must be in a call-clobbered register
 REGISTER_DECLARATION(Register, rmethod,   r12);
 
-// non-volatile (callee-save) registers are r16-29
+// non-volatile (callee-save) registers are r19-29
 // of which the following are dedicated global state
 
 // link register
diff -r 970ff006b665 src/cpu/aarch64/vm/icBuffer_aarch64.cpp
--- a/src/cpu/aarch64/vm/icBuffer_aarch64.cpp	Tue Dec 31 12:48:20 2013 +0000
+++ b/src/cpu/aarch64/vm/icBuffer_aarch64.cpp	Fri Jan 03 12:50:14 2014 +0000
@@ -36,7 +36,7 @@
 #include "oops/oop.inline2.hpp"
 
 int InlineCacheBuffer::ic_stub_code_size() {
-  return NativeInstruction::instruction_size * 5;
+  return 4 * 7;
 }
 
 
@@ -50,25 +50,27 @@
   // (2) these ICStubs are removed *before* a GC happens, so the roots disappear
   // assert(cached_value == NULL || cached_oop->is_perm(), "must be perm oop");
 
-  Label l;
-  masm->ldr(rscratch2, l);
-  masm->b(ExternalAddress(entry_point));
-  masm->bind(l);
+  Label l1, l2;
+  masm->ldr(rscratch2, l1);
+  masm->ldr(r19, l2);
+  masm->br(r19);
+  masm->bind(l1);
   masm->emit_int64((int64_t)cached_value);
+  masm->bind(l2);
+  masm->emit_int64((int64_t)entry_point);
   // Only need to invalidate the 1st two instructions - not the whole ic stub
-  ICache::invalidate_range(code_begin, NativeInstruction::instruction_size * 2);
+  ICache::invalidate_range(code_begin, NativeInstruction::instruction_size * 3);
 }
 
 address InlineCacheBuffer::ic_buffer_entry_point(address code_begin) {
-  NativeMovConstReg* move = nativeMovConstReg_at(code_begin);   // creation also verifies the object
-  NativeJump* jump = nativeJump_at(code_begin + 4);
-  return jump->jump_destination();
+  NativeMovConstReg* move = nativeMovConstReg_at(code_begin + 4);   // creation also verifies the object
+  return (address)move->data();
 }
 
 
 void* InlineCacheBuffer::ic_buffer_cached_value(address code_begin) {
   // creation also verifies the object
-  uintptr_t *p = (uintptr_t *)(code_begin + 8);
+  uintptr_t *p = (uintptr_t *)(code_begin + 12);
   void* o = (void*)*p;
   return o;
 }
diff -r 970ff006b665 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Dec 31 12:48:20 2013 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Fri Jan 03 12:50:14 2014 +0000
@@ -63,6 +63,8 @@
 
 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
 
+bool MacroAssembler::_far_branches;
+
 void MacroAssembler::pd_patch_instruction(address branch, address target) {
   long offset = (target - branch) >> 2;
   unsigned insn = *(unsigned*)branch;
diff -r 970ff006b665 src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Dec 31 12:48:20 2013 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Fri Jan 03 12:50:14 2014 +0000
@@ -28,6 +28,7 @@
 #define CPU_AARCH64_VM_MACROASSEMBLER_AARCH64_HPP
 
 #include "asm/assembler.hpp"
+#include "code/codeCache.hpp"
 
 // MacroAssembler extends Assembler by frequently used macros.
 //
@@ -89,8 +90,8 @@
 
   void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);
 
-  // Maximum size of class area in Metaspace when compressed
-  uint64_t use_XOR_for_compressed_class_base;
+  bool use_XOR_for_compressed_class_base;
+  static bool _far_branches;
 
  public:
   MacroAssembler(CodeBuffer* code) : Assembler(code) {
@@ -99,6 +100,14 @@
 					     (uint64_t)Universe::narrow_klass_base())
 	 && ((uint64_t)Universe::narrow_klass_base()
 	     > (1u << log2_intptr(CompressedClassSpaceSize))));
+    _far_branches = ReservedCodeCacheSize > 6*1024*1024;
+  }
+
+  static int ret_addr_offset() {
+    if (_far_branches)
+      return 5 * sizeof (uint32_t);
+    else
+      return sizeof (uint32_t);
   }
 
   // Biased locking support
@@ -195,6 +204,31 @@
     }
   }
 
+// Redefine branch instructions to use far variants.  r16 is used here
+// because it's reserved for this purpose by the system ABI.
+#define BRANCH(INSN)                                            \
+  void INSN(const Address &dest) {                              \
+    if (_far_branches) {                                        \
+      lea(r16, dest);                                           \
+      INSN##r(r16);                                             \
+    } else {                                                    \
+      InstructionMark im(this);                                 \
+      code_section()->relocate(inst_mark(), dest.rspec());      \
+      Assembler::INSN(dest.target());                           \
+    }                                                           \
+  }                                                             \
+  void INSN(Label &l) {                                         \
+    Assembler::INSN(l);                                         \
+  }                                                             \
+  void INSN(address a) {                                        \
+    Assembler::INSN(a);                                         \
+  }
+
+BRANCH(bl)
+BRANCH(b)
+
+#undef BRANCH
+
   inline void moviw(Register Rd, unsigned imm) { orrw(Rd, zr, imm); }
   inline void movi(Register Rd, unsigned imm) { orr(Rd, zr, imm); }
 
diff -r 970ff006b665 src/cpu/aarch64/vm/nativeInst_aarch64.cpp
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.cpp	Tue Dec 31 12:48:20 2013 +0000
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.cpp	Fri Jan 03 12:50:14 2014 +0000
@@ -45,7 +45,11 @@
 void NativeCall::verify() { ; }
 
 address NativeCall::destination() const {
-  return instruction_address() + displacement();
+  return MacroAssembler::target_addr_for_insn(instruction_address(), int_at(instruction_offset));
+}
+
+void NativeCall::set_destination(address dest)       {
+  MacroAssembler::pd_patch_instruction(instruction_address(), dest);
 }
 
 void NativeCall::print() { Unimplemented(); }
@@ -208,17 +212,16 @@
 }
 
 // MT safe inserting of a jump over an unknown instruction sequence (used by nmethod::makeZombie)
+// The problem: jump_to <dest> may be an N-word instruction.
+// Atomic write can be only with 1 word.
 
 void NativeJump::patch_verified_entry(address entry, address verified_entry, address dest) {
-  ptrdiff_t disp = dest - verified_entry;
-  guarantee(disp < 1 << 27 && disp > - (1 << 27), "branch overflow");
-
-  unsigned int insn = (0b000101 << 26) | ((disp >> 2) & 0x3ffffff);
-
-  assert(nativeInstruction_at(verified_entry)->is_jump_or_nop(),
-	 "Aarch64 cannot replace non-jump with jump");
-  *(unsigned int*)verified_entry = insn;
-  ICache::invalidate_range(verified_entry, instruction_size);
+  ResourceMark rm;
+  int code_size = 1 * 4;
+  CodeBuffer cb(verified_entry, code_size + 1);
+  MacroAssembler* a = new MacroAssembler(&cb);
+  a->dpcs1(0); // "dpcs1" must agree with code in the signal handler
+  ICache::invalidate_range(verified_entry, code_size);
 }
 
 
diff -r 970ff006b665 src/cpu/aarch64/vm/nativeInst_aarch64.hpp
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Tue Dec 31 12:48:20 2013 +0000
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Fri Jan 03 12:50:14 2014 +0000
@@ -102,6 +102,10 @@
   static bool maybe_cpool_ref(address instr) {
     return is_adrp_at(instr) || is_ldr_literal_at(instr);
   }
+
+  bool is_zombie() {
+    return int_at(0) == (int)0xd4a00001; // DCPS1
+  }
 };
 
 inline NativeInstruction* nativeInstruction_at(address address) {
@@ -113,8 +117,9 @@
 }
 
 inline NativeCall* nativeCall_at(address address);
-// The NativeCall is an abstraction for accessing/manipulating native call imm32/rel32off
-// instructions (used to manipulate inline caches, primitive & dll calls, etc.).
+// The NativeCall is an abstraction for accessing/manipulating native
+// call instructions (used to manipulate inline caches, primitive &
+// DSO calls, etc.).
 
 class NativeCall: public NativeInstruction {
  public:
@@ -122,25 +127,24 @@
     instruction_size            =    4,
     instruction_offset          =    0,
     displacement_offset         =    0,
-    return_address_offset       =    4
+    bl_address_offset           =    4
   };
 
   enum { cache_line_size = BytesPerWord };  // conservative estimate!
+  int return_address_offset() const {
+    uint32_t insn = *(uint32_t*)instruction_address();
+    if ((insn >> 26) == 0b100101) // BL (offset)
+      return bl_address_offset;
+    else // 4 mov# insns; blr
+      return bl_address_offset + 4 * sizeof (uint32_t);
+  }
   address instruction_address() const       { return addr_at(instruction_offset); }
-  address next_instruction_address() const  { return addr_at(return_address_offset); }
+  address next_instruction_address() const  { return addr_at(return_address_offset()); }
   int   displacement() const                { return (int_at(displacement_offset) << 6) >> 4; }
   address displacement_address() const      { return addr_at(displacement_offset); }
-  address return_address() const            { return addr_at(return_address_offset); }
+  address return_address() const            { return addr_at(return_address_offset()); }
   address destination() const;
-  void  set_destination(address dest)       {
-    int offset = dest - instruction_address();
-    unsigned int insn = 0b100101 << 26;
-    assert((offset & 3) == 0, "should be");
-    offset >>= 2;
-    offset &= (1 << 26) - 1; // mask off insn part
-    insn |= offset;
-    set_int_at(displacement_offset, insn);
-  }
+  void  set_destination(address dest);
 
   // Similar to replace_mt_safe, but just changes the destination.  The
   // important thing is that free-running threads are able to execute
@@ -169,7 +173,7 @@
   }
 
   static bool is_call_before(address return_address) {
-    return is_call_at(return_address - NativeCall::return_address_offset);
+    return is_call_at(return_address - NativeCall::bl_address_offset);
   }
 
   static bool is_call_to(address instr, address target) {
@@ -192,15 +196,16 @@
 }
 
 inline NativeCall* nativeCall_before(address return_address) {
-  NativeCall* call = (NativeCall*)(return_address - NativeCall::return_address_offset);
+  NativeCall* call = (NativeCall*)(return_address - NativeCall::bl_address_offset);
+  call = (NativeCall*)(return_address - call->return_address_offset());
 #ifdef ASSERT
   call->verify();
 #endif
   return call;
 }
 
-// An interface for accessing/manipulating native mov reg, imm32 instructions.
-// (used to manipulate inlined 32bit data dll calls, etc.)
+// An interface for accessing/manipulating native 64-bit movk/movz
+// reg, imm instructions.
 class NativeMovConstReg: public NativeInstruction {
  public:
   enum Aarch64_specific_constants {
@@ -260,21 +265,10 @@
     }
 };
 
-// An interface for accessing/manipulating native moves of the form:
-//      mov[b/w/l/q] [reg + offset], reg   (instruction_code_reg2mem)
-//      mov[b/w/l/q] reg, [reg+offset]     (instruction_code_mem2reg
-//      mov[s/z]x[w/b/q] [reg + offset], reg
-//      fld_s  [reg+offset]
-//      fld_d  [reg+offset]
-//      fstp_s [reg + offset]
-//      fstp_d [reg + offset]
-//      mov_literal64  scratch,<pointer> ; mov[b/w/l/q] 0(scratch),reg | mov[b/w/l/q] reg,0(scratch)
-//
+
 // Warning: These routines must be able to handle any instruction sequences
 // that are generated as a result of the load/store byte,word,long
-// macros.  For example: The load_unsigned_byte instruction generates
-// an xor reg,reg inst prior to generating the movb instruction.  This
-// class must skip the xor instruction.
+// macros.
 
 class NativeMovRegMem: public NativeInstruction {
   enum AArch64_specific_constants {
diff -r 970ff006b665 src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Tue Dec 31 12:48:20 2013 +0000
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Fri Jan 03 12:50:14 2014 +0000
@@ -739,7 +739,6 @@
     __ ldr(rmethod, Address(holder, CompiledICHolder::holder_method_offset()));
     __ br(Assembler::EQ, ok);
     __ b(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
-
     __ bind(ok);
     // Method might have been compiled since the call site was patched to
     // interpreted; if that is the case treat it as a miss so we can get
diff -r 970ff006b665 src/os_cpu/linux_aarch64/vm/os_linux_aarch64.cpp
--- a/src/os_cpu/linux_aarch64/vm/os_linux_aarch64.cpp	Tue Dec 31 12:48:20 2013 +0000
+++ b/src/os_cpu/linux_aarch64/vm/os_linux_aarch64.cpp	Fri Jan 03 12:50:14 2014 +0000
@@ -211,6 +211,16 @@
 extern "C" void FetchNResume () ;
 #endif
 
+inline static bool checkZombie(ucontext_t* uc, address* pc, address* stub) {
+  if (nativeInstruction_at(*pc)->is_zombie()) {
+    // zombie method (dpcs1 instruction)
+    *stub = SharedRuntime::get_handle_wrong_method_stub();
+
+    return true;
+  }
+  return false;
+}
+
 extern "C" JNIEXPORT int
 JVM_handle_linux_signal(int sig,
                         siginfo_t* info,
@@ -408,6 +418,9 @@
     }
   }
 
+  if (sig == SIGILL)
+    checkZombie(uc, &pc, &stub);
+
   if (stub != NULL) {
     // save all thread context in case we need to restore it
     if (thread != NULL) thread->set_saved_exception_pc(pc);

