[aarch64-port-dev ] C1: Deoptimize when patching
Andrew Haley
aph at redhat.com
Fri Sep 13 10:25:40 PDT 2013
"The ARMv8 architecture limits the set of instructions that can be
executed by one thread of execution as they are being modified by
another thread of execution without requiring explicit
synchronization.
"Concurrent modification and execution of instructions can lead to the
resulting instruction performing any behavior that can be achieved by
executing any sequence of instructions that can be executed from the
same Exception level, except where the instruction before modification
and the instruction after modification is a B, BL, NOP, BKPT, SVC,
HVC, or SMC instruction."
So the patching we do in C1 (where e.g. an unresolved field offset is
compiled as a jump to a patching stub, which replaces the jump with a
load) is not possible.
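To make the constraint concrete, here's a minimal standalone sketch
(illustrative only, not HotSpot code; the NOP encoding 0xd503201f and
the B opcode bits come from the patch below) of the check we have to
make before overwriting an instruction word -- essentially the
is_jump_or_nop() test the patch adds:

#include <stdint.h>
#include <assert.h>

// Encodings as used in the patch below: the AArch64 NOP and the
// opcode field (bits 31..26) of an unconditional B.
static const uint32_t kNop = 0xd503201f;

static bool is_nop(uint32_t insn) { return insn == kNop; }
static bool is_b(uint32_t insn)   { return (insn >> 26) == 0b000101; }

// Overwrite one instruction word, insisting that the word being
// replaced is a B or a NOP -- the only forms (apart from BKPT, SVC,
// HVC and SMC) that the ARMv8 ARM allows to be modified while another
// thread may be executing them.  Real code must also flush the
// instruction cache, as the patch does with ICache::invalidate_range.
static void patch_word(uint32_t *insn_addr, uint32_t new_insn) {
  assert(is_nop(*insn_addr) || is_b(*insn_addr));
  *insn_addr = new_insn;
  __builtin___clear_cache((char *)insn_addr, (char *)(insn_addr + 1));
}

int main() {
  uint32_t code[1] = { kNop };
  // Replace the NOP with "b .+8" (imm26 == 2, i.e. two words forward).
  patch_word(&code[0], (0b000101u << 26) | 2);
  assert(is_b(code[0]));
  return 0;
}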
We could in theory replace a jump to a patching stub with a jump to
another stub that loads the field offset and returns. However, I'm
not convinced that it's worth the effort right now. We can revisit
this decision at a later date.
Instead, we deoptimize when we hit a patching stub. Note that this
doesn't affect method calls, which replace a BL to the resolver with a
BL instruction to the method: it only affects things such as getfield
and ldc with targets that were unresolved when the method was
compiled.
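The reason calls are unaffected is that both the old and the new word
at the call site are BL instructions, which fall under the exception
quoted above; only the 26-bit offset changes.  A standalone sketch of
that point (assuming the usual A64 BL encoding, opcode bits 0b100101
in bits 31..26; again illustrative, not HotSpot code, and the offsets
are made up):

#include <stdint.h>
#include <assert.h>

// Assumed standard A64 encoding of BL: opcode bits 31..26 are
// 0b100101 and bits 25..0 hold the signed target offset in words.
static uint32_t encode_bl(int64_t offset_bytes) {
  return (0b100101u << 26) | ((uint32_t)(offset_bytes >> 2) & 0x3ffffff);
}

static bool is_bl(uint32_t insn) { return (insn >> 26) == 0b100101; }

int main() {
  // A call site is first compiled as BL <resolver stub> and later
  // patched to BL <resolved method>.  Both words are BL instructions,
  // so the concurrent-modification exception in the ARMv8 ARM applies
  // and no deoptimization is needed for calls.
  uint32_t before = encode_bl(0x1000);   // hypothetical offset to the stub
  uint32_t after  = encode_bl(-0x2000);  // hypothetical offset to the method
  assert(is_bl(before) && is_bl(after));
  return 0;
}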
I've added a flag -XX:DeoptimizeWhenPatching that can be turned off to
allow on-the-fly patching. We can use this if we're running on
hardware that we know allows concurrent instruction modification.
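(With a build containing this patch, that would mean running with
-XX:-DeoptimizeWhenPatching to keep the existing on-the-fly patching.)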
Andrew.
# HG changeset patch
# User aph
# Date 1379092972 -3600
# Node ID 423577eb8f6efc66ddd7ff2b490c3702dbd4bcc3
# Parent 4afcbbbfaf7a4018766521e2b580c167c9c1f1e3
Default to DeoptimizeWhenPatching.
diff -r 4afcbbbfaf7a -r 423577eb8f6e src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp Thu Sep 12 17:20:44 2013 +0100
+++ b/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp Fri Sep 13 18:22:52 2013 +0100
@@ -438,6 +438,11 @@
void C1_MacroAssembler::build_frame(int frame_size_in_bytes) {
+ // If we have to make this method not-entrant we'll overwrite its
+ // first instruction with a jump. For this action to be legal we
+ // must ensure that this first instruction is a B, BL, NOP, BKPT,
+ // SVC, HVC, or SMC. Make it a NOP.
+ nop();
// Make sure there is enough stack space for this method's activation.
// Note that we do this before doing an enter().
generate_stack_overflow_check(frame_size_in_bytes);
diff -r 4afcbbbfaf7a -r 423577eb8f6e src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Thu Sep 12 17:20:44 2013 +0100
+++ b/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Fri Sep 13 18:22:52 2013 +0100
@@ -1193,6 +1193,35 @@
frame runtime_frame = thread->last_frame();
frame caller_frame = runtime_frame.sender(&reg_map);
+ if (DeoptimizeWhenPatching) {
+ // According to the ARMv8 ARM, "Concurrent modification and
+ // execution of instructions can lead to the resulting instruction
+ // performing any behavior that can be achieved by executing any
+ // sequence of instructions that can be executed from the same
+ // Exception level, except where the instruction before
+ // modification and the instruction after modification is a B, BL,
+ // NOP, BKPT, SVC, HVC, or SMC instruction."
+ //
+ // This effectively makes the games we play when patching
+ // impossible, so when we come across an access that needs
+ // patching we must deoptimize.
+
+ if (TracePatching) {
+ tty->print_cr("Deoptimizing because patch is needed");
+ }
+ // It's possible the nmethod was invalidated in the last
+ // safepoint, but if it's still alive then make it not_entrant.
+ nmethod* nm = CodeCache::find_nmethod(caller_frame.pc());
+ if (nm != NULL) {
+ nm->make_not_entrant();
+ }
+
+ Deoptimization::deoptimize_frame(thread, caller_frame.id());
+
+ // Return to the now deoptimized frame.
+ return;
+ }
+
// last java frame on stack
vframeStream vfst(thread, true);
assert(!vfst.at_end(), "Java frame must exist");
diff -r 4afcbbbfaf7a -r 423577eb8f6e src/cpu/aarch64/vm/globals_aarch64.hpp
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp Thu Sep 12 17:20:44 2013 +0100
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp Fri Sep 13 18:22:52 2013 +0100
@@ -81,18 +81,22 @@
#ifdef BUILTIN_SIM
#define UseBuiltinSim true
#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
- \
+ \
product(bool, NotifySimulator, UseBuiltinSim, \
- "tell the AArch64 sim where we are in method code") \
- \
- product(bool, UseSimulatorCache, false, \
- "tell sim to cache memory updates until exclusive op occurs") \
- \
- product(bool, DisableBCCheck, true, \
+ "tell the AArch64 sim where we are in method code") \
+ \
+ product(bool, UseSimulatorCache, false, \
+ "tell sim to cache memory updates until exclusive op occurs") \
+ \
+ product(bool, DisableBCCheck, true, \
"tell sim not to invoke bccheck callback") \
- \
- product(bool, NearCpool, true, \
- "constant pool is close to instructions")
+ \
+ product(bool, NearCpool, true, \
+ "constant pool is close to instructions") \
+ \
+ product(bool, DeoptimizeWhenPatching, true, \
+ "doptimize instead of patching instructions") \
+
#else
#define UseBuiltinSim false
@@ -102,7 +106,10 @@
#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
\
product(bool, NearCpool, true, \
- "constant pool is close to instructions")
+ "constant pool is close to instructions") \
+ \
+ product(bool, DeoptimizeWhenPatching, true, \
+ "doptimize instead of patching instructions")
#endif
#endif // CPU_AARCH64_VM_GLOBALS_AARCH64_HPP
diff -r 4afcbbbfaf7a -r 423577eb8f6e src/cpu/aarch64/vm/icBuffer_aarch64.cpp
--- a/src/cpu/aarch64/vm/icBuffer_aarch64.cpp Thu Sep 12 17:20:44 2013 +0100
+++ b/src/cpu/aarch64/vm/icBuffer_aarch64.cpp Fri Sep 13 18:22:52 2013 +0100
@@ -57,7 +57,6 @@
masm->emit_int64((int64_t)cached_value);
// Only need to invalidate the 1st two instructions - not the whole ic stub
ICache::invalidate_range(code_begin, NativeInstruction::instruction_size * 2);
- 0;
}
address InlineCacheBuffer::ic_buffer_entry_point(address code_begin) {
diff -r 4afcbbbfaf7a -r 423577eb8f6e src/cpu/aarch64/vm/nativeInst_aarch64.cpp
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.cpp Thu Sep 12 17:20:44 2013 +0100
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.cpp Fri Sep 13 18:22:52 2013 +0100
@@ -228,6 +228,8 @@
unsigned int insn = (0b000101 << 26) | ((disp >> 2) & 0x3ffffff);
+ assert(nativeInstruction_at(verified_entry)->is_jump_or_nop(),
+ "Aarch64 cannot replace non-jump with jump");
*(unsigned int*)verified_entry = insn;
ICache::invalidate_range(verified_entry, instruction_size);
}
@@ -252,21 +254,14 @@
}
// MT-safe patching of a long jump instruction.
-// First patches first word of instruction to two jmp's that jmps to them
-// selfs (spinlock). Then patches the last byte, and then atomicly replaces
-// the jmp's with the first 4 byte of the new instruction.
-//
-// FIXME: I don't think that this can be done on AArch64. The memory
-// is not coherent, so it does no matter what order we patch things
-// in. The only way to do it AFAIK is to have:
-//
-// ldr rscratch, 0f
-// b rscratch
-// 0: absolute address
-//
void NativeGeneralJump::replace_mt_safe(address instr_addr, address code_buffer) {
+ assert((! DeoptimizeWhenPatching)
+ || nativeInstruction_at(instr_addr)->is_jump_or_nop(),
+ "Aarch64 cannot replace non-jump with jump");
uint32_t instr = *(uint32_t*)code_buffer;
*(uint32_t*)instr_addr = instr;
+ ICache::invalidate_range(instr_addr, instruction_size);
}
bool NativeInstruction::is_dtrace_trap() { return false; }
+
diff -r 4afcbbbfaf7a -r 423577eb8f6e src/cpu/aarch64/vm/nativeInst_aarch64.hpp
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.hpp Thu Sep 12 17:20:44 2013 +0100
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.hpp Fri Sep 13 18:22:52 2013 +0100
@@ -55,12 +55,13 @@
friend class Relocation;
public:
enum { instruction_size = BytesPerWord };
- bool is_nop();
+ inline bool is_nop();
bool is_dtrace_trap();
inline bool is_call();
inline bool is_illegal();
inline bool is_return();
- inline bool is_jump();
+ bool is_jump();
+ inline bool is_jump_or_nop();
inline bool is_cond_jump();
bool is_safepoint_poll();
inline bool is_mov_literal64();
@@ -426,6 +427,11 @@
inline bool NativeInstruction::is_call() { Unimplemented(); return false; }
inline bool NativeInstruction::is_return() { Unimplemented(); return false; }
+inline bool NativeInstruction::is_nop() {
+ uint32_t insn = *(uint32_t*)addr_at(0);
+ return insn == 0xd503201f;
+}
+
inline bool NativeInstruction::is_jump() {
uint32_t insn = *(uint32_t*)addr_at(0);
@@ -445,6 +451,10 @@
return false;
}
+inline bool NativeInstruction::is_jump_or_nop() {
+ return is_nop() || is_jump();
+}
+
inline bool NativeInstruction::is_cond_jump() { Unimplemented(); return false; }
inline bool NativeInstruction::is_mov_literal64() { Unimplemented(); return false; }