[aarch64-port-dev ] Performance patches for aarch64 jdk7u hotspot
Andrew Dinn
adinn at redhat.com
Mon Jul 21 10:22:58 UTC 2014
Attached for review are jdk7u hotspot patches for all outstanding
performance enhancements which can be merged from upstream JDK8.
n.b. some of these patches have been tweaked relative to their
corresponding JDK8 originals to elide changes relating to upstream
Oracle shared code modifications. Also, I tweaked a couple of patches
to merge an original change set with a subsequent minor correction.
Is it ok to push?
regards,
Andrew Dinn
-----------
-------------- next part --------------
# HG changeset patch
# User adinn
# Date 1405348875 -3600
# Node ID 8a6e776a1fa00060e38420a9ecbeaad3eea28b76
# Parent a03843f2ff15e2ac97af1c0e0eec76c7819a2a1c
Performance improvement and ease of use changes pulled from upstream
These are some of the changes originally introduced in upstream jdk8
as extra edits sneaked into change set 7168, whose main purpose was to
fix AArch64 after the merge from upstream Oracle jdk8u20-b16. Other
bug fix changes that also sneaked into that same change set have
already been included in a previous patch to jdk7.
diff -r a03843f2ff15 -r 8a6e776a1fa0 agent/src/os/linux/LinuxDebuggerLocal.c
--- a/agent/src/os/linux/LinuxDebuggerLocal.c Tue Jul 08 20:15:56 2014 +0100
+++ b/agent/src/os/linux/LinuxDebuggerLocal.c Mon Jul 14 15:41:15 2014 +0100
@@ -365,7 +365,6 @@
#undef REG_INDEX
-// ECN: FIXME - add case for aarch64
#ifdef i386
#define REG_INDEX(reg) sun_jvm_hotspot_debugger_x86_X86ThreadContext_##reg
@@ -458,6 +457,13 @@
regs[REG_INDEX(R_O7)] = gregs.u_regs[14];
#endif /* sparc */
+#if defined(aarch64)
+
+#define REG_INDEX(reg) sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext_##reg
+
+#endif /* aarch64 */
+
+
(*env)->ReleaseLongArrayElements(env, array, regs, JNI_COMMIT);
return array;
diff -r a03843f2ff15 -r 8a6e776a1fa0 make/linux/makefiles/buildtree.make
--- a/make/linux/makefiles/buildtree.make Tue Jul 08 20:15:56 2014 +0100
+++ b/make/linux/makefiles/buildtree.make Mon Jul 14 15:41:15 2014 +0100
@@ -193,6 +193,7 @@
DATA_MODE/sparc = 32
DATA_MODE/sparcv9 = 64
DATA_MODE/amd64 = 64
+DATA_MODE/aarch64 = 64
DATA_MODE = $(DATA_MODE/$(BUILDARCH))
diff -r a03843f2ff15 -r 8a6e776a1fa0 make/linux/makefiles/compiler1.make
--- a/make/linux/makefiles/compiler1.make Tue Jul 08 20:15:56 2014 +0100
+++ b/make/linux/makefiles/compiler1.make Mon Jul 14 15:41:15 2014 +0100
@@ -28,7 +28,4 @@
VM_SUBDIR = client
-# for now don't make compiler1 if building aarch64
-#ifneq ($(SRCARCH), aarch64)
CFLAGS += -DCOMPILER1
-#endif
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Tue Jul 08 20:15:56 2014 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad Mon Jul 14 15:41:15 2014 +0100
@@ -2807,9 +2807,6 @@
__ call_Unimplemented();
}
%}
- // TODO
- // this only works ifdef BUILTIN_SIM
- // provide version for native AArch64 build
enc_class aarch64_enc_java_to_runtime(method meth) %{
MacroAssembler _masm(&cbuf);
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/cpu/aarch64/vm/bytecodeInterpreter_aarch64.hpp
--- a/src/cpu/aarch64/vm/bytecodeInterpreter_aarch64.hpp Tue Jul 08 20:15:56 2014 +0100
+++ b/src/cpu/aarch64/vm/bytecodeInterpreter_aarch64.hpp Mon Jul 14 15:41:15 2014 +0100
@@ -96,7 +96,7 @@
#define LOCALS_ADDR(offset) ((address)locals[-(offset)])
#define LOCALS_INT(offset) ((jint)(locals[-(offset)]))
#define LOCALS_FLOAT(offset) (*((jfloat*)&locals[-(offset)]))
-#define LOCALS_OBJECT(offset) ((oop)locals[-(offset)])
+#define LOCALS_OBJECT(offset) (cast_to_oop(locals[-(offset)]))
#define LOCALS_DOUBLE(offset) (((VMJavaVal64*)&locals[-((offset) + 1)])->d)
#define LOCALS_LONG(offset) (((VMJavaVal64*)&locals[-((offset) + 1)])->l)
#define LOCALS_LONG_AT(offset) (((address)&locals[-((offset) + 1)]))
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Tue Jul 08 20:15:56 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Mon Jul 14 15:41:15 2014 +0100
@@ -515,7 +515,7 @@
__ pop(0x3, sp); // r0 & r1
__ leave();
__ br(rscratch1);
- address polling_page(os::get_polling_page() + (SafepointPollOffset % os::vm_page_size()));
+ address polling_page(os::get_polling_page());
assert(os::is_poll_address(polling_page), "should be");
unsigned long off;
__ adrp(rscratch1, Address(polling_page, rtype), off);
@@ -534,7 +534,7 @@
// Pop the stack before the safepoint code
__ remove_frame(initial_frame_size_in_bytes());
if (UseCompilerSafepoints) {
- address polling_page(os::get_polling_page() + (SafepointPollOffset % os::vm_page_size()));
+ address polling_page(os::get_polling_page());
__ read_polling_page(rscratch1, polling_page, relocInfo::poll_return_type);
} else {
poll_for_safepoint(relocInfo::poll_return_type);
@@ -543,8 +543,7 @@
}
int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) {
- address polling_page(os::get_polling_page()
- + (SafepointPollOffset % os::vm_page_size()));
+ address polling_page(os::get_polling_page());
if (UseCompilerSafepoints) {
guarantee(info != NULL, "Shouldn't be NULL");
assert(os::is_poll_address(polling_page), "should be");
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Tue Jul 08 20:15:56 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Mon Jul 14 15:41:15 2014 +0100
@@ -1082,6 +1082,21 @@
}
break;
+ case deoptimize_id:
+ {
+ StubFrame f(sasm, "deoptimize", dont_gc_arguments);
+ OopMap* oop_map = save_live_registers(sasm);
+ int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, deoptimize));
+ oop_maps = new OopMapSet();
+ oop_maps->add_gc_map(call_offset, oop_map);
+ restore_live_registers(sasm);
+ DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob();
+ assert(deopt_blob != NULL, "deoptimization blob must have been created");
+ __ leave();
+ __ b(RuntimeAddress(deopt_blob->unpack_with_reexecution()));
+ }
+ break;
+
case throw_range_check_failed_id:
{ StubFrame f(sasm, "range_check_failed", dont_gc_arguments);
oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_range_check_exception), true);
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/cpu/aarch64/vm/frame_aarch64.inline.hpp
--- a/src/cpu/aarch64/vm/frame_aarch64.inline.hpp Tue Jul 08 20:15:56 2014 +0100
+++ b/src/cpu/aarch64/vm/frame_aarch64.inline.hpp Mon Jul 14 15:41:15 2014 +0100
@@ -47,10 +47,12 @@
inline frame::frame(intptr_t* sp, intptr_t* fp, address pc) {
intptr_t a = intptr_t(sp);
intptr_t b = intptr_t(fp);
+#ifndef PRODUCT
if (fp)
if (sp > fp || (fp - sp > 0x100000))
for(;;)
asm("nop");
+#endif
_sp = sp;
_unextended_sp = sp;
_fp = fp;
@@ -71,10 +73,12 @@
inline frame::frame(intptr_t* sp, intptr_t* unextended_sp, intptr_t* fp, address pc) {
intptr_t a = intptr_t(sp);
intptr_t b = intptr_t(fp);
+#ifndef PRODUCT
if (fp)
if (sp > fp || (fp - sp > 0x100000))
for(;;)
asm("nop");
+#endif
_sp = sp;
_unextended_sp = unextended_sp;
_fp = fp;
@@ -96,10 +100,12 @@
inline frame::frame(intptr_t* sp, intptr_t* fp) {
intptr_t a = intptr_t(sp);
intptr_t b = intptr_t(fp);
+#ifndef PRODUCT
if (fp)
if (sp > fp || (fp - sp > 0x100000))
for(;;)
asm("nop");
+#endif
_sp = sp;
_unextended_sp = sp;
_fp = fp;
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/cpu/aarch64/vm/interp_masm_aarch64.cpp
--- a/src/cpu/aarch64/vm/interp_masm_aarch64.cpp Tue Jul 08 20:15:56 2014 +0100
+++ b/src/cpu/aarch64/vm/interp_masm_aarch64.cpp Mon Jul 14 15:41:15 2014 +0100
@@ -1417,10 +1417,10 @@
Register scratch, bool preloaded,
Condition cond, Label* where) {
if (!preloaded) {
- ldr(scratch, counter_addr);
+ ldrw(scratch, counter_addr);
}
add(scratch, scratch, increment);
- str(scratch, counter_addr);
+ strw(scratch, counter_addr);
ands(scratch, scratch, mask);
br(cond, *where);
}
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/cpu/aarch64/vm/jniFastGetField_aarch64.cpp
--- a/src/cpu/aarch64/vm/jniFastGetField_aarch64.cpp Tue Jul 08 20:15:56 2014 +0100
+++ b/src/cpu/aarch64/vm/jniFastGetField_aarch64.cpp Mon Jul 14 15:41:15 2014 +0100
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, Red Hat Inc.
+ * Copyright (c) 2014, Red Hat Inc.
* Copyright (c) 2004, 2010, Oracle and/or its affiliates.
* All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -35,39 +35,136 @@
#define BUFFER_SIZE 30*wordSize
-// Instead of issuing lfence for LoadLoad barrier, we create data dependency
-// between loads, which is more efficient than lfence.
+// Instead of issuing a LoadLoad barrier we create an address
+// dependency between loads; this might be more efficient.
+
// Common register usage:
-// rax/xmm0: result
+// r0/v0: result
// c_rarg0: jni env
// c_rarg1: obj
// c_rarg2: jfield id
-// static const Register robj = r9;
-// static const Register rcounter = r10;
-// static const Register roffset = r11;
-// static const Register rcounter_addr = r11;
+static const Register robj = r3;
+static const Register rcounter = r4;
+static const Register roffset = r5;
+static const Register rcounter_addr = r6;
+static const Register result = r7;
-// Warning: do not use rip relative addressing after the first counter load
-// since that may scratch r10!
+address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) {
+ const char *name;
+ switch (type) {
+ case T_BOOLEAN: name = "jni_fast_GetBooleanField"; break;
+ case T_BYTE: name = "jni_fast_GetByteField"; break;
+ case T_CHAR: name = "jni_fast_GetCharField"; break;
+ case T_SHORT: name = "jni_fast_GetShortField"; break;
+ case T_INT: name = "jni_fast_GetIntField"; break;
+ case T_LONG: name = "jni_fast_GetLongField"; break;
+ case T_FLOAT: name = "jni_fast_GetFloatField"; break;
+ case T_DOUBLE: name = "jni_fast_GetDoubleField"; break;
+ default: ShouldNotReachHere();
+ }
+ ResourceMark rm;
+ BufferBlob* blob = BufferBlob::create(name, BUFFER_SIZE);
+ CodeBuffer cbuf(blob);
+ MacroAssembler* masm = new MacroAssembler(&cbuf);
+ address fast_entry = __ pc();
-address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) { Unimplemented(); return 0; }
+ Label slow;
-address JNI_FastGetField::generate_fast_get_boolean_field() { Unimplemented(); return 0; }
+ unsigned long offset;
+ __ adrp(rcounter_addr,
+ SafepointSynchronize::safepoint_counter_addr(), offset);
+ Address safepoint_counter_addr(rcounter_addr, offset);
+ __ ldrw(rcounter, safepoint_counter_addr);
+ __ andw(rscratch1, rcounter, 1);
+ __ cbnzw(rscratch1, slow);
+ __ eor(robj, c_rarg1, rcounter);
+ __ eor(robj, robj, rcounter); // obj, since
+ // robj ^ rcounter ^ rcounter == robj
+ // robj is address dependent on rcounter.
+ __ ldr(robj, Address(robj, 0)); // *obj
+ __ lsr(roffset, c_rarg2, 2); // offset
-address JNI_FastGetField::generate_fast_get_byte_field() { Unimplemented(); return 0; }
+ assert(count < LIST_CAPACITY, "LIST_CAPACITY too small");
+ speculative_load_pclist[count] = __ pc(); // Used by the segfault handler
+ switch (type) {
+ case T_BOOLEAN: __ ldrb (result, Address(robj, roffset)); break;
+ case T_BYTE: __ ldrsb (result, Address(robj, roffset)); break;
+ case T_CHAR: __ ldrh (result, Address(robj, roffset)); break;
+ case T_SHORT: __ ldrsh (result, Address(robj, roffset)); break;
+ case T_FLOAT: __ ldrw (result, Address(robj, roffset)); break;
+ case T_INT: __ ldrsw (result, Address(robj, roffset)); break;
+ case T_DOUBLE:
+ case T_LONG: __ ldr (result, Address(robj, roffset)); break;
+ default: ShouldNotReachHere();
+ }
-address JNI_FastGetField::generate_fast_get_char_field() { Unimplemented(); return 0; }
+ // counter_addr is address dependent on result.
+ __ eor(rcounter_addr, rcounter_addr, result);
+ __ eor(rcounter_addr, rcounter_addr, result);
+ __ ldrw(rscratch1, safepoint_counter_addr);
+ __ cmpw(rcounter, rscratch1);
+ __ br (Assembler::NE, slow);
-address JNI_FastGetField::generate_fast_get_short_field() { Unimplemented(); return 0; }
+ switch (type) {
+ case T_FLOAT: __ fmovs(v0, result); break;
+ case T_DOUBLE: __ fmovd(v0, result); break;
+ default: __ mov(r0, result); break;
+ }
+ __ ret(lr);
-address JNI_FastGetField::generate_fast_get_int_field() { Unimplemented(); return 0; }
+ slowcase_entry_pclist[count++] = __ pc();
+ __ bind(slow);
+ address slow_case_addr;
+ switch (type) {
+ case T_BOOLEAN: slow_case_addr = jni_GetBooleanField_addr(); break;
+ case T_BYTE: slow_case_addr = jni_GetByteField_addr(); break;
+ case T_CHAR: slow_case_addr = jni_GetCharField_addr(); break;
+ case T_SHORT: slow_case_addr = jni_GetShortField_addr(); break;
+ case T_INT: slow_case_addr = jni_GetIntField_addr(); break;
+ case T_LONG: slow_case_addr = jni_GetLongField_addr(); break;
+ case T_FLOAT: slow_case_addr = jni_GetFloatField_addr(); break;
+ case T_DOUBLE: slow_case_addr = jni_GetDoubleField_addr(); break;
+ default: ShouldNotReachHere();
+ }
+ // tail call
+ __ lea(rscratch1, ExternalAddress(slow_case_addr));
+ __ br(rscratch1);
-address JNI_FastGetField::generate_fast_get_long_field() { Unimplemented(); return 0; }
+ __ flush ();
-address JNI_FastGetField::generate_fast_get_float_field0(BasicType type) { Unimplemented(); return 0; }
+ return fast_entry;
+}
-address JNI_FastGetField::generate_fast_get_float_field() { Unimplemented(); return 0; }
+address JNI_FastGetField::generate_fast_get_boolean_field() {
+ return generate_fast_get_int_field0(T_BOOLEAN);
+}
-address JNI_FastGetField::generate_fast_get_double_field() { Unimplemented(); return 0; }
+address JNI_FastGetField::generate_fast_get_byte_field() {
+ return generate_fast_get_int_field0(T_BYTE);
+}
+
+address JNI_FastGetField::generate_fast_get_char_field() {
+ return generate_fast_get_int_field0(T_CHAR);
+}
+
+address JNI_FastGetField::generate_fast_get_short_field() {
+ return generate_fast_get_int_field0(T_SHORT);
+}
+
+address JNI_FastGetField::generate_fast_get_int_field() {
+ return generate_fast_get_int_field0(T_INT);
+}
+
+address JNI_FastGetField::generate_fast_get_long_field() {
+ return generate_fast_get_int_field0(T_LONG);
+}
+
+address JNI_FastGetField::generate_fast_get_float_field() {
+ return generate_fast_get_int_field0(T_FLOAT);
+}
+
+address JNI_FastGetField::generate_fast_get_double_field() {
+ return generate_fast_get_int_field0(T_DOUBLE);
+}
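
For readers less familiar with the JNI fast-accessor scheme, the stub
generated above follows the usual speculative protocol: read the
safepoint counter, bail out to the slow path if a safepoint is in
progress (counter is odd), do the raw field load, then re-read the
counter and fall back to the slow path if it changed (the object may
have been moved by a GC in between). The double eor only exists to give
the object load an address dependency on the counter load, which orders
the two loads on AArch64 without an explicit barrier. A rough C-style
sketch of the same logic (hypothetical helper names; error handling and
the segfault-handler bookkeeping omitted):

    // Sketch only: read_safepoint_counter() and slow_path_get_int() are
    // hypothetical stand-ins, not HotSpot functions.
    jint fast_GetIntField(JNIEnv* env, jobject obj, jfieldID id) {
      unsigned counter = read_safepoint_counter();
      if (counter & 1)                            // safepoint in progress
        return slow_path_get_int(env, obj, id);
      oop o = *(oop*)obj;                         // resolve the JNI handle
      intptr_t offset = (intptr_t)id >> 2;        // jfieldID encodes the field offset
      jint value = *(jint*)((char*)o + offset);   // speculative load
      if (read_safepoint_counter() != counter)    // counter changed: retry slowly
        return slow_path_get_int(env, obj, id);
      return value;
    }
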
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Jul 08 20:15:56 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon Jul 14 15:41:15 2014 +0100
@@ -3079,6 +3079,7 @@
}
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
+ relocInfo::relocType rtype = dest.rspec().reloc()->type();
if (labs(pc() - dest.target()) >= (1LL << 32)) {
// Out of range. This doesn't happen very often, but we have to
// handle it
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Tue Jul 08 20:15:56 2014 +0100
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Mon Jul 14 15:41:15 2014 +0100
@@ -1924,7 +1924,7 @@
if(os::is_MP()) {
if (UseMembar) {
// Force this write out before the read below
- __ dsb(Assembler::SY);
+ __ dmb(Assembler::SY);
} else {
// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
diff -r a03843f2ff15 -r 8a6e776a1fa0 src/share/vm/c1/c1_Runtime1.cpp
--- a/src/share/vm/c1/c1_Runtime1.cpp Tue Jul 08 20:15:56 2014 +0100
+++ b/src/share/vm/c1/c1_Runtime1.cpp Mon Jul 14 15:41:15 2014 +0100
@@ -225,11 +225,6 @@
#if defined(SPARC) || defined(PPC)
case handle_exception_nofpu_id: // Unused on sparc
#endif
-#ifdef TARGET_ARCH_aarch64
- case throw_index_exception_id:
- case throw_array_store_exception_id:
- case deoptimize_id:
-#endif
break;
// All other stubs should have oopmaps
-------------- next part --------------
# HG changeset patch
# User Edward Nevill edward.nevill at linaro.org
# Date 1402568896 -3600
# Node ID 55084fca52d279e90686b5cc53bf87aa853a3c75
# Parent 1b3757e98d39e89faa65c719951d4b273908433c
Add support for Neon implementation of CRC32
diff -r 1b3757e98d39 -r 55084fca52d2 src/cpu/aarch64/vm/assembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp Wed Jun 11 13:51:03 2014 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp Thu Jun 12 11:28:16 2014 +0100
@@ -1842,6 +1842,225 @@
fmovd(Vn, zr);
}
+/* SIMD extensions
+ *
+ * We just use FloatRegister in the following. They are exactly the same
+ * as SIMD registers.
+ */
+public:
+
+ enum SIMD_Arrangement {
+ T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D
+ };
+
+ enum SIMD_RegVariant {
+ S32, D64, Q128
+ };
+
+ void v_shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift){
+ starti;
+ /* The encodings for the immh:immb fields (bits 22:16) are
+ * 0001 xxx 8B/16B, shift = xxx
+ * 001x xxx 4H/8H, shift = xxxx
+ * 01xx xxx 2S/4S, shift = xxxxx
+ * 1xxx xxx 1D/2D, shift = xxxxxx (1D is RESERVED)
+ */
+ assert((1 << ((T>>1)+3)) > shift, "Invalid Shift value");
+ f(0, 31), f(T & 1, 30), f(0b0011110, 29, 23), f((1 << ((T>>1)+3))|shift, 22, 16);
+ f(0b010101, 15, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+
+ void v_ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+ starti;
+ /* The encodings for the immh:immb fields (bits 22:16) are
+ * 0001 xxx 8H, 8B/16b shift = xxx
+ * 001x xxx 4S, 4H/8H shift = xxxx
+ * 01xx xxx 2D, 2S/4S shift = xxxxx
+ * 1xxx xxx RESERVED
+ */
+ assert((Tb >> 1) + 1 == (Ta >> 1), "Incompatible arrangement");
+ assert((1 << ((Tb>>1)+3)) > shift, "Invalid shift value");
+ f(0, 31), f(Tb & 1, 30), f(0b1011110, 29, 23), f((1 << ((Tb>>1)+3))|shift, 22, 16);
+ f(0b101001, 15, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+ void v_ushll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+ v_ushll(Vd, Ta, Vn, Tb, shift);
+ }
+
+ void v_uzp1(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement T, int op = 0){
+ starti;
+ f(0, 31), f((T & 0x1), 30), f(0b001110, 29, 24), f((T >> 1), 23, 22), f(0, 21);
+ rf(Vm, 16), f(0, 15), f(op, 14), f(0b0110, 13, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+ void v_uzp2(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement T){
+ v_uzp1(Vd, Vn, Vm, T, 1);
+ }
+
+ // Move from general purpose register
+ // mov Vd.T[index], Rn
+ void v_mov(FloatRegister Vd, SIMD_Arrangement T, int index, Register Xn) {
+ starti;
+ f(0b01001110000, 31, 21), f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
+ f(0b000111, 15, 10), rf(Xn, 5), rf(Vd, 0);
+ }
+
+ // Move to general purpose register
+ // mov Rd, Vn.T[index]
+ void v_mov(Register Xd, FloatRegister Vn, SIMD_Arrangement T, int index) {
+ starti;
+ f(0, 31), f((T >= T1D) ? 1:0, 30), f(0b001110000, 29, 21);
+ f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
+ f(0b001111, 15, 10), rf(Vn, 5), rf(Xd, 0);
+ }
+
+ // We do not handle the 1Q arrangement.
+ void v_pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+ starti;
+ assert(Ta == T8H && (Tb == T8B || Tb == T16B), "Invalid Size specifier");
+ f(0, 31), f(Tb & 1, 30), f(0b001110001, 29, 21), rf(Vm, 16), f(0b111000, 15, 10);
+ rf(Vn, 5), rf(Vd, 0);
+ }
+ void v_pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+ v_pmull(Vd, Ta, Vn, Vm, Tb);
+ }
+
+ void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
+ starti;
+ f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0111, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b1010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0110, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+ f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+
+ void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((8 << ((int)T & 1)) == imm, "size/imm mismatch");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0111, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0111, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((16 << ((int)T & 1)) == imm, "size/imm mismatch");
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b1010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b1010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((24 << ((int)T & 1)) == imm, "size/imm mismatch");
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0110, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0110, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((32 << ((int)T & 1)) == imm, "size/imm mismatch");
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+
+ void v_st1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
+ starti;
+ f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0111, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_st1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b1010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0110, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+ f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+
+ void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
+ starti;
+ f(0, 31), f((int)T & 1, 30), f(0b001101010000001100, 29, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ f(0, 31), f((int)T & 1, 30), f(0b001101110, 29, 21), rf(Xm, 16);
+ f(0b1100, 15, 12), f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((1 << ((int)T & 3)) == imm, "size/imm mismatch");
+ f(0, 31), f((int)T & 1, 30), f(0b001101110111111100, 29, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+
+ void v_eor(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) {
+ starti;
+ assert(T == T8B || T == T16B, "must be T8B or T16B");
+ f(0, 31), f((int)T & 1, 30), f(0b101110001, 29, 21);
+ rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+
+
+
/* Simulator extensions to the ISA
haltsim
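
To make the immh:immb field computation in v_shl and v_ushll above
concrete, here is the arithmetic for two cases (a sketch based on the
enum declaration order T8B=0, T16B=1, T4H=2, T8H=3, T2S=4, T4S=5,
T1D=6, T2D=7):

    // v_shl(Vd, Vn, T16B, 3):  T=1, T>>1=0, so (1 << 3) | 3  = 0b0001011  (8B/16B, shift 3)
    // v_shl(Vd, Vn, T4S, 17):  T=5, T>>1=2, so (1 << 5) | 17 = 0b0110001  (2S/4S,  shift 17)
    // The assert (1 << ((T>>1)+3)) > shift rejects out-of-range shifts,
    // e.g. a shift of 8 for a byte arrangement.
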
diff -r 1b3757e98d39 -r 55084fca52d2 src/cpu/aarch64/vm/globals_aarch64.hpp
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp Wed Jun 11 13:51:03 2014 +0100
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp Thu Jun 12 11:28:16 2014 +0100
@@ -100,6 +100,8 @@
notproduct(bool, UseAcqRelForVolatileFields, false, \
"Use acquire and release insns for volatile fields")
+// Don't attempt to use Neon on builtin sim until builtin sim supports it
+#define UseNeon false
#else
#define UseBuiltinSim false
@@ -115,7 +117,9 @@
"doptimize instead of patching instructions") \
\
notproduct(bool, UseAcqRelForVolatileFields, false, \
- "Use acquire and release insns for volatile fields")
+ "Use acquire and release insns for volatile fields") \
+ product(bool, UseNeon, false, \
+ "Use Neon for CRC32 computation")
#endif
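
Note that the Neon path added below is opt-in: it is only used when the
new product flag is enabled on the command line with -XX:+UseNeon (it
defaults to false), and on the builtin simulator the #define above pins
UseNeon to false, so the flag has no effect there.
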
diff -r 1b3757e98d39 -r 55084fca52d2 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Wed Jun 11 13:51:03 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Jun 12 11:28:16 2014 +0100
@@ -2152,14 +2152,151 @@
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
Register table0, Register table1, Register table2, Register table3,
Register tmp, Register tmp2, Register tmp3) {
- Label L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
+ Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
unsigned long offset;
+
ornw(crc, zr, crc);
adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
if (offset) add(table0, table0, offset);
add(table1, table0, 1*256*sizeof(juint));
add(table2, table0, 2*256*sizeof(juint));
add(table3, table0, 3*256*sizeof(juint));
+
+ if (UseNeon) {
+ cmp(len, 64);
+ br(Assembler::LT, L_by16);
+ v_eor(v16, T16B, v16, v16);
+
+ Label L_fold;
+
+ add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
+
+ v_ld1(v0, v1, T2D, buf, 32);
+ v_ld1r(v4, T2D, tmp, 8);
+ v_ld1r(v5, T2D, tmp, 8);
+ v_ld1r(v6, T2D, tmp, 8);
+ v_ld1r(v7, T2D, tmp, 8);
+ v_mov(v16, T4S, 0, crc);
+
+ v_eor(v0, T16B, v0, v16);
+ sub(len, len, 64);
+
+ BIND(L_fold);
+ v_pmull(v22, T8H, v0, v5, T8B);
+ v_pmull(v20, T8H, v0, v7, T8B);
+ v_pmull(v23, T8H, v0, v4, T8B);
+ v_pmull(v21, T8H, v0, v6, T8B);
+
+ v_pmull2(v18, T8H, v0, v5, T16B);
+ v_pmull2(v16, T8H, v0, v7, T16B);
+ v_pmull2(v19, T8H, v0, v4, T16B);
+ v_pmull2(v17, T8H, v0, v6, T16B);
+
+ v_uzp1(v24, v20, v22, T8H);
+ v_uzp2(v25, v20, v22, T8H);
+ v_eor(v20, T16B, v24, v25);
+
+ v_uzp1(v26, v16, v18, T8H);
+ v_uzp2(v27, v16, v18, T8H);
+ v_eor(v16, T16B, v26, v27);
+
+ v_ushll2(v22, T4S, v20, T8H, 8);
+ v_ushll(v20, T4S, v20, T4H, 8);
+
+ v_ushll2(v18, T4S, v16, T8H, 8);
+ v_ushll(v16, T4S, v16, T4H, 8);
+
+ v_eor(v22, T16B, v23, v22);
+ v_eor(v18, T16B, v19, v18);
+ v_eor(v20, T16B, v21, v20);
+ v_eor(v16, T16B, v17, v16);
+
+ v_uzp1(v17, v16, v20, T2D);
+ v_uzp2(v21, v16, v20, T2D);
+ v_eor(v17, T16B, v17, v21);
+
+ v_ushll2(v20, T2D, v17, T4S, 16);
+ v_ushll(v16, T2D, v17, T2S, 16);
+
+ v_eor(v20, T16B, v20, v22);
+ v_eor(v16, T16B, v16, v18);
+
+ v_uzp1(v17, v20, v16, T2D);
+ v_uzp2(v21, v20, v16, T2D);
+ v_eor(v28, T16B, v17, v21);
+
+ v_pmull(v22, T8H, v1, v5, T8B);
+ v_pmull(v20, T8H, v1, v7, T8B);
+ v_pmull(v23, T8H, v1, v4, T8B);
+ v_pmull(v21, T8H, v1, v6, T8B);
+
+ v_pmull2(v18, T8H, v1, v5, T16B);
+ v_pmull2(v16, T8H, v1, v7, T16B);
+ v_pmull2(v19, T8H, v1, v4, T16B);
+ v_pmull2(v17, T8H, v1, v6, T16B);
+
+ v_ld1(v0, v1, T2D, buf, 32);
+
+ v_uzp1(v24, v20, v22, T8H);
+ v_uzp2(v25, v20, v22, T8H);
+ v_eor(v20, T16B, v24, v25);
+
+ v_uzp1(v26, v16, v18, T8H);
+ v_uzp2(v27, v16, v18, T8H);
+ v_eor(v16, T16B, v26, v27);
+
+ v_ushll2(v22, T4S, v20, T8H, 8);
+ v_ushll(v20, T4S, v20, T4H, 8);
+
+ v_ushll2(v18, T4S, v16, T8H, 8);
+ v_ushll(v16, T4S, v16, T4H, 8);
+
+ v_eor(v22, T16B, v23, v22);
+ v_eor(v18, T16B, v19, v18);
+ v_eor(v20, T16B, v21, v20);
+ v_eor(v16, T16B, v17, v16);
+
+ v_uzp1(v17, v16, v20, T2D);
+ v_uzp2(v21, v16, v20, T2D);
+ v_eor(v16, T16B, v17, v21);
+
+ v_ushll2(v20, T2D, v16, T4S, 16);
+ v_ushll(v16, T2D, v16, T2S, 16);
+
+ v_eor(v20, T16B, v22, v20);
+ v_eor(v16, T16B, v16, v18);
+
+ v_uzp1(v17, v20, v16, T2D);
+ v_uzp2(v21, v20, v16, T2D);
+ v_eor(v20, T16B, v17, v21);
+
+ v_shl(v16, v28, T2D, 1);
+ v_shl(v17, v20, T2D, 1);
+
+ v_eor(v0, T16B, v0, v16);
+ v_eor(v1, T16B, v1, v17);
+
+ subs(len, len, 32);
+ br(Assembler::GE, L_fold);
+
+ mov(crc, 0);
+ v_mov(tmp, v0, T1D, 0);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ v_mov(tmp, v0, T1D, 1);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ v_mov(tmp, v1, T1D, 0);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ v_mov(tmp, v1, T1D, 1);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+
+ add(len, len, 32);
+ }
+
+ BIND(L_by16);
subs(len, len, 16);
br(Assembler::GE, L_by16_loop);
adds(len, len, 16-4);
diff -r 1b3757e98d39 -r 55084fca52d2 src/cpu/aarch64/vm/stubRoutines_aarch64.cpp
--- a/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp Wed Jun 11 13:51:03 2014 +0100
+++ b/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp Thu Jun 12 11:28:16 2014 +0100
@@ -265,5 +265,12 @@
0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
- 0xde0506f1UL
+ 0xde0506f1UL,
+ // Constants for Neon CRC32 implementation
+ // k3 = 0x78ED02D5 = x^288 mod poly - bit reversed
+ // k4 = 0xED627DAE = x^256 mod poly - bit reversed
+ 0x78ED02D5UL, 0xED627DAEUL, // k4:k3
+ 0xED78D502UL, 0x62EDAE7DUL, // byte swap
+ 0x02D578EDUL, 0x7DAEED62UL, // word swap
+ 0xD502ED78UL, 0xAE7D62EDUL, // byte swap of word swap

};
-------------- next part --------------
# HG changeset patch
# User Edward Nevill edward.nevill at linaro.org
# Date 1402950043 -3600
# Node ID ca4f6b4fdf4cb9bfee38eade22b6fff1407c5825
# Parent 55084fca52d279e90686b5cc53bf87aa853a3c75
Add support for builtin crc32 instructions
diff -r 55084fca52d2 -r ca4f6b4fdf4c src/cpu/aarch64/vm/assembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp Thu Jun 12 11:28:16 2014 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp Mon Jun 16 21:20:43 2014 +0100
@@ -2059,6 +2059,20 @@
rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);
}
+ // CRC32 instructions
+#define INSN(NAME, sf, sz) \
+ void NAME(Register Rd, Register Rn, Register Rm) { \
+ starti; \
+ f(sf, 31), f(0b0011010110, 30, 21), f(0b0100, 15, 12), f(sz, 11, 10); \
+ rf(Rm, 16), rf(Rn, 5), rf(Rd, 0); \
+ }
+
+ INSN(crc32b, 0, 0b00);
+ INSN(crc32h, 0, 0b01);
+ INSN(crc32w, 0, 0b10);
+ INSN(crc32x, 1, 0b11);
+
+#undef INSN
/* Simulator extensions to the ISA
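
For illustration, the INSN macro above simply stamps out one assembler
method per CRC32 variant; the crc32w case is nothing more than the
straight macro expansion:

    void crc32w(Register Rd, Register Rn, Register Rm) {
      starti;
      f(0, 31), f(0b0011010110, 30, 21), f(0b0100, 15, 12), f(0b10, 11, 10);
      rf(Rm, 16), rf(Rn, 5), rf(Rd, 0);
    }
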
diff -r 55084fca52d2 -r ca4f6b4fdf4c src/cpu/aarch64/vm/globals_aarch64.hpp
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp Thu Jun 12 11:28:16 2014 +0100
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp Mon Jun 16 21:20:43 2014 +0100
@@ -102,6 +102,7 @@
// Don't attempt to use Neon on builtin sim until builtin sim supports it
#define UseNeon false
+#define UseCRC32 false
#else
#define UseBuiltinSim false
@@ -119,7 +120,9 @@
notproduct(bool, UseAcqRelForVolatileFields, false, \
"Use acquire and release insns for volatile fields") \
product(bool, UseNeon, false, \
- "Use Neon for CRC32 computation")
+ "Use Neon for CRC32 computation") \
+ product(bool, UseCRC32, false, \
+ "Use CRC32 instructions for CRC32 computation")
#endif
diff -r 55084fca52d2 -r ca4f6b4fdf4c src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Jun 12 11:28:16 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon Jun 16 21:20:43 2014 +0100
@@ -2156,6 +2156,57 @@
unsigned long offset;
ornw(crc, zr, crc);
+
+ if (UseCRC32) {
+ Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
+
+ subs(len, len, 64);
+ br(Assembler::GE, CRC_by64_loop);
+ adds(len, len, 64-4);
+ br(Assembler::GE, CRC_by4_loop);
+ adds(len, len, 4);
+ br(Assembler::GT, CRC_by1_loop);
+ b(L_exit);
+
+ BIND(CRC_by4_loop);
+ ldrw(tmp, Address(post(buf, 4)));
+ subs(len, len, 4);
+ crc32w(crc, crc, tmp);
+ br(Assembler::GE, CRC_by4_loop);
+ adds(len, len, 4);
+ br(Assembler::LE, L_exit);
+ BIND(CRC_by1_loop);
+ ldrb(tmp, Address(post(buf, 1)));
+ subs(len, len, 1);
+ crc32b(crc, crc, tmp);
+ br(Assembler::GT, CRC_by1_loop);
+ b(L_exit);
+
+ align(CodeEntryAlignment);
+ BIND(CRC_by64_loop);
+ subs(len, len, 64);
+ ldp(tmp, tmp3, Address(post(buf, 16)));
+ crc32x(crc, crc, tmp);
+ crc32x(crc, crc, tmp3);
+ ldp(tmp, tmp3, Address(post(buf, 16)));
+ crc32x(crc, crc, tmp);
+ crc32x(crc, crc, tmp3);
+ ldp(tmp, tmp3, Address(post(buf, 16)));
+ crc32x(crc, crc, tmp);
+ crc32x(crc, crc, tmp3);
+ ldp(tmp, tmp3, Address(post(buf, 16)));
+ crc32x(crc, crc, tmp);
+ crc32x(crc, crc, tmp3);
+ br(Assembler::GE, CRC_by64_loop);
+ adds(len, len, 64-4);
+ br(Assembler::GE, CRC_by4_loop);
+ adds(len, len, 4);
+ br(Assembler::GT, CRC_by1_loop);
+ BIND(L_exit);
+ ornw(crc, zr, crc);
+ return;
+ }
+
adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
if (offset) add(table0, table0, offset);
add(table1, table0, 1*256*sizeof(juint));
diff -r 55084fca52d2 -r ca4f6b4fdf4c src/cpu/aarch64/vm/vm_version_aarch64.cpp
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp Thu Jun 12 11:28:16 2014 +0100
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp Mon Jun 16 21:20:43 2014 +0100
@@ -35,6 +35,16 @@
# include "os_linux.inline.hpp"
#endif
+#ifndef BUILTIN_SIM
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1<<7)
+#endif
+
+#endif
+
int VM_Version::_cpu;
int VM_Version::_model;
int VM_Version::_stepping;
@@ -92,6 +102,16 @@
FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
+#ifndef BUILTIN_SIM
+ unsigned long auxv = getauxval(AT_HWCAP);
+ if (FLAG_IS_DEFAULT(UseCRC32)) {
+ UseCRC32 = (auxv & HWCAP_CRC32) != 0;
+ }
+ if (UseCRC32 && (auxv & HWCAP_CRC32) == 0) {
+ warning("UseCRC32 specified, but not supported on this CPU");
+ }
+#endif
+
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
UseCRC32Intrinsics = true;
}
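
The feature detection above is the standard Linux hwcap query; a
minimal standalone equivalent (a sketch, independent of the VM) would
be:

    #include <stdio.h>
    #include <sys/auxv.h>
    #include <asm/hwcap.h>

    #ifndef HWCAP_CRC32
    #define HWCAP_CRC32 (1 << 7)   // same fallback as the patch, for old headers
    #endif

    int main(void) {
      unsigned long hwcap = getauxval(AT_HWCAP);
      printf("CRC32 instructions %savailable\n",
             (hwcap & HWCAP_CRC32) ? "" : "not ");
      return 0;
    }
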
-------------- next part --------------
# HG changeset patch
# User aph
# Date 1402917903 14400
# Node ID e1af2a22237811f065ecadbcbc2113e102fc7a29
# Parent 8cb098504801769e6c53eec016a1767b0aa59c79
All address constants are 48 bits in size.
diff -r 8cb098504801 -r e1af2a222378 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Thu Jun 05 13:48:13 2014 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad Mon Jun 16 07:25:03 2014 -0400
@@ -864,7 +864,8 @@
if (cb) {
return 4;
} else {
- return 20;
+ // A 48-bit address. See movptr().
+ return 16;
}
}
@@ -2099,7 +2100,6 @@
// movz xscratch1 0xnnnn <-- current pc is here
// movk xscratch1 0xnnnn
// movk xscratch1 0xnnnn
- // movk xscratch1 0xnnnn
// str xscratch1, [xthread,#anchor_pc_off]
// mov xscratch2, sp
// str xscratch2, [xthread,#anchor_sp_off
@@ -2111,7 +2111,6 @@
// movz xscratch1 0xnnnn
// movk xscratch1 0xnnnn
// movk xscratch1 0xnnnn
- // movk xscratch1 0xnnnn
// blrt xscratch1
// . . .
//
@@ -2121,18 +2120,18 @@
// stub. we assert that nargs is < 7.
//
// so the offset we need to add to the pc (in 32-bit words) is
- // 4 + <-- load 64 bit constant return pc
+ // 3 + <-- load 48-bit constant return pc
// 1 + <-- write anchor pc
// 1 + <-- copy sp
// 1 + <-- write anchor sp
// nargs + <-- java stub arg count
// 1 + <-- extra thread arg
// [ 1 + ] <-- optional ret address of stub caller
- // 4 + <-- load 64 bit call target address
+ // 3 + <-- load 48-bit call target address
// 1 <-- blrt instruction
//
- // i.e we need to add (nargs + 13) * 4 bytes or (nargs + 14) * 4 bytes
- //
+ // i.e we need to add (nargs + 11) * 4 bytes or (nargs + 12) * 4 bytes
+ //
enc_class aarch64_enc_save_pc() %{
Compile* C = ra_->C;
@@ -2141,7 +2140,7 @@
assert(nargs <= 8, "opto runtime stub has more than 8 args!");
MacroAssembler _masm(&cbuf);
address pc = __ pc();
- int call_offset = (nargs + 13) * 4;
+ int call_offset = (nargs + 11) * 4;
int field_offset = in_bytes(JavaThread::frame_anchor_offset()) +
in_bytes(JavaFrameAnchor::last_Java_pc_offset());
__ mov(rscratch1, InternalAddress(pc + call_offset));
diff -r 8cb098504801 -r e1af2a222378 src/cpu/aarch64/vm/assembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp Thu Jun 05 13:48:13 2014 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp Mon Jun 16 07:25:03 2014 -0400
@@ -1273,7 +1273,7 @@
if (rtype == relocInfo::none)
__ mov(r, target());
else
- __ mov64(r, (uint64_t)target());
+ __ movptr(r, (uint64_t)target());
break;
}
default:
diff -r 8cb098504801 -r e1af2a222378 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Jun 05 13:48:13 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon Jun 16 07:25:03 2014 -0400
@@ -65,6 +65,7 @@
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
void MacroAssembler::pd_patch_instruction(address branch, address target) {
+ assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
long offset = (target - branch) >> 2;
unsigned insn = *(unsigned*)branch;
if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
@@ -139,10 +140,11 @@
} else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
// Move wide constant
u_int64_t dest = (u_int64_t)target;
+ assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
+ assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
Instruction_aarch64::patch(branch += 4, 20, 5, (dest >>= 16) & 0xffff);
Instruction_aarch64::patch(branch += 4, 20, 5, (dest >>= 16) & 0xffff);
- Instruction_aarch64::patch(branch += 4, 20, 5, (dest >>= 16));
} else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
// nothing to do
@@ -216,14 +218,13 @@
ShouldNotReachHere();
}
} else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
- // Move wide constant
- // FIXME: We assume these instructions are movz, movk, movk, movk.
- // We don't assert this; we should.
+ // Move address constant: movz, movk, movk. See movptr().
u_int32_t *insns = (u_int32_t *)insn_addr;
+ assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
+ assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
+ (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
- + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32)
- + (u_int64_t(Instruction_aarch64::extract(insns[3], 20, 5)) << 48));
+ + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
} else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
return 0;
@@ -1246,10 +1247,14 @@
InstructionMark im(this);
code_section()->relocate(inst_mark(), dest.rspec());
u_int64_t imm64 = (u_int64_t)dest.target();
- mov64(r, imm64);
+ movptr(r, imm64);
}
-void MacroAssembler::mov64(Register r, uintptr_t imm64) {
+// Move a constant pointer into r. In AArch64 mode the virtual
+// address space is 48 bits in size, so we only need three
+// instructions to create a patchable instruction sequence that can
+// reach anywhere.
+void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
{
char buffer[64];
@@ -1257,13 +1262,12 @@
block_comment(buffer);
}
#endif
+ assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
movz(r, imm64 & 0xffff);
imm64 >>= 16;
movk(r, imm64 & 0xffff, 16);
imm64 >>= 16;
movk(r, imm64 & 0xffff, 32);
- imm64 >>= 16;
- movk(r, imm64 & 0xffff, 48);
}
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
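
As a concrete example of the new three-instruction sequence (the
address below is arbitrary, chosen only for illustration):

    // movptr(r, 0x00007f1234567890UL) now emits:
    //   movz r, #0x7890              // bits 15:0
    //   movk r, #0x3456, lsl #16     // bits 31:16
    //   movk r, #0x7f12, lsl #32     // bits 47:32
    // The former fourth instruction, movk r, #0x0000, lsl #48, is dropped:
    // any address movptr accepts (the new assert enforces imm64 < 2^48)
    // has zero in bits 63:48.
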
diff -r 8cb098504801 -r e1af2a222378 src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Jun 05 13:48:13 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Mon Jun 16 07:25:03 2014 -0400
@@ -439,7 +439,7 @@
}
void mov(Register dst, Address a);
- void mov64(Register r, uintptr_t imm64);
+ void movptr(Register r, uintptr_t imm64);
// macro instructions for accessing and updating floating point
// status register
diff -r 8cb098504801 -r e1af2a222378 src/cpu/aarch64/vm/nativeInst_aarch64.cpp
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.cpp Thu Jun 05 13:48:13 2014 +0100
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.cpp Mon Jun 16 07:25:03 2014 -0400
@@ -207,6 +207,14 @@
Instruction_aarch64::extract(insn, 4, 0) == 0b11111);
}
+bool NativeInstruction::is_movz() {
+ return Instruction_aarch64::extract(int_at(0), 30, 23) == 0b10100101;
+}
+
+bool NativeInstruction::is_movk() {
+ return Instruction_aarch64::extract(int_at(0), 30, 23) == 0b11100101;
+}
+
// MT safe inserting of a jump over an unknown instruction sequence (used by nmethod::makeZombie)
void NativeJump::patch_verified_entry(address entry, address verified_entry, address dest) {
diff -r 8cb098504801 -r e1af2a222378 src/cpu/aarch64/vm/nativeInst_aarch64.hpp
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.hpp Thu Jun 05 13:48:13 2014 +0100
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.hpp Mon Jun 16 07:25:03 2014 -0400
@@ -65,6 +65,8 @@
inline bool is_cond_jump();
bool is_safepoint_poll();
inline bool is_mov_literal64();
+ bool is_movz();
+ bool is_movk();
protected:
address addr_at(int offset) const { return address(this) + offset; }
@@ -105,11 +107,12 @@
};
inline NativeInstruction* nativeInstruction_at(address address) {
- NativeInstruction* inst = (NativeInstruction*)address;
-#ifdef ASSERT
- //inst->verify();
-#endif
- return inst;
+ return (NativeInstruction*)address;
+}
+
+// The natural type of an AArch64 instruction is uint32_t
+inline NativeInstruction* nativeInstruction_at(uint32_t *address) {
+ return (NativeInstruction*)address;
}
inline NativeCall* nativeCall_at(address address);
@@ -204,19 +207,21 @@
class NativeMovConstReg: public NativeInstruction {
public:
enum Aarch64_specific_constants {
- instruction_size = 4 * 4,
+ instruction_size = 3 * 4, // movz, movk, movk. See movptr().
instruction_offset = 0,
displacement_offset = 0,
};
address instruction_address() const { return addr_at(instruction_offset); }
address next_instruction_address() const {
- if (is_adrp_at(instruction_address()))
+ if (nativeInstruction_at(instruction_address())->is_movz())
+ // Assume movz, movk, movk
+ return addr_at(instruction_size);
+ else if (is_adrp_at(instruction_address()))
return addr_at(2*4);
else if (is_ldr_literal_at(instruction_address()))
return(addr_at(4));
- else
- return addr_at(instruction_size);
+ assert(false, "Unknown instruction in NativeMovConstReg");
}
intptr_t data() const;
-------------- next part --------------
# HG changeset patch
# User adinn
# Date 1405428701 -3600
# Node ID df067bdbe075d945fa818c5b92b3054bf845f7a2
# Parent c6a46cb37f857ff67e38f2ecc008760492048204
Use 2- and 3-instruction immediate form of movoop and mov_metadata in C2-generated code.
Fix patching code to handle 2- and 3-word forms.
Use lea (rather than mov) in mov(reg, Address) form.
diff -r c6a46cb37f85 -r df067bdbe075 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Mon Jun 16 07:25:03 2014 -0400
+++ b/src/cpu/aarch64/vm/aarch64.ad Tue Jul 15 13:51:41 2014 +0100
@@ -2058,7 +2058,7 @@
int call_offset = (nargs + 11) * 4;
int field_offset = in_bytes(JavaThread::frame_anchor_offset()) +
in_bytes(JavaFrameAnchor::last_Java_pc_offset());
- __ mov(rscratch1, InternalAddress(pc + call_offset));
+ __ lea(rscratch1, InternalAddress(pc + call_offset));
__ str(rscratch1, Address(rthread, field_offset));
%}
@@ -2487,9 +2487,9 @@
} else {
relocInfo::relocType rtype = $src->constant_reloc();
if (rtype == relocInfo::oop_type) {
- __ movoop(dst_reg, (jobject)con);
+ __ movoop(dst_reg, (jobject)con, /*mt_safe*/false);
} else if (rtype == relocInfo::metadata_type) {
- __ mov_metadata(dst_reg, (Metadata*)con);
+ __ mov_metadata(dst_reg, (Metadata*)con, /*mt_safe*/false);
} else {
assert(rtype == relocInfo::none, "unexpected reloc type");
__ lea(dst_reg, Address(con, rtype));
@@ -2536,7 +2536,7 @@
} else {
relocInfo::relocType rtype = $src->constant_reloc();
assert(rtype == relocInfo::oop_type, "unexpected reloc type");
- __ set_narrow_oop(dst_reg, (jobject)con);
+ __ set_narrow_oop(dst_reg, (jobject)con, /*mt_safe*/false);
}
%}
@@ -2555,7 +2555,7 @@
} else {
relocInfo::relocType rtype = $src->constant_reloc();
assert(rtype == relocInfo::metadata_type, "unexpected reloc type");
- __ set_narrow_klass(dst_reg, (Klass *)con);
+ __ set_narrow_klass(dst_reg, (Klass *)con, /*mt_safe*/false);
}
%}
@@ -2823,7 +2823,7 @@
int fpcnt;
int rtype;
getCallInfo(tf(), gpcnt, fpcnt, rtype);
- __ mov(rscratch1, RuntimeAddress(entry));
+ __ lea(rscratch1, RuntimeAddress(entry));
__ blrt(rscratch1, gpcnt, fpcnt, rtype);
}
%}
diff -r c6a46cb37f85 -r df067bdbe075 src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Mon Jun 16 07:25:03 2014 -0400
+++ b/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
@@ -63,7 +63,7 @@
set_last_Java_frame(sp, rfp, retaddr, rscratch1);
// do the call
- mov(rscratch1, RuntimeAddress(entry));
+ lea(rscratch1, RuntimeAddress(entry));
blrt(rscratch1, args_size + 1, 8, 1);
bind(retaddr);
int call_offset = offset();
@@ -553,7 +553,7 @@
Label retaddr;
__ set_last_Java_frame(sp, rfp, retaddr, rscratch1);
// do the call
- __ mov(rscratch1, RuntimeAddress(target));
+ __ lea(rscratch1, RuntimeAddress(target));
__ blrt(rscratch1, 1, 0, 1);
__ bind(retaddr);
OopMapSet* oop_maps = new OopMapSet();
diff -r c6a46cb37f85 -r df067bdbe075 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon Jun 16 07:25:03 2014 -0400
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
@@ -137,14 +137,15 @@
offset >>= 2;
Instruction_aarch64::spatch(branch, 23, 5, offset);
Instruction_aarch64::patch(branch, 30, 29, offset_lo);
- } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
+ } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
+ u_int64_t dest = (u_int64_t)target;
// Move wide constant
- u_int64_t dest = (u_int64_t)target;
assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
- Instruction_aarch64::patch(branch += 4, 20, 5, (dest >>= 16) & 0xffff);
- Instruction_aarch64::patch(branch += 4, 20, 5, (dest >>= 16) & 0xffff);
+ Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
+ Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
+ assert(pd_call_destination(branch) == target, "should be");
} else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
// nothing to do
@@ -154,6 +155,19 @@
}
}
+void MacroAssembler::patch_oop(address insn_addr, address o) {
+ unsigned insn = *(unsigned*)insn_addr;
+ if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
+ // Move narrow constant
+ assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
+ narrowOop n = oopDesc::encode_heap_oop((oop)o);
+ Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
+ Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
+ } else {
+ pd_patch_instruction(insn_addr, o);
+ }
+}
+
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
long offset = 0;
if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
@@ -218,8 +232,8 @@
ShouldNotReachHere();
}
} else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
- // Move address constant: movz, movk, movk. See movptr().
u_int32_t *insns = (u_int32_t *)insn_addr;
+ // Move wide constant: movz, movk, movk. See movptr().
assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
@@ -2708,29 +2722,33 @@
decode_klass_not_null(r, r);
}
-// TODO
-//
-// these next two methods load a narrow oop or klass constant into a
-// register. they currently do the dumb thing of installing 64 bits of
-// unencoded constant into the register and then encoding it.
-// installing the encoded 32 bit constant directly requires updating
-// the relocation code so it can recognize that this is a 32 bit load
-// rather than a 64 bit load.
-
-void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
- assert (UseCompressedOops, "should only be used for compressed headers");
+void MacroAssembler::set_narrow_oop(Register dst, jobject obj, bool mt_safe) {
+ assert (UseCompressedOops, "should only be used for compressed oops");
assert (Universe::heap() != NULL, "java heap should be initialized");
assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
- movoop(dst, obj);
- encode_heap_oop_not_null(dst);
+
+ int oop_index = oop_recorder()->find_index(obj);
+ assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
+
+ InstructionMark im(this);
+ RelocationHolder rspec = oop_Relocation::spec(oop_index);
+ code_section()->relocate(inst_mark(), rspec);
+ movz(dst, 0xDEAD, 16);
+ movk(dst, 0xBEEF);
}
-
-void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
+void MacroAssembler::set_narrow_klass(Register dst, Klass* k, bool mt_safe) {
assert (UseCompressedClassPointers, "should only be used for compressed headers");
assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
- mov_metadata(dst, k);
- encode_klass_not_null(dst);
+ int index = oop_recorder()->find_index(k);
+ assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
+
+ InstructionMark im(this);
+ RelocationHolder rspec = metadata_Relocation::spec(index);
+ code_section()->relocate(inst_mark(), rspec);
+ narrowKlass nk = Klass::encode_klass(k);
+ movz(dst, (nk >> 16), 16);
+ movk(dst, nk & 0xffff);
}
void MacroAssembler::load_heap_oop(Register dst, Address src)
@@ -2750,7 +2768,7 @@
decode_heap_oop_not_null(dst);
} else {
ldr(dst, src);
- }
+ }
}
void MacroAssembler::store_heap_oop(Address dst, Register src) {
@@ -2952,7 +2970,11 @@
return Address((address)obj, rspec);
}
-void MacroAssembler::movoop(Register dst, jobject obj) {
+// Move an oop into a register. mt_safe is true iff this instruction
+// may be patched while the code is being executed by another thread;
+// in that case we must use the constant pool rather than move
+// immediates.
+void MacroAssembler::movoop(Register dst, jobject obj, bool mt_safe) {
int oop_index;
if (obj == NULL) {
oop_index = oop_recorder()->allocate_oop_index(obj);
@@ -2961,7 +2983,7 @@
assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
}
RelocationHolder rspec = oop_Relocation::spec(oop_index);
- address const_ptr = long_constant((jlong)obj);
+ address const_ptr = mt_safe ? long_constant((jlong)obj) : NULL;
if (! const_ptr) {
mov(dst, Address((address)obj, rspec));
} else {
@@ -2970,7 +2992,8 @@
}
}
-void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
+// Move a metadata address into a register.
+void MacroAssembler::mov_metadata(Register dst, Metadata* obj, bool mt_safe) {
int oop_index;
if (obj == NULL) {
oop_index = oop_recorder()->allocate_metadata_index(obj);
@@ -2978,7 +3001,7 @@
oop_index = oop_recorder()->find_index(obj);
}
RelocationHolder rspec = metadata_Relocation::spec(oop_index);
- address const_ptr = long_constant((jlong)obj);
+ address const_ptr = mt_safe ? long_constant((jlong)obj) : NULL;
if (! const_ptr) {
mov(dst, Address((address)obj, rspec));
} else {
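
To make the new narrow-constant form concrete (the value below is made
up): if a compressed oop encodes to 0x12345678, patch_oop rewrites the
movz/movk placeholder pair emitted by set_narrow_oop so that it is
equivalent to:

    //   movz dst, #0x1234, lsl #16   // high 16 bits of the 32-bit narrow oop
    //   movk dst, #0x5678            // low 16 bits
    // The register ends up holding the 32-bit encoded value directly,
    // instead of the old scheme of materialising the full 64-bit oop and
    // then re-encoding it.
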
diff -r c6a46cb37f85 -r df067bdbe075 src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Mon Jun 16 07:25:03 2014 -0400
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Jul 15 13:51:41 2014 +0100
@@ -410,6 +410,8 @@
int push(unsigned int bitset, Register stack);
int pop(unsigned int bitset, Register stack);
+ void mov(Register dst, Address a);
+
public:
int push(RegSet regs, Register stack) { if (regs.bits()) push(regs.bits(), stack); }
int pop(RegSet regs, Register stack) { if (regs.bits()) pop(regs.bits(), stack); }
@@ -442,7 +444,6 @@
mov(dst, (long)i);
}
- void mov(Register dst, Address a);
void movptr(Register r, uintptr_t imm64);
// macro instructions for accessing and updating floating point
@@ -497,6 +498,8 @@
static void pd_print_patched_instruction(address branch);
#endif
+ static void patch_oop(address insn_addr, address o);
+
// The following 4 methods return the offset of the appropriate move instruction
// Support for fast byte/short loading with zero extension (depending on particular CPU)
@@ -738,7 +741,7 @@
void encode_heap_oop_not_null(Register dst, Register src);
void decode_heap_oop_not_null(Register dst, Register src);
- void set_narrow_oop(Register dst, jobject obj);
+ void set_narrow_oop(Register dst, jobject obj, bool mt_safe = true);
// currently unimplemented
#if 0
void set_narrow_oop(Address dst, jobject obj);
@@ -751,7 +754,7 @@
void encode_klass_not_null(Register dst, Register src);
void decode_klass_not_null(Register dst, Register src);
- void set_narrow_klass(Register dst, Klass* k);
+ void set_narrow_klass(Register dst, Klass* k, bool mt_safe = true);
// currently unimplemented
#if 0
void set_narrow_klass(Address dst, Klass* k);
@@ -1107,7 +1110,7 @@
// Data
- void mov_metadata(Register dst, Metadata* obj);
+ void mov_metadata(Register dst, Metadata* obj, bool mt_safe = true);
Address allocate_metadata_address(Metadata* obj);
Address constant_oop_address(jobject obj);
// unimplemented
@@ -1115,7 +1118,7 @@
void pushoop(jobject obj);
#endif
- void movoop(Register dst, jobject obj);
+ void movoop(Register dst, jobject obj, bool mt_safe = true);
// sign extend as need a l to ptr sized element
void movl2ptr(Register dst, Address src) { Unimplemented(); }
diff -r c6a46cb37f85 -r df067bdbe075 src/cpu/aarch64/vm/relocInfo_aarch64.cpp
--- a/src/cpu/aarch64/vm/relocInfo_aarch64.cpp Mon Jun 16 07:25:03 2014 -0400
+++ b/src/cpu/aarch64/vm/relocInfo_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
@@ -33,10 +33,15 @@
void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) {
- MacroAssembler::pd_patch_instruction(addr(), x);
+ switch(type()) {
+ case relocInfo::oop_type:
+ MacroAssembler::patch_oop(addr(), x);
+ break;
+ default:
+ MacroAssembler::pd_patch_instruction(addr(), x);
+ }
}
-
address Relocation::pd_call_destination(address orig_addr) {
if (orig_addr != NULL) {
return MacroAssembler::pd_call_destination(orig_addr);
diff -r c6a46cb37f85 -r df067bdbe075 src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Mon Jun 16 07:25:03 2014 -0400
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
@@ -316,7 +316,7 @@
__ mov(c_rarg0, rmethod);
__ mov(c_rarg1, lr);
- __ mov(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
__ blrt(rscratch1, 2, 0, 0);
__ pop_CPU_state();
@@ -1166,7 +1166,7 @@
} else {
assert((unsigned)gpargs < 256, "eek!");
assert((unsigned)fpargs < 32, "eek!");
- __ mov(rscratch1, RuntimeAddress(dest));
+ __ lea(rscratch1, RuntimeAddress(dest));
__ mov(rscratch2, (gpargs << 6) | (fpargs << 2) | type);
__ blrt(rscratch1, rscratch2);
// __ blrt(rscratch1, gpargs, fpargs, type);
@@ -1963,9 +1963,9 @@
assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
if (!is_critical_native) {
- __ mov(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
} else {
- __ mov(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
}
__ blrt(rscratch1, 1, 0, 1);
// Restore any method result value
@@ -2386,7 +2386,7 @@
}
#endif // ASSERT
__ mov(c_rarg0, rthread);
- __ mov(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
__ blrt(rscratch1, 1, 0, 1);
__ bind(retaddr);
@@ -2516,7 +2516,7 @@
__ mov(c_rarg0, rthread);
__ movw(c_rarg1, rcpool); // second arg: exec_mode
- __ mov(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
__ blrt(rscratch1, 2, 0, 0);
// Set an oopmap for the call site
@@ -2869,7 +2869,7 @@
__ set_last_Java_frame(sp, noreg, retaddr, rscratch1);
__ mov(c_rarg0, rthread);
- __ mov(rscratch1, RuntimeAddress(destination));
+ __ lea(rscratch1, RuntimeAddress(destination));
__ blrt(rscratch1, 1, 0, 1);
__ bind(retaddr);
diff -r c6a46cb37f85 -r df067bdbe075 src/cpu/aarch64/vm/templateTable_aarch64.cpp
--- a/src/cpu/aarch64/vm/templateTable_aarch64.cpp Mon Jun 16 07:25:03 2014 -0400
+++ b/src/cpu/aarch64/vm/templateTable_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
@@ -2432,7 +2432,7 @@
// we take the time to call into the VM.
Label L1;
assert_different_registers(cache, index, r0);
- __ mov(rscratch1, ExternalAddress((address)JvmtiExport::get_field_modification_count_addr()));
+ __ lea(rscratch1, ExternalAddress((address)JvmtiExport::get_field_modification_count_addr()));
__ ldrw(r0, Address(rscratch1));
__ cbz(r0, L1);
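
A note for reviewers on the movoop()/mov_metadata() change above: the new
mt_safe flag selects between two ways of materialising the 64-bit value.
One path loads it from a constant-pool slot, a data word sitting next to
the code, so the value can be repointed later while other threads keep
executing the unchanged ldr. The other path emits movz/movk immediates,
which are cheaper but would require rewriting the instruction words
themselves. A minimal standalone sketch (not HotSpot code, names invented
here) of why the constant-pool form is the patch-friendly one:

#include <atomic>
#include <cstdint>
#include <cstdio>

// Stand-in for a 64-bit constant-pool entry sitting next to generated code.
std::atomic<uint64_t> cpool_slot{0};

uint64_t mutator_reads_oop() {                 // what the emitted ldr does
  return cpool_slot.load(std::memory_order_relaxed);
}

void patcher_updates_oop(uint64_t new_oop) {   // what later oop patching does
  cpool_slot.store(new_oop, std::memory_order_relaxed);
}

int main() {
  patcher_updates_oop(0x1234);
  printf("0x%llx\n", (unsigned long long) mutator_reads_oop());
  return 0;
}
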
-------------- next part --------------
# HG changeset patch
# User adinn
# Date 1405431794 -3600
# Node ID 632fea5ccac12d83ff4fb476e10d644c81bc7c35
# Parent df067bdbe075d945fa818c5b92b3054bf845f7a2
Save intermediate state before removing C1 patching code.
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad Tue Jul 15 14:43:14 2014 +0100
@@ -751,19 +751,7 @@
int MachCallDynamicJavaNode::ret_addr_offset()
{
- // call should be
- // ldr_constant
- // bl
- // where ldr_constant is either
- // ldr // if NearCpool
- // or
- // adrp // if !NearCPool
- // ldr
- int off = 8;
- if (!NearCpool) {
- off += 4;
- }
- return off;
+ return 16; // movz, movk, movk, bl
}
int MachCallRuntimeNode::ret_addr_offset() {
@@ -2487,9 +2475,9 @@
} else {
relocInfo::relocType rtype = $src->constant_reloc();
if (rtype == relocInfo::oop_type) {
- __ movoop(dst_reg, (jobject)con, /*mt_safe*/false);
+ __ movoop(dst_reg, (jobject)con);
} else if (rtype == relocInfo::metadata_type) {
- __ mov_metadata(dst_reg, (Metadata*)con, /*mt_safe*/false);
+ __ mov_metadata(dst_reg, (Metadata*)con);
} else {
assert(rtype == relocInfo::none, "unexpected reloc type");
__ lea(dst_reg, Address(con, rtype));
@@ -2536,7 +2524,7 @@
} else {
relocInfo::relocType rtype = $src->constant_reloc();
assert(rtype == relocInfo::oop_type, "unexpected reloc type");
- __ set_narrow_oop(dst_reg, (jobject)con, /*mt_safe*/false);
+ __ set_narrow_oop(dst_reg, (jobject)con);
}
%}
@@ -2555,7 +2543,7 @@
} else {
relocInfo::relocType rtype = $src->constant_reloc();
assert(rtype == relocInfo::metadata_type, "unexpected reloc type");
- __ set_narrow_klass(dst_reg, (Klass *)con, /*mt_safe*/false);
+ __ set_narrow_klass(dst_reg, (Klass *)con);
}
%}
@@ -2747,8 +2735,6 @@
address mark = __ pc();
address addr = (address)$meth$$method;
if (!_method) {
- // TODO check this
- // think we are calling generated Java here not x86
// A call to a runtime wrapper, e.g. new, new_typeArray_Java, uncommon_trap.
__ bl(Address(addr, relocInfo::runtime_call_type));
} else if (_optimized_virtual) {
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/assembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp Tue Jul 15 14:43:14 2014 +0100
@@ -1243,7 +1243,7 @@
f(size & 0b01, 31, 30), f(0b011, 29, 27), f(0b00, 25, 24);
long offset = (adr.target() - pc()) >> 2;
sf(offset, 23, 5);
-#ifdef ASSERT
+#if 0
Relocation* reloc = adr.rspec().reloc();
relocInfo::relocType rtype = (relocInfo::relocType) reloc->type();
assert(rtype == relocInfo::internal_word_type,
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
@@ -200,7 +200,9 @@
if (Address::offset_ok_for_immed(addr_offset, addr->scale()))
return Address(base, addr_offset, Address::lsl(addr->scale()));
else {
- address const_addr = int_constant(addr_offset);
+ // This is a rather long-winded instruction sequence, but the
+ // offset is atomically patchable. See PatchingStub::install().
+ Address const_addr = InternalAddress(int_constant(addr_offset));
__ ldr_constant(tmp, const_addr);
return Address(base, tmp, Address::lsl(addr->scale()));
}
@@ -314,19 +316,7 @@
if (o == NULL) {
__ mov(reg, zr);
} else {
- int oop_index = __ oop_recorder()->find_index(o);
- assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(o)), "should be real oop");
- RelocationHolder rspec = oop_Relocation::spec(oop_index);
- address const_ptr = int_constant(jlong(o));
- __ code()->consts()->relocate(const_ptr, rspec);
- __ ldr_constant(reg, const_ptr);
-
- if (PrintRelocations && Verbose) {
- puts("jobject2reg:\n");
- printf("oop %p at %p\n", o, const_ptr);
- fflush(stdout);
- das((uint64_t)__ pc(), -2);
- }
+ __ movoop(reg, o, /*immediate*/true);
}
}
@@ -334,13 +324,16 @@
void LIR_Assembler::jobject2reg_with_patching(Register reg, CodeEmitInfo *info) {
// Allocate a new index in table to hold the object once it's been patched
int oop_index = __ oop_recorder()->allocate_oop_index(NULL);
-// PatchingStub* patch = new PatchingStub(_masm, PatchingStub::load_mirror_id, oop_index);
PatchingStub* patch = new PatchingStub(_masm, patching_id(info), oop_index);
- RelocationHolder rspec = oop_Relocation::spec(oop_index);
- address const_ptr = int_constant(-1);
- __ code()->consts()->relocate(const_ptr, rspec);
- __ ldr_constant(reg, const_ptr);
+ if (DeoptimizeWhenPatching) {
+ __ nop();
+ } else {
+ RelocationHolder rspec = oop_Relocation::spec(oop_index);
+ address const_ptr = int_constant(-1);
+ __ code()->consts()->relocate(const_ptr, rspec);
+ __ ldr_constant(reg, InternalAddress(const_ptr));
+ }
patching_epilog(patch, lir_patch_normal, reg, info);
}
@@ -924,7 +917,10 @@
void LIR_Assembler::klass2reg_with_patching(Register reg, CodeEmitInfo* info) {
Metadata* o = NULL;
PatchingStub* patch = new PatchingStub(_masm, PatchingStub::load_klass_id);
- __ mov_metadata(reg, o);
+ if (DeoptimizeWhenPatching)
+ __ nop();
+ else
+ __ mov_metadata(reg, o);
patching_epilog(patch, lir_patch_normal, reg, info);
}
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/compiledIC_aarch64.cpp
--- a/src/cpu/aarch64/vm/compiledIC_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/compiledIC_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
@@ -139,6 +139,7 @@
// Update stub.
method_holder->set_data((intptr_t)callee());
+ method_holder->flush();
jump->set_jump_destination(entry);
// Update jump to call.
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
@@ -622,9 +622,10 @@
void MacroAssembler::ic_call(address entry) {
RelocationHolder rh = virtual_call_Relocation::spec(pc());
- address const_ptr = long_constant((jlong)Universe::non_oop_word());
- unsigned long offset;
- ldr_constant(rscratch2, const_ptr);
+ // address const_ptr = long_constant((jlong)Universe::non_oop_word());
+ // unsigned long offset;
+ // ldr_constant(rscratch2, const_ptr);
+ movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
call(Address(entry, rh));
}
@@ -2722,7 +2723,7 @@
decode_klass_not_null(r, r);
}
-void MacroAssembler::set_narrow_oop(Register dst, jobject obj, bool mt_safe) {
+void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
assert (UseCompressedOops, "should only be used for compressed oops");
assert (Universe::heap() != NULL, "java heap should be initialized");
assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
@@ -2737,7 +2738,7 @@
movk(dst, 0xBEEF);
}
-void MacroAssembler::set_narrow_klass(Register dst, Klass* k, bool mt_safe) {
+void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
assert (UseCompressedClassPointers, "should only be used for compressed headers");
assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
int index = oop_recorder()->find_index(k);
@@ -2970,11 +2971,11 @@
return Address((address)obj, rspec);
}
-// Move an oop into a register. mt_safe is true iff we are not going
-// to patch this instruction while the code is being executed by
-// another thread. In that case we can use move immediates rather
-// than the constant pool.
-void MacroAssembler::movoop(Register dst, jobject obj, bool mt_safe) {
+// Move an oop into a register. immediate is true if we want
+// immediate instructions, i.e. we are not going to patch this
+// instruction while the code is being executed by another thread. In
+// that case we can use move immediates rather than the constant pool.
+void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
int oop_index;
if (obj == NULL) {
oop_index = oop_recorder()->allocate_oop_index(obj);
@@ -2983,17 +2984,16 @@
assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
}
RelocationHolder rspec = oop_Relocation::spec(oop_index);
- address const_ptr = mt_safe ? long_constant((jlong)obj) : NULL;
- if (! const_ptr) {
+ if (! immediate) {
+ address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
+ ldr_constant(dst, Address(dummy, rspec));
+ } else
mov(dst, Address((address)obj, rspec));
- } else {
- code()->consts()->relocate(const_ptr, rspec);
- ldr_constant(dst, const_ptr);
- }
+
}
// Move a metadata address into a register.
-void MacroAssembler::mov_metadata(Register dst, Metadata* obj, bool mt_safe) {
+void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
int oop_index;
if (obj == NULL) {
oop_index = oop_recorder()->allocate_metadata_index(obj);
@@ -3001,13 +3001,7 @@
oop_index = oop_recorder()->find_index(obj);
}
RelocationHolder rspec = metadata_Relocation::spec(oop_index);
- address const_ptr = mt_safe ? long_constant((jlong)obj) : NULL;
- if (! const_ptr) {
- mov(dst, Address((address)obj, rspec));
- } else {
- code()->consts()->relocate(const_ptr, rspec);
- ldr_constant(dst, const_ptr);
- }
+ mov(dst, Address((address)obj, rspec));
}
Address MacroAssembler::constant_oop_address(jobject obj) {
@@ -3296,6 +3290,11 @@
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
relocInfo::relocType rtype = dest.rspec().reloc()->type();
if (labs(pc() - dest.target()) >= (1LL << 32)) {
+ guarantee(rtype == relocInfo::none
+ || rtype == relocInfo::external_word_type
+ || rtype == relocInfo::poll_type
+ || rtype == relocInfo::poll_return_type,
+ "can only use a fixed address with an ADRP");
// Out of range. This doesn't happen very often, but we have to
// handle it
mov(reg1, dest);
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Jul 15 14:43:14 2014 +0100
@@ -741,7 +741,7 @@
void encode_heap_oop_not_null(Register dst, Register src);
void decode_heap_oop_not_null(Register dst, Register src);
- void set_narrow_oop(Register dst, jobject obj, bool mt_safe = true);
+ void set_narrow_oop(Register dst, jobject obj);
// currently unimplemented
#if 0
void set_narrow_oop(Address dst, jobject obj);
@@ -754,7 +754,7 @@
void encode_klass_not_null(Register dst, Register src);
void decode_klass_not_null(Register dst, Register src);
- void set_narrow_klass(Register dst, Klass* k, bool mt_safe = true);
+ void set_narrow_klass(Register dst, Klass* k);
// currently unimplemented
#if 0
void set_narrow_klass(Address dst, Klass* k);
@@ -1110,7 +1110,7 @@
// Data
- void mov_metadata(Register dst, Metadata* obj, bool mt_safe = true);
+ void mov_metadata(Register dst, Metadata* obj);
Address allocate_metadata_address(Metadata* obj);
Address constant_oop_address(jobject obj);
// unimplemented
@@ -1118,7 +1118,7 @@
void pushoop(jobject obj);
#endif
- void movoop(Register dst, jobject obj, bool mt_safe = true);
+ void movoop(Register dst, jobject obj, bool immediate = false);
// sign extend as need a l to ptr sized element
void movl2ptr(Register dst, Address src) { Unimplemented(); }
@@ -1260,13 +1260,12 @@
Label* retaddr = NULL
);
- void ldr_constant(Register dest, address const_addr) {
- guarantee(const_addr, "constant pool overflow");
+ void ldr_constant(Register dest, const Address &const_addr) {
if (NearCpool) {
- ldr(dest, const_addr, relocInfo::internal_word_type);
+ ldr(dest, const_addr);
} else {
unsigned long offset;
- adrp(dest, InternalAddress(const_addr), offset);
+ adrp(dest, InternalAddress(const_addr.target()), offset);
ldr(dest, Address(dest, offset));
}
}
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/nativeInst_aarch64.cpp
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
@@ -53,13 +53,6 @@
// Inserts a native call instruction at a given pc
void NativeCall::insert(address code_pos, address entry) { Unimplemented(); }
-// MT-safe patching of a call instruction.
-// First patches first word of instruction to two jmp's that jmps to them
-// selfs (spinlock). Then patches the last byte, and then atomicly replaces
-// the jmp's with the first 4 byte of the new instruction.
-void NativeCall::replace_mt_safe(address instr_addr, address code_buffer) { Unimplemented(); }
-
-
void NativeMovConstReg::verify() {
// make sure code pattern is actually mov reg64, imm64 instructions
}
@@ -83,7 +76,6 @@
}
};
-
void NativeMovConstReg::print() {
tty->print_cr(PTR_FORMAT ": mov reg, " INTPTR_FORMAT,
instruction_address(), data());
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/nativeInst_aarch64.hpp
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.hpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.hpp Tue Jul 15 14:43:14 2014 +0100
@@ -202,8 +202,8 @@
return call;
}
-// An interface for accessing/manipulating native mov reg, imm32 instructions.
-// (used to manipulate inlined 32bit data dll calls, etc.)
+// An interface for accessing/manipulating native mov reg, imm instructions.
+// (used to manipulate inlined 64-bit data calls, etc.)
class NativeMovConstReg: public NativeInstruction {
public:
enum Aarch64_specific_constants {
@@ -227,6 +227,12 @@
intptr_t data() const;
void set_data(intptr_t x);
+ void flush() {
+ if (! maybe_cpool_ref(instruction_address())) {
+ ICache::invalidate_range(instruction_address(), instruction_size);
+ }
+ }
+
void verify();
void print();
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/relocInfo_aarch64.cpp
--- a/src/cpu/aarch64/vm/relocInfo_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/relocInfo_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
@@ -35,10 +35,20 @@
void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) {
switch(type()) {
case relocInfo::oop_type:
- MacroAssembler::patch_oop(addr(), x);
+ {
+ oop_Relocation *reloc = (oop_Relocation *)this;
+ if (NativeInstruction::is_ldr_literal_at(addr())) {
+ address constptr = (address)code()->oop_addr_at(reloc->oop_index());
+ MacroAssembler::pd_patch_instruction(addr(), constptr);
+ assert(*(address*)constptr == x, "error in oop relocation");
+ } else{
+ MacroAssembler::patch_oop(addr(), x);
+ }
+ }
break;
default:
MacroAssembler::pd_patch_instruction(addr(), x);
+ break;
}
}
diff -r df067bdbe075 -r 632fea5ccac1 src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
@@ -1733,7 +1733,9 @@
if (method->is_static() && !is_critical_native) {
// load oop into a register
- __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
+ __ movoop(oop_handle_reg,
+ JNIHandles::make_local(method->method_holder()->java_mirror()),
+ /*immediate*/true);
// Now handlize the static class mirror it's known not-null.
__ str(oop_handle_reg, Address(sp, klass_offset));
diff -r df067bdbe075 -r 632fea5ccac1 src/os_cpu/linux_aarch64/vm/os_linux_aarch64.cpp
--- a/src/os_cpu/linux_aarch64/vm/os_linux_aarch64.cpp Tue Jul 15 13:51:41 2014 +0100
+++ b/src/os_cpu/linux_aarch64/vm/os_linux_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
@@ -100,7 +100,7 @@
// even in its subfields (as defined by the CPU immediate fields,
// if the CPU splits constants across multiple instructions).
- return (char*) -1;
+ return (char*) 0xffffffffffff;
}
void os::initialize_thread(Thread *thr) {
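
For reference, the new ret_addr_offset() value of 16 in this patch counts
the movz/movk/movk that materialise the 48-bit callee address plus the bl
itself, four 4-byte instructions. A throwaway sketch (not HotSpot code,
address value invented) of how such an address splits into the three
16-bit immediates:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t target = 0x00007f81234567bcULL;              // hypothetical address
  unsigned lo  = (unsigned)( target        & 0xffff);   // movz dst, #lo
  unsigned mid = (unsigned)((target >> 16) & 0xffff);   // movk dst, #mid, lsl #16
  unsigned hi  = (unsigned)((target >> 32) & 0xffff);   // movk dst, #hi,  lsl #32
  printf("movz 0x%x ; movk 0x%x lsl 16 ; movk 0x%x lsl 32 ; bl ...\n",
         lo, mid, hi);
  return 0;
}
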
-------------- next part --------------
# HG changeset patch
# User adinn
# Date 1405437601 -3600
# Node ID 11351da11922e5298def06ef484fd977bc6b3970
# Parent 632fea5ccac12d83ff4fb476e10d644c81bc7c35
Remove obsolete C1 patching code.
diff -r 632fea5ccac1 -r 11351da11922 src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp Tue Jul 15 16:20:01 2014 +0100
@@ -320,133 +320,7 @@
}
void PatchingStub::emit_code(LIR_Assembler* ce) {
- assert(NativeCall::instruction_size <= _bytes_to_copy && _bytes_to_copy <= 0xFF, "not enough room for call");
-
- Label call_patch;
-
- // static field accesses have special semantics while the class
- // initializer is being run so we emit a test which can be used to
- // check that this code is being executed by the initializing
- // thread.
- address being_initialized_entry = __ pc();
- if (CommentedAssembly) {
- __ block_comment(" patch template");
- }
-
- // make a copy the code which is going to be patched.
- for (int i = 0; i < _bytes_to_copy; i++) {
- address ptr = (address)(_pc_start + i);
- int a_byte = (*ptr) & 0xFF;
- __ emit_int8(a_byte);
- }
-
- address end_of_patch = __ pc();
- int bytes_to_skip = 0;
- if (_id == load_mirror_id) {
- int offset = __ offset();
- if (CommentedAssembly) {
- __ block_comment(" being_initialized check");
- }
- assert(_obj != noreg, "must be a valid register");
- Register tmp = r0;
- Register tmp2 = r19;
- __ stp(tmp, tmp2, Address(__ pre(sp, -2 * wordSize)));
- // Load without verification to keep code size small. We need it because
- // begin_initialized_entry_offset has to fit in a byte. Also, we know it's not null.
- __ ldr(tmp2, Address(_obj, java_lang_Class::klass_offset_in_bytes()));
- __ ldr(tmp, Address(tmp2, InstanceKlass::init_thread_offset()));
- __ cmp(rthread, tmp);
- __ ldp(tmp, tmp2, Address(__ post(sp, 2 * wordSize)));
- __ br(Assembler::NE, call_patch);
-
- // access_field patches may execute the patched code before it's
- // copied back into place so we need to jump back into the main
- // code of the nmethod to continue execution.
- __ b(_patch_site_continuation);
-
- // make sure this extra code gets skipped
- bytes_to_skip += __ offset() - offset;
- }
- if (CommentedAssembly) {
- __ block_comment("patch data");
- }
- // Now emit the patch record telling the runtime how to find the
- // pieces of the patch.
- int sizeof_patch_record = 8;
- bytes_to_skip += sizeof_patch_record;
-
- // emit the offsets needed to find the code to patch
- int being_initialized_entry_offset = __ pc() - being_initialized_entry + sizeof_patch_record;
-
- // If this is a field access, the offset is held in the constant
- // pool rather than embedded in the instruction, so we don't copy
- // any instructions: we set the value in the constant pool and
- // overwrite the NativeGeneralJump.
- {
- Label L;
- __ br(Assembler::AL, L);
- __ emit_int8(0);
- __ emit_int8(being_initialized_entry_offset);
- if (_id == access_field_id) {
- __ emit_int8(bytes_to_skip + _bytes_to_copy);
- __ emit_int8(0);
- } else {
- __ emit_int8(bytes_to_skip);
- __ emit_int8(_bytes_to_copy);
- }
- __ bind(L);
- }
-
- address patch_info_pc = __ pc();
- assert(patch_info_pc - end_of_patch == bytes_to_skip, "incorrect patch info");
-
- address entry = __ pc();
- NativeGeneralJump::insert_unconditional((address)_pc_start, entry);
- address target = NULL;
- relocInfo::relocType reloc_type = relocInfo::none;
-
- switch (_id) {
- case access_field_id:
- target = Runtime1::entry_for(Runtime1::access_field_patching_id);
- reloc_type = relocInfo::section_word_type;
- break;
- case load_klass_id:
- target = Runtime1::entry_for(Runtime1::load_klass_patching_id);
- reloc_type = relocInfo::metadata_type;
- break;
- case load_mirror_id:
- target = Runtime1::entry_for(Runtime1::load_mirror_patching_id);
- reloc_type = relocInfo::oop_type;
- break;
- case load_appendix_id:
- target = Runtime1::entry_for(Runtime1::load_appendix_patching_id);
- reloc_type = relocInfo::oop_type;
- break;
- default: ShouldNotReachHere();
- }
-
- __ bind(call_patch);
-
- if (CommentedAssembly) {
- __ block_comment("patch entry point");
- }
- __ bl(RuntimeAddress(target));
- assert(_patch_info_offset == (patch_info_pc - __ pc()), "must not change");
- ce->add_call_info_here(_info);
- int jmp_off = __ offset();
- __ b(_patch_site_entry);
- // Add enough nops so deoptimization can overwrite the jmp above with a call
- // and not destroy the world.
- // FIXME: AArch64 doesn't really need this
- // __ nop(); __ nop();
- // if (_id == load_klass_id
- // || _id == load_mirror_id
- // || _id == access_field_id
- // ) {
- // CodeSection* cs = __ code_section();
- // RelocIterator iter(cs, (address)_pc_start, (address)(_pc_start + 1));
- // relocInfo::change_reloc_info_for_address(&iter, (address) _pc_start, reloc_type, relocInfo::none);
- // }
+ assert(false, "AArch64 should not use C1 runtime patching");
}
diff -r 632fea5ccac1 -r 11351da11922 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Tue Jul 15 16:20:01 2014 +0100
@@ -26,6 +26,7 @@
#include "precompiled.hpp"
#include "asm/assembler.hpp"
+#include "c1/c1_CodeStubs.hpp"
#include "c1/c1_Compilation.hpp"
#include "c1/c1_LIRAssembler.hpp"
#include "c1/c1_MacroAssembler.hpp"
@@ -200,10 +201,7 @@
if (Address::offset_ok_for_immed(addr_offset, addr->scale()))
return Address(base, addr_offset, Address::lsl(addr->scale()));
else {
- // This is a rather long-winded instruction sequence, but the
- // offset is atomically patchable. See PatchingStub::install().
- Address const_addr = InternalAddress(int_constant(addr_offset));
- __ ldr_constant(tmp, const_addr);
+ __ mov(tmp, addr_offset);
return Address(base, tmp, Address::lsl(addr->scale()));
}
}
@@ -321,20 +319,36 @@
}
+void LIR_Assembler::deoptimize_trap(CodeEmitInfo *info) {
+ address target = NULL;
+ relocInfo::relocType reloc_type = relocInfo::none;
+
+ switch (patching_id(info)) {
+ case PatchingStub::access_field_id:
+ target = Runtime1::entry_for(Runtime1::access_field_patching_id);
+ reloc_type = relocInfo::section_word_type;
+ break;
+ case PatchingStub::load_klass_id:
+ target = Runtime1::entry_for(Runtime1::load_klass_patching_id);
+ reloc_type = relocInfo::metadata_type;
+ break;
+ case PatchingStub::load_mirror_id:
+ target = Runtime1::entry_for(Runtime1::load_mirror_patching_id);
+ reloc_type = relocInfo::oop_type;
+ break;
+ case PatchingStub::load_appendix_id:
+ target = Runtime1::entry_for(Runtime1::load_appendix_patching_id);
+ reloc_type = relocInfo::oop_type;
+ break;
+ default: ShouldNotReachHere();
+ }
+
+ __ bl(RuntimeAddress(target));
+ add_call_info_here(info);
+}
+
void LIR_Assembler::jobject2reg_with_patching(Register reg, CodeEmitInfo *info) {
- // Allocate a new index in table to hold the object once it's been patched
- int oop_index = __ oop_recorder()->allocate_oop_index(NULL);
- PatchingStub* patch = new PatchingStub(_masm, patching_id(info), oop_index);
-
- if (DeoptimizeWhenPatching) {
- __ nop();
- } else {
- RelocationHolder rspec = oop_Relocation::spec(oop_index);
- address const_ptr = int_constant(-1);
- __ code()->consts()->relocate(const_ptr, rspec);
- __ ldr_constant(reg, InternalAddress(const_ptr));
- }
- patching_epilog(patch, lir_patch_normal, reg, info);
+ deoptimize_trap(info);
}
@@ -801,23 +815,21 @@
PatchingStub* patch = NULL;
Register compressed_src = rscratch1;
+ if (patch_code != lir_patch_none) {
+ deoptimize_trap(info);
+ return;
+ }
+
if (type == T_ARRAY || type == T_OBJECT) {
__ verify_oop(src->as_register());
if (UseCompressedOops && !wide) {
__ encode_heap_oop(compressed_src, src->as_register());
- if (patch_code != lir_patch_none) {
- info->oop_map()->set_narrowoop(compressed_src->as_VMReg());
- }
} else {
compressed_src = src->as_register();
}
}
- if (patch_code != lir_patch_none) {
- patch = new PatchingStub(_masm, PatchingStub::access_field_id);
- }
-
int null_check_here = code_offset();
switch (type) {
case T_FLOAT: {
@@ -875,10 +887,6 @@
if (info != NULL) {
add_debug_info_for_null_check(null_check_here, info);
}
-
- if (patch_code != lir_patch_none) {
- patching_epilog(patch, patch_code, to_addr->base()->as_register(), info);
- }
}
@@ -915,13 +923,31 @@
void LIR_Assembler::klass2reg_with_patching(Register reg, CodeEmitInfo* info) {
- Metadata* o = NULL;
- PatchingStub* patch = new PatchingStub(_masm, PatchingStub::load_klass_id);
- if (DeoptimizeWhenPatching)
- __ nop();
- else
- __ mov_metadata(reg, o);
- patching_epilog(patch, lir_patch_normal, reg, info);
+ address target = NULL;
+ relocInfo::relocType reloc_type = relocInfo::none;
+
+ switch (patching_id(info)) {
+ case PatchingStub::access_field_id:
+ target = Runtime1::entry_for(Runtime1::access_field_patching_id);
+ reloc_type = relocInfo::section_word_type;
+ break;
+ case PatchingStub::load_klass_id:
+ target = Runtime1::entry_for(Runtime1::load_klass_patching_id);
+ reloc_type = relocInfo::metadata_type;
+ break;
+ case PatchingStub::load_mirror_id:
+ target = Runtime1::entry_for(Runtime1::load_mirror_patching_id);
+ reloc_type = relocInfo::oop_type;
+ break;
+ case PatchingStub::load_appendix_id:
+ target = Runtime1::entry_for(Runtime1::load_appendix_patching_id);
+ reloc_type = relocInfo::oop_type;
+ break;
+ default: ShouldNotReachHere();
+ }
+
+ __ bl(RuntimeAddress(target));
+ add_call_info_here(info);
}
void LIR_Assembler::stack2stack(LIR_Opr src, LIR_Opr dest, BasicType type) {
@@ -944,10 +970,9 @@
__ verify_oop(addr->base()->as_pointer_register());
}
- PatchingStub* patch = NULL;
-
if (patch_code != lir_patch_none) {
- patch = new PatchingStub(_masm, PatchingStub::access_field_id);
+ deoptimize_trap(info);
+ return;
}
if (info != NULL) {
@@ -1019,10 +1044,6 @@
ShouldNotReachHere();
}
- if (patch != NULL) {
- patching_epilog(patch, patch_code, addr->base()->as_register(), info);
- }
-
if (type == T_ARRAY || type == T_OBJECT) {
#ifdef _LP64
if (UseCompressedOops && !wide) {
diff -r 632fea5ccac1 -r 11351da11922 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.hpp Tue Jul 15 14:43:14 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.hpp Tue Jul 15 16:20:01 2014 +0100
@@ -64,6 +64,8 @@
void init() { tableswitch_count = 0; }
+ void deoptimize_trap(CodeEmitInfo *info);
+
public:
void store_parameter(Register r, int offset_from_esp_in_words);
diff -r 632fea5ccac1 -r 11351da11922 src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Tue Jul 15 16:20:01 2014 +0100
@@ -1321,19 +1321,6 @@
#undef __
-static Klass* resolve_field_return_klass(methodHandle caller, int bci, TRAPS) {
- Bytecode_field field_access(caller, bci);
- // This can be static or non-static field access
- Bytecodes::Code code = field_access.code();
-
- // We must load class, initialize class and resolvethe field
- fieldDescriptor result; // initialize class if needed
- constantPoolHandle constants(THREAD, caller->constants());
- LinkResolver::resolve_field_access(result, constants, field_access.index(), Bytecodes::java_code(code), CHECK_NULL);
- return result.field_holder();
-}
-
-
// Simple helper to see if the caller of a runtime stub which
// entered the VM has been deoptimized
@@ -1347,260 +1334,41 @@
}
JRT_ENTRY(void, Runtime1::patch_code_aarch64(JavaThread* thread, Runtime1::StubID stub_id ))
+{
+ RegisterMap reg_map(thread, false);
+
NOT_PRODUCT(_patch_code_slowcase_cnt++;)
- ResourceMark rm(thread);
- RegisterMap reg_map(thread, false);
+ // According to the ARMv8 ARM, "Concurrent modification and
+ // execution of instructions can lead to the resulting instruction
+ // performing any behavior that can be achieved by executing any
+ // sequence of instructions that can be executed from the same
+ // Exception level, except where the instruction before
+ // modification and the instruction after modification is a B, BL,
+ // NOP, BKPT, SVC, HVC, or SMC instruction."
+ //
+ // This effectively makes the games we play when patching
+ // impossible, so when we come across an access that needs
+ // patching we must deoptimize.
+
+ if (TracePatching) {
+ tty->print_cr("Deoptimizing because patch is needed");
+ }
+
frame runtime_frame = thread->last_frame();
frame caller_frame = runtime_frame.sender(&reg_map);
- if (DeoptimizeWhenPatching) {
- // According to the ARMv8 ARM, "Concurrent modification and
- // execution of instructions can lead to the resulting instruction
- // performing any behavior that can be achieved by executing any
- // sequence of instructions that can be executed from the same
- // Exception level, except where the instruction before
- // modification and the instruction after modification is a B, BL,
- // NOP, BKPT, SVC, HVC, or SMC instruction."
- //
- // This effectively makes the games we play when patching
- // impossible, so when we come across an access that needs
- // patching we must deoptimize.
-
- if (TracePatching) {
- tty->print_cr("Deoptimizing because patch is needed");
- }
- // It's possible the nmethod was invalidated in the last
- // safepoint, but if it's still alive then make it not_entrant.
- nmethod* nm = CodeCache::find_nmethod(caller_frame.pc());
- if (nm != NULL) {
- nm->make_not_entrant();
- }
-
- Deoptimization::deoptimize_frame(thread, caller_frame.id());
-
- // Return to the now deoptimized frame.
- return;
+ // It's possible the nmethod was invalidated in the last
+ // safepoint, but if it's still alive then make it not_entrant.
+ nmethod* nm = CodeCache::find_nmethod(caller_frame.pc());
+ if (nm != NULL) {
+ nm->make_not_entrant();
}
- // last java frame on stack
- vframeStream vfst(thread, true);
- assert(!vfst.at_end(), "Java frame must exist");
+ Deoptimization::deoptimize_frame(thread, caller_frame.id());
- methodHandle caller_method(THREAD, vfst.method());
- // Note that caller_method->code() may not be same as caller_code because of OSR's
- // Note also that in the presence of inlining it is not guaranteed
- // that caller_method() == caller_code->method()
-
- int bci = vfst.bci();
- Bytecodes::Code code = caller_method()->java_code_at(bci);
-
- bool deoptimize_for_volatile = false;
- int patch_field_offset = -1;
- KlassHandle init_klass(THREAD, NULL); // klass needed by load_klass_patching code
- KlassHandle load_klass(THREAD, NULL); // klass needed by load_klass_patching code
- Handle mirror(THREAD, NULL); // oop needed by load_mirror_patching code
- fieldDescriptor result; // initialize class if needed
-
- bool load_klass_or_mirror_patch_id =
- (stub_id == Runtime1::load_klass_patching_id || stub_id == Runtime1::load_mirror_patching_id);
-
- if (stub_id == Runtime1::access_field_patching_id) {
-
- Bytecode_field field_access(caller_method, bci);
- fieldDescriptor result; // initialize class if needed
- Bytecodes::Code code = field_access.code();
- constantPoolHandle constants(THREAD, caller_method->constants());
- LinkResolver::resolve_field_access(result, constants, field_access.index(), Bytecodes::java_code(code), CHECK);
- patch_field_offset = result.offset();
-
- // If we're patching a field which is volatile then at compile it
- // must not have been known to be volatile, so the generated code
- // isn't correct for a volatile reference. The nmethod has to be
- // deoptimized so that the code can be regenerated correctly.
- // This check is only needed for access_field_patching since this
- // is the path for patching field offsets. load_klass is only
- // used for patching references to oops which don't need special
- // handling in the volatile case.
- deoptimize_for_volatile = result.access_flags().is_volatile();
- } else if (load_klass_or_mirror_patch_id) {
- Klass* k = NULL;
- switch (code) {
- case Bytecodes::_putstatic:
- case Bytecodes::_getstatic:
- { Klass* klass = resolve_field_return_klass(caller_method, bci, CHECK);
- init_klass = KlassHandle(THREAD, klass);
- mirror = Handle(THREAD, klass->java_mirror());
- }
- break;
- case Bytecodes::_new:
- { Bytecode_new bnew(caller_method(), caller_method->bcp_from(bci));
- k = caller_method->constants()->klass_at(bnew.index(), CHECK);
- }
- break;
- case Bytecodes::_multianewarray:
- { Bytecode_multianewarray mna(caller_method(), caller_method->bcp_from(bci));
- k = caller_method->constants()->klass_at(mna.index(), CHECK);
- }
- break;
- case Bytecodes::_instanceof:
- { Bytecode_instanceof io(caller_method(), caller_method->bcp_from(bci));
- k = caller_method->constants()->klass_at(io.index(), CHECK);
- }
- break;
- case Bytecodes::_checkcast:
- { Bytecode_checkcast cc(caller_method(), caller_method->bcp_from(bci));
- k = caller_method->constants()->klass_at(cc.index(), CHECK);
- }
- break;
- case Bytecodes::_anewarray:
- { Bytecode_anewarray anew(caller_method(), caller_method->bcp_from(bci));
- Klass* ek = caller_method->constants()->klass_at(anew.index(), CHECK);
- k = ek->array_klass(CHECK);
- }
- break;
- case Bytecodes::_ldc:
- case Bytecodes::_ldc_w:
- {
- Bytecode_loadconstant cc(caller_method, bci);
- oop m = cc.resolve_constant(CHECK);
- mirror = Handle(THREAD, m);
- }
- break;
- default: Unimplemented();
- }
- // convert to handle
- load_klass = KlassHandle(THREAD, k);
- } else {
- ShouldNotReachHere();
- }
-
- if (deoptimize_for_volatile) {
- // At compile time we assumed the field wasn't volatile but after
- // loading it turns out it was volatile so we have to throw the
- // compiled code out and let it be regenerated.
- if (TracePatching) {
- tty->print_cr("Deoptimizing for patching volatile field reference");
- }
- // It's possible the nmethod was invalidated in the last
- // safepoint, but if it's still alive then make it not_entrant.
- nmethod* nm = CodeCache::find_nmethod(caller_frame.pc());
- if (nm != NULL) {
- nm->make_not_entrant();
- }
-
- Deoptimization::deoptimize_frame(thread, caller_frame.id());
-
- // Return to the now deoptimized frame.
- }
-
- // If we are patching in a non-perm oop, make sure the nmethod
- // is on the right list.
- if (ScavengeRootsInCode && mirror.not_null() && mirror()->is_scavengable()) {
- MutexLockerEx ml_code (CodeCache_lock, Mutex::_no_safepoint_check_flag);
- nmethod* nm = CodeCache::find_nmethod(caller_frame.pc());
- guarantee(nm != NULL, "only nmethods can contain non-perm oops");
- if (!nm->on_scavenge_root_list())
- CodeCache::add_scavenge_root_nmethod(nm);
- }
-
- // Now copy code back
- {
- MutexLockerEx ml_patch (Patching_lock, Mutex::_no_safepoint_check_flag);
- //
- // Deoptimization may have happened while we waited for the lock.
- // In that case we don't bother to do any patching we just return
- // and let the deopt happen
- if (!caller_is_deopted()) {
- NativeGeneralJump* jump = nativeGeneralJump_at(caller_frame.pc());
- address instr_pc = jump->jump_destination();
- NativeInstruction* ni = nativeInstruction_at(instr_pc);
- if (ni->is_jump() ) {
- // the jump has not been patched yet
- address stub_location = caller_frame.pc() + PatchingStub::patch_info_offset();
- unsigned char* byte_count = (unsigned char*) (stub_location - 1);
- unsigned char* byte_skip = (unsigned char*) (stub_location - 2);
- unsigned char* being_initialized_entry_offset = (unsigned char*) (stub_location - 3);
- address copy_buff = stub_location - *byte_skip - *byte_count;
- address being_initialized_entry = stub_location - *being_initialized_entry_offset;
- if (TracePatching) {
- tty->print_cr(" Patching %s at bci %d at address 0x%x (%s)", Bytecodes::name(code), bci,
- instr_pc, (stub_id == Runtime1::access_field_patching_id) ? "field" : "klass");
- nmethod* caller_code = CodeCache::find_nmethod(caller_frame.pc());
- assert(caller_code != NULL, "nmethod not found");
-
- // NOTE we use pc() not original_pc() because we already know they are
- // identical otherwise we'd have never entered this block of code
- OopMap* map = caller_code->oop_map_for_return_address(caller_frame.pc());
- assert(map != NULL, "null check");
- map->print();
- tty->cr();
-
- Disassembler::decode(copy_buff, copy_buff + *byte_count, tty);
- }
-
- // The word in the constant pool needs fixing.
- unsigned insn = *(unsigned*)copy_buff;
- unsigned long *cpool_addr
- = (unsigned long *)MacroAssembler::target_addr_for_insn(instr_pc, insn);
-
- nmethod* nm = CodeCache::find_nmethod(caller_frame.pc());
- CodeBlob *cb = CodeCache::find_blob(caller_frame.pc());
- assert(nm != NULL, "invalid nmethod_pc");
- assert(address(cpool_addr) >= nm->consts_begin()
- && address(cpool_addr) < nm->consts_end(),
- "constant address should be inside constant pool");
-
- switch(stub_id) {
- case access_field_patching_id:
- *cpool_addr = patch_field_offset; break;
- case load_mirror_patching_id:
- *cpool_addr = cast_from_oop<uint64_t>(mirror()); break;
- case load_klass_patching_id:
- *cpool_addr = (uint64_t)load_klass(); break;
- default:
- ShouldNotReachHere();
- }
-
- // Update the location in the nmethod with the proper
- // metadata. When the code was generated, a NULL was stuffed
- // in the metadata table and that table needs to be update to
- // have the right value. On intel the value is kept
- // directly in the instruction instead of in the metadata
- // table, so set_data above effectively updated the value.
- //
- // FIXME: It's tempting to think that rather them putting OOPs
- // in the cpool we could refer directly to the locations in the
- // nmethod. However, we can't guarantee that an ADRP would be
- // able to reach them: an ADRP can only reach within +- 4GiB of
- // the PC using two instructions. While it's pretty unlikely
- // that we will exceed this limit, it's not impossible.
- RelocIterator mds(nm, (address)cpool_addr, (address)cpool_addr + 1);
- bool found = false;
- while (mds.next() && !found) {
- if (mds.type() == relocInfo::oop_type) {
- assert(stub_id == Runtime1::load_mirror_patching_id, "wrong stub id");
- oop_Relocation* r = mds.oop_reloc();
- oop* oop_adr = r->oop_addr();
- *oop_adr = mirror();
- r->fix_oop_relocation();
- found = true;
- } else if (mds.type() == relocInfo::metadata_type) {
- assert(stub_id == Runtime1::load_klass_patching_id, "wrong stub id");
- metadata_Relocation* r = mds.metadata_reloc();
- Metadata** metadata_adr = r->metadata_addr();
- *metadata_adr = load_klass();
- r->fix_metadata_relocation();
- found = true;
- }
- }
-
- // And we overwrite the jump
- NativeGeneralJump::replace_mt_safe(instr_pc, copy_buff);
-
- }
- }
- }
+ // Return to the now deoptimized frame.
+}
JRT_END
diff -r 632fea5ccac1 -r 11351da11922 src/cpu/aarch64/vm/globals_aarch64.hpp
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp Tue Jul 15 14:43:14 2014 +0100
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp Tue Jul 15 16:20:01 2014 +0100
@@ -93,9 +93,6 @@
product(bool, NearCpool, true, \
"constant pool is close to instructions") \
\
- product(bool, DeoptimizeWhenPatching, true, \
- "doptimize instead of patching instructions") \
- \
notproduct(bool, UseAcqRelForVolatileFields, false, \
"Use acquire and release insns for volatile fields")
@@ -113,9 +110,6 @@
product(bool, NearCpool, true, \
"constant pool is close to instructions") \
\
- product(bool, DeoptimizeWhenPatching, true, \
- "doptimize instead of patching instructions") \
- \
notproduct(bool, UseAcqRelForVolatileFields, false, \
"Use acquire and release insns for volatile fields") \
product(bool, UseNeon, false, \
diff -r 632fea5ccac1 -r 11351da11922 src/cpu/aarch64/vm/nativeInst_aarch64.cpp
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.cpp Tue Jul 15 16:20:01 2014 +0100
@@ -242,8 +242,7 @@
// MT-safe patching of a long jump instruction.
void NativeGeneralJump::replace_mt_safe(address instr_addr, address code_buffer) {
- assert((! DeoptimizeWhenPatching)
- || nativeInstruction_at(instr_addr)->is_jump_or_nop(),
+ assert(nativeInstruction_at(instr_addr)->is_jump_or_nop(),
"Aarch64 cannot replace non-jump with jump");
uint32_t instr = *(uint32_t*)code_buffer;
*(uint32_t*)instr_addr = instr;
diff -r 632fea5ccac1 -r 11351da11922 src/cpu/aarch64/vm/relocInfo_aarch64.cpp
--- a/src/cpu/aarch64/vm/relocInfo_aarch64.cpp Tue Jul 15 14:43:14 2014 +0100
+++ b/src/cpu/aarch64/vm/relocInfo_aarch64.cpp Tue Jul 15 16:20:01 2014 +0100
@@ -90,78 +90,3 @@
void metadata_Relocation::pd_fix_value(address x) {
}
-
-// We have a relocation that points to a pair of instructions that
-// load a constant from the constant pool. These are
-// ARDP; LDR reg [reg, #ofs]. However, until the constant is resolved
-// the first instruction may be a branch to a resolver stub, and the
-// resolver stub contains a copy of the ADRP that will replace the
-// branch instruction.
-//
-// So, when we relocate this code we have to adjust the offset in the
-// LDR instruction and the page offset in the copy of the ADRP
-// instruction that will overwrite the branch instruction. This is
-// done by Runtime1::patch_code_aarch64.
-
-void section_word_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) {
- unsigned insn1 = *(unsigned*)addr();
- if (! (Instruction_aarch64::extract(insn1, 30, 26) == 0b00101)) {
- // Unconditional branch (immediate)
- internal_word_Relocation::fix_relocation_after_move(src, dest);
- return;
- }
-
- address new_address = target();
-#ifdef ASSERT
- // Make sure this really is a cpool address
- address old_cpool_start = const_cast<CodeBuffer*>(src)->consts()->start();
- address old_cpool_end = const_cast<CodeBuffer*>(src)->consts()->end();
- address new_cpool_start = const_cast<CodeBuffer*>(dest)->consts()->start();
- address new_cpool_end = const_cast<CodeBuffer*>(dest)->consts()->end();
- address old_address = old_addr_for(target(), src, dest);
- assert(new_address >= new_cpool_start
- && new_address < new_cpool_end,
- "should be");
- assert(old_address >= old_cpool_start
- && old_address < old_cpool_end,
- "should be");
-#endif
-
- address stub_location = pd_call_destination(addr());
- unsigned char* byte_count = (unsigned char*) (stub_location - 1);
- unsigned char* byte_skip = (unsigned char*) (stub_location - 2);
- address copy_buff = stub_location - *byte_skip - *byte_count;
- unsigned insn3 = *(unsigned*)copy_buff;
-
- if (NearCpool) {
- int offset = new_address - addr();
- Instruction_aarch64::spatch(copy_buff, 23, 5, offset >> 2);
- } else {
- // Unconditional branch (immediate)
- unsigned insn2 = ((unsigned*)addr())[1];
- if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001) {
- // Load/store register (unsigned immediate)
- unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
-
- // Offset of address in a 4k page
- uint64_t new_offset = (uint64_t)target() & ((1<<12) - 1);
- // Fix the LDR instruction's offset
- Instruction_aarch64::patch(addr() + sizeof (unsigned),
- 21, 10, new_offset >> size);
-
- assert(Instruction_aarch64::extract(insn3, 28, 24) == 0b10000
- && Instruction_aarch64::extract(insn3, 31, 31),
- "instruction should be an ADRP");
-
- uint64_t insn_page = (uint64_t)addr() >> 12;
- uint64_t target_page = (uint64_t)target() >> 12;
- int page_offset = target_page - insn_page;
- int page_offset_lo = page_offset & 3;
- page_offset >>= 2;
- Instruction_aarch64::spatch(copy_buff, 23, 5, page_offset);
- Instruction_aarch64::patch(copy_buff, 30, 29, page_offset_lo);
-
- // Phew.
- }
- }
-}
diff -r 632fea5ccac1 -r 11351da11922 src/share/vm/code/relocInfo.hpp
--- a/src/share/vm/code/relocInfo.hpp Tue Jul 15 14:43:14 2014 +0100
+++ b/src/share/vm/code/relocInfo.hpp Tue Jul 15 16:20:01 2014 +0100
@@ -1269,10 +1269,6 @@
//void pack_data_to -- inherited
void unpack_data();
-#ifdef TARGET_ARCH_aarch64
- void fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest);
-#endif
-
private:
friend class RelocIterator;
section_word_Relocation() { }
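
The upshot of this patch is that the only in-place patching left on AArch64
is replacing one branch or nop word with another, which is what
NativeGeneralJump::replace_mt_safe() above still does; everything the old
C1 patching stubs handled (field offsets, klasses, mirrors) now goes
through deoptimize_trap() and a recompile. A rough sketch of that remaining
idiom, assuming an executable buffer is already mapped (not the actual
HotSpot routine):

#include <cstdint>

void replace_branch_word(uint32_t* instr_addr, uint32_t new_branch_insn) {
  // One aligned 32-bit store; the real code follows it with
  // ICache::invalidate_range(). The ARMv8 rule quoted in the patch only
  // allows this when both old and new words are B/BL/NOP-class instructions.
  *instr_addr = new_branch_insn;
}
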
-------------- next part --------------
# HG changeset patch
# User adinn
# Date 1405439279 -3600
# Node ID 3c01fe371d7cadcb90c1ede5598a0e320f4a4058
# Parent 11351da11922e5298def06ef484fd977bc6b3970
Improve C1 performance in ic_cache checks
diff -r 11351da11922 -r 3c01fe371d7c src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Tue Jul 15 16:20:01 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Tue Jul 15 16:47:59 2014 +0100
@@ -290,23 +290,25 @@
int LIR_Assembler::check_icache() {
Register receiver = FrameMap::receiver_opr->as_register();
Register ic_klass = IC_Klass;
- const int ic_cmp_size = 4 * 4;
- const bool do_post_padding = VerifyOops || UseCompressedClassPointers;
- if (!do_post_padding) {
- // insert some nops so that the verified entry point is aligned on CodeEntryAlignment
- while ((__ offset() + ic_cmp_size) % CodeEntryAlignment != 0) {
- __ nop();
- }
- }
- int offset = __ offset();
- __ inline_cache_check(receiver, IC_Klass);
- assert(__ offset() % CodeEntryAlignment == 0 || do_post_padding, "alignment must be correct");
- if (do_post_padding) {
+ int start_offset = __ offset();
+ __ inline_cache_check(receiver, ic_klass);
+
+ // if icache check fails, then jump to runtime routine
+ // Note: RECEIVER must still contain the receiver!
+ Label dont;
+ __ br(Assembler::EQ, dont);
+ __ b(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
+
+ // We align the verified entry point unless the method body
+ // (including its inline cache check) will fit in a single 64-byte
+ // icache line.
+ if (! method()->is_accessor() || __ offset() - start_offset > 4 * 4) {
// force alignment after the cache check.
- // It's been verified to be aligned if !VerifyOops
__ align(CodeEntryAlignment);
}
- return offset;
+
+ __ bind(dont);
+ return start_offset;
}
diff -r 11351da11922 -r 3c01fe371d7c src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp Tue Jul 15 16:20:01 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp Tue Jul 15 16:47:59 2014 +0100
@@ -404,19 +404,7 @@
// explicit NULL check not needed since load from [klass_offset] causes a trap
// check against inline cache
assert(!MacroAssembler::needs_explicit_null_check(oopDesc::klass_offset_in_bytes()), "must add explicit null check");
- int start_offset = offset();
-
- load_klass(rscratch1, receiver);
- cmp(rscratch1, iCache);
-
- // if icache check fails, then jump to runtime routine
- // Note: RECEIVER must still contain the receiver!
- Label dont;
- br(Assembler::EQ, dont);
- b(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
- bind(dont);
- const int ic_cmp_size = 4 * 4;
- assert(UseCompressedClassPointers || offset() - start_offset == ic_cmp_size, "check alignment in emit_method_entry");
+ cmp_klass(receiver, iCache, rscratch1);
}
diff -r 11351da11922 -r 3c01fe371d7c src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.hpp Tue Jul 15 16:20:01 2014 +0100
+++ b/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.hpp Tue Jul 15 16:47:59 2014 +0100
@@ -102,20 +102,6 @@
int rsp_offset() const { return _rsp_offset; }
void set_rsp_offset(int n) { _rsp_offset = n; }
- // Note: NEVER push values directly, but only through following push_xxx functions;
- // This helps us to track the rsp changes compared to the entry rsp (->_rsp_offset)
-
- void push_jint (jint i) { Unimplemented(); }
- void push_oop (jobject o) { Unimplemented(); }
- // Seems to always be in wordSize
- void push_addr (Address a) { Unimplemented(); }
- void push_reg (Register r) { Unimplemented(); }
- void pop_reg (Register r) { Unimplemented(); }
-
- void dec_stack (int nof_words) { Unimplemented(); }
-
- void dec_stack_after_call (int nof_words) { Unimplemented(); }
-
void invalidate_registers(bool inv_r0, bool inv_r19, bool inv_r2, bool inv_r3, bool inv_r4, bool inv_r5) PRODUCT_RETURN;
#endif // CPU_AARCH64_VM_C1_MACROASSEMBLER_AARCH64_HPP
diff -r 11351da11922 -r 3c01fe371d7c src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Jul 15 16:20:01 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Jul 15 16:47:59 2014 +0100
@@ -1842,8 +1842,12 @@
void MacroAssembler::reinit_heapbase()
{
if (UseCompressedOops) {
- lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
- ldr(rheapbase, Address(rheapbase));
+ if (Universe::is_fully_initialized()) {
+ mov(rheapbase, Universe::narrow_ptrs_base());
+ } else {
+ lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
+ ldr(rheapbase, Address(rheapbase));
+ }
}
}
diff -r 11351da11922 -r 3c01fe371d7c src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Tue Jul 15 16:20:01 2014 +0100
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Tue Jul 15 16:47:59 2014 +0100
@@ -1527,8 +1527,7 @@
assert_different_registers(ic_reg, receiver, rscratch1);
__ verify_oop(receiver);
- __ load_klass(rscratch1, receiver);
- __ cmp(ic_reg, rscratch1);
+ __ cmp_klass(receiver, ic_reg, rscratch1);
__ br(Assembler::EQ, hit);
__ b(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
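
On the check_icache()/inline_cache_check() rework above: the verified entry
sequence now compares the receiver's (possibly compressed) klass with the
klass cached at the call site via cmp_klass(), branches to the IC miss stub
on a mismatch, and only skips the CodeEntryAlignment padding when the whole
entry fits in one 64-byte icache line (trivial accessors). A conceptual
sketch of the check with deliberately simplified stand-in types, not the
real Oop/Klass layout:

#include <cstdio>

struct Klass {};
struct Oop { const Klass* klass; };            // simplified object header

void ic_miss_stub() { puts("IC miss: resolve callee and repatch call site"); }

void verified_entry(const Oop* receiver, const Klass* cached_klass) {
  if (receiver->klass != cached_klass) {       // mirrors the cmp_klass + branch pair
    ic_miss_stub();
    return;
  }
  puts("IC hit: fall through into the method body");
}

int main() {
  Klass k;
  Oop o{&k};
  verified_entry(&o, &k);                      // takes the hit path
  return 0;
}
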
-------------- next part --------------
# HG changeset patch
# User adinn
# Date 1405443836 -3600
# Node ID 845014e20c17f0f614081fcbff5c1cc0d77ec665
# Parent 3c01fe371d7cadcb90c1ede5598a0e320f4a4058
Fast string comparison
diff -r 3c01fe371d7c -r 845014e20c17 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Tue Jul 15 16:47:59 2014 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad Tue Jul 15 18:03:56 2014 +0100
@@ -375,6 +375,15 @@
R30
);
+// Singleton class for R0 int register
+reg_class int_r0_reg(R0);
+
+// Singleton class for R2 int register
+reg_class int_r2_reg(R2);
+
+// Singleton class for R4 int register
+reg_class int_r4_reg(R4);
+
// Class for all long integer registers (including RSP)
reg_class any_reg(
R0, R0_H,
@@ -482,11 +491,21 @@
R0, R0_H
);
+// Class for 64 bit register r1
+reg_class r1_reg(
+ R1, R1_H
+);
+
// Class for 64 bit register r2
reg_class r2_reg(
R2, R2_H
);
+// Class for 64 bit register r3
+reg_class r3_reg(
+ R3, R3_H
+);
+
// Class for 64 bit register r4
reg_class r4_reg(
R4, R4_H
@@ -3916,6 +3935,18 @@
interface(REG_INTER);
%}
+// Pointer 64 bit Register R1 only
+operand iRegP_R1()
+%{
+ constraint(ALLOC_IN_RC(r1_reg));
+ match(RegP);
+ // match(iRegP);
+ match(iRegPNoSp);
+ op_cost(0);
+ format %{ %}
+ interface(REG_INTER);
+%}
+
// Pointer 64 bit Register R2 only
operand iRegP_R2()
%{
@@ -3928,6 +3959,18 @@
interface(REG_INTER);
%}
+// Pointer 64 bit Register R3 only
+operand iRegP_R3()
+%{
+ constraint(ALLOC_IN_RC(r3_reg));
+ match(RegP);
+ // match(iRegP);
+ match(iRegPNoSp);
+ op_cost(0);
+ format %{ %}
+ interface(REG_INTER);
+%}
+
// Pointer 64 bit Register R4 only
operand iRegP_R4()
%{
@@ -3989,7 +4032,29 @@
// Register R0 only
operand iRegI_R0()
%{
- constraint(ALLOC_IN_RC(r0_reg));
+ constraint(ALLOC_IN_RC(int_r0_reg));
+ match(RegI);
+ match(iRegINoSp);
+ op_cost(0);
+ format %{ %}
+ interface(REG_INTER);
+%}
+
+// Register R2 only
+operand iRegI_R2()
+%{
+ constraint(ALLOC_IN_RC(int_r2_reg));
+ match(RegI);
+ match(iRegINoSp);
+ op_cost(0);
+ format %{ %}
+ interface(REG_INTER);
+%}
+
+// Register R4 only
+operand iRegI_R4()
+%{
+ constraint(ALLOC_IN_RC(int_r4_reg));
match(RegI);
match(iRegINoSp);
op_cost(0);
@@ -11211,6 +11276,21 @@
ins_pipe(pipe_class_memory);
%}
+instruct string_compare(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 cnt2,
+ iRegI_R0 result, iRegP_R10 tmp1, rFlagsReg cr)
+%{
+ match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
+ effect(KILL tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
+
+ format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result # KILL $tmp1" %}
+ ins_encode %{
+ __ string_compare($str1$$Register, $str2$$Register,
+ $cnt1$$Register, $cnt2$$Register, $result$$Register,
+ $tmp1$$Register);
+ %}
+ ins_pipe(pipe_class_memory);
+%}
+
// ============================================================================
// This name is KNOWN by the ADLC and cannot be changed.
// The ADLC forces a 'TypeRawPtr::BOTTOM' output type
diff -r 3c01fe371d7c -r 845014e20c17 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Jul 15 16:47:59 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Jul 15 18:03:56 2014 +0100
@@ -3354,3 +3354,87 @@
}
}
+// Compare strings.
+void MacroAssembler::string_compare(Register str1, Register str2,
+ Register cnt1, Register cnt2, Register result,
+ Register tmp1) {
+ Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
+ NEXT_WORD, DIFFERENCE;
+
+ BLOCK_COMMENT("string_compare {");
+
+ // Compute the minimum of the string lengths and save the difference.
+ subsw(tmp1, cnt1, cnt2);
+ cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
+
+ // A very short string
+ cmpw(cnt2, 4);
+ br(Assembler::LT, SHORT_STRING);
+
+ // Check if the strings start at the same location.
+ cmp(str1, str2);
+ br(Assembler::EQ, LENGTH_DIFF);
+
+ // Compare longwords
+ {
+ subw(cnt2, cnt2, 4); // The last longword is a special case
+
+ // Move both string pointers to the last longword of their
+ // strings, negate the remaining count, and convert it to bytes.
+ lea(str1, Address(str1, cnt2, Address::uxtw(1)));
+ lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+ sub(cnt2, zr, cnt2, LSL, 1);
+
+ // Loop, loading longwords and comparing them into rscratch2.
+ bind(NEXT_WORD);
+ ldr(result, Address(str1, cnt2));
+ ldr(cnt1, Address(str2, cnt2));
+ adds(cnt2, cnt2, wordSize);
+ eor(rscratch2, result, cnt1);
+ cbnz(rscratch2, DIFFERENCE);
+ br(Assembler::LT, NEXT_WORD);
+
+ // Last longword. In the case where length == 4 we compare the
+ // same longword twice, but that's still faster than another
+ // conditional branch.
+
+ ldr(result, Address(str1));
+ ldr(cnt1, Address(str2));
+ eor(rscratch2, result, cnt1);
+ cbz(rscratch2, LENGTH_DIFF);
+
+ // Find the first different characters in the longwords and
+ // compute their difference.
+ bind(DIFFERENCE);
+ rev(rscratch2, rscratch2);
+ clz(rscratch2, rscratch2);
+ andr(rscratch2, rscratch2, -16);
+ lsrv(result, result, rscratch2);
+ uxthw(result, result);
+ lsrv(cnt1, cnt1, rscratch2);
+ uxthw(cnt1, cnt1);
+ subw(result, result, cnt1);
+ b(DONE);
+ }
+
+ bind(SHORT_STRING);
+ // Is the minimum length zero?
+ cbz(cnt2, LENGTH_DIFF);
+
+ bind(SHORT_LOOP);
+ load_unsigned_short(result, Address(post(str1, 2)));
+ load_unsigned_short(cnt1, Address(post(str2, 2)));
+ subw(result, result, cnt1);
+ cbnz(result, DONE);
+ sub(cnt2, cnt2, 1);
+ cbnz(cnt2, SHORT_LOOP);
+
+ // Strings are equal up to min length. Return the length difference.
+ bind(LENGTH_DIFF);
+ mov(result, tmp1);
+
+ // That's it
+ bind(DONE);
+
+ BLOCK_COMMENT("} string_compare");
+}
diff -r 3c01fe371d7c -r 845014e20c17 src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Jul 15 16:47:59 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Jul 15 18:03:56 2014 +0100
@@ -1281,6 +1281,10 @@
void update_word_crc32(Register crc, Register v, Register tmp,
Register table0, Register table1, Register table2, Register table3,
bool upper = false);
+
+ void string_compare(Register str1, Register str2,
+ Register cnt1, Register cnt2, Register result,
+ Register tmp1);
};
// Used by aarch64.ad to control code generation
-------------- next part --------------
# HG changeset patch
# User aph
# Date 1404389999 -3600
# Node ID 5e653c9bf2aa9baa4ed326c9be7c4233462144ea
# Parent 1d342713037a081e2ca5f4f5093041d2c00018b0
Fast String.equals()
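Reviewer note: this follows the same shape as string_compare above, with the StrEquals operands pinned to r1/r3/r4 and the result in r0, comparing four chars per 64-bit load and a per-char loop for short strings. A rough C++ sketch of the predicate the stub computes; the names are illustrative only, not HotSpot code.

#include <cstdint>
#include <cstring>

bool string_equals_sketch(const uint16_t* str1, const uint16_t* str2, int cnt) {
  if (str1 == str2) return true;          // identical char arrays
  int i = 0;
  for (; i + 4 <= cnt; i += 4) {          // compare 4 UTF-16 chars per 64-bit load
    uint64_t a, b;
    std::memcpy(&a, str1 + i, 8);
    std::memcpy(&b, str2 + i, 8);
    if (a != b) return false;
  }
  for (; i < cnt; i++) {                  // short strings and the tail
    if (str1[i] != str2[i]) return false;
  }
  return true;
}
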
diff -r 1d342713037a -r 5e653c9bf2aa src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Fri Jun 27 11:25:47 2014 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad Thu Jul 03 13:19:59 2014 +0100
@@ -11348,6 +11348,21 @@
ins_pipe(pipe_class_memory);
%}
+instruct string_equals(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
+ iRegI_R0 result, iRegP_R10 tmp, rFlagsReg cr)
+%{
+ match(Set result (StrEquals (Binary str1 str2) cnt));
+ effect(KILL tmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL cr);
+
+ format %{ "String Equals $str1,$str2,$cnt -> $result // KILL $tmp" %}
+ ins_encode %{
+ __ string_equals($str1$$Register, $str2$$Register,
+ $cnt$$Register, $result$$Register,
+ $tmp$$Register);
+ %}
+ ins_pipe(pipe_class_memory);
+%}
+
// ============================================================================
// This name is KNOWN by the ADLC and cannot be changed.
// The ADLC forces a 'TypeRawPtr::BOTTOM' output type
diff -r 1d342713037a -r 5e653c9bf2aa src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Fri Jun 27 11:25:47 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Jul 03 13:19:59 2014 +0100
@@ -3436,3 +3436,78 @@
BLOCK_COMMENT("} string_compare");
}
+
+
+void MacroAssembler::string_equals(Register str1, Register str2,
+ Register cnt, Register result,
+ Register tmp1) {
+ Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
+ NEXT_WORD;
+
+ const Register tmp2 = rscratch1;
+ assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);
+
+ BLOCK_COMMENT("string_equals {");
+
+ // Start by assuming that the strings are not equal.
+ mov(result, zr);
+
+ // A very short string
+ cmpw(cnt, 4);
+ br(Assembler::LT, SHORT_STRING);
+
+ // Check if the strings start at the same location.
+ cmp(str1, str2);
+ br(Assembler::EQ, SAME_CHARS);
+
+ // Compare longwords
+ {
+ subw(cnt, cnt, 4); // The last longword is a special case
+
+ // Move both string pointers to the last longword of their
+ // strings, negate the remaining count, and convert it to bytes.
+ lea(str1, Address(str1, cnt, Address::uxtw(1)));
+ lea(str2, Address(str2, cnt, Address::uxtw(1)));
+ sub(cnt, zr, cnt, LSL, 1);
+
+ // Loop, loading longwords and comparing them into rscratch2.
+ bind(NEXT_WORD);
+ ldr(tmp1, Address(str1, cnt));
+ ldr(tmp2, Address(str2, cnt));
+ adds(cnt, cnt, wordSize);
+ eor(rscratch2, tmp1, tmp2);
+ cbnz(rscratch2, DONE);
+ br(Assembler::LT, NEXT_WORD);
+
+ // Last longword. In the case where length == 4 we compare the
+ // same longword twice, but that's still faster than another
+ // conditional branch.
+
+ ldr(tmp1, Address(str1));
+ ldr(tmp2, Address(str2));
+ eor(rscratch2, tmp1, tmp2);
+ cbz(rscratch2, SAME_CHARS);
+ b(DONE);
+ }
+
+ bind(SHORT_STRING);
+ // Is the length zero?
+ cbz(cnt, SAME_CHARS);
+
+ bind(SHORT_LOOP);
+ load_unsigned_short(tmp1, Address(post(str1, 2)));
+ load_unsigned_short(tmp2, Address(post(str2, 2)));
+ subw(tmp1, tmp1, tmp2);
+ cbnz(tmp1, DONE);
+ sub(cnt, cnt, 1);
+ cbnz(cnt, SHORT_LOOP);
+
+ // Strings are equal.
+ bind(SAME_CHARS);
+ mov(result, true);
+
+ // That's it
+ bind(DONE);
+
+ BLOCK_COMMENT("} string_equals");
+}
diff -r 1d342713037a -r 5e653c9bf2aa src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Fri Jun 27 11:25:47 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Jul 03 13:19:59 2014 +0100
@@ -1292,8 +1292,11 @@
bool upper = false);
void string_compare(Register str1, Register str2,
- Register cnt1, Register cnt2, Register result,
- Register tmp1);
+ Register cnt1, Register cnt2, Register result,
+ Register tmp1);
+ void string_equals(Register str1, Register str2,
+ Register cnt, Register result,
+ Register tmp1);
};
// Used by aarch64.ad to control code generation
-------------- next part --------------
# HG changeset patch
# User Edward Nevill edward.nevill at linaro.org
# Date 1405935959 -3600
# Node ID 8d8a08c32db742701ac1ab50b59709893608e406
# Parent 2e9160c1f26da5345f288120da1b6b0842125144
Add support for a few simple intrinsics
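Reviewer note: these rules let C2 intrinsify Integer/Long.numberOfLeadingZeros and numberOfTrailingZeros into clzw/clz and rbitw+clzw / rbit+clz, and Math.sqrt (plus the float sqrt computed via a double round trip) into fsqrtd/fsqrts, rather than leaving the pure Java implementations in place. As a rough illustration of the trailing-zeros trick, a hedged C++ sketch; the helper names are illustrative, not HotSpot code.

#include <cstdint>

// What rbitw does in a single instruction: reverse the bit order of a 32-bit word.
static uint32_t bit_reverse32(uint32_t x) {
  uint32_t r = 0;
  for (int i = 0; i < 32; i++) { r = (r << 1) | (x & 1); x >>= 1; }
  return r;
}

// countTrailingZerosI emits "rbitw dst, src ; clzw dst, dst".
int ctz32_sketch(uint32_t x) {
  if (x == 0) return 32;   // AArch64 clz of 0 is 32; __builtin_clz(0) is undefined in C++
  return __builtin_clz(bit_reverse32(x));
}
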
diff -r 2e9160c1f26d -r 8d8a08c32db7 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Thu Jul 03 13:19:59 2014 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad Mon Jul 21 10:45:59 2014 +0100
@@ -5938,6 +5938,61 @@
%}
// ============================================================================
+// Zero Count Instructions
+
+instruct countLeadingZerosI(iRegI dst, iRegI src) %{
+ match(Set dst (CountLeadingZerosI src));
+
+ ins_cost(INSN_COST);
+ format %{ "clzw $dst, $src" %}
+ ins_encode %{
+ __ clzw(as_Register($dst$$reg), as_Register($src$$reg));
+ %}
+
+ ins_pipe( pipe_class_default );
+%}
+
+instruct countLeadingZerosL(iRegI dst, iRegL src) %{
+ match(Set dst (CountLeadingZerosL src));
+
+ ins_cost(INSN_COST);
+ format %{ "clz $dst, $src" %}
+ ins_encode %{
+ __ clz(as_Register($dst$$reg), as_Register($src$$reg));
+ %}
+
+ ins_pipe( pipe_class_default );
+%}
+
+instruct countTrailingZerosI(iRegI dst, iRegI src) %{
+ match(Set dst (CountTrailingZerosI src));
+
+ ins_cost(INSN_COST * 2);
+ format %{ "rbitw $dst, $src\n\t"
+ "clzw $dst, $dst" %}
+ ins_encode %{
+ __ rbitw(as_Register($dst$$reg), as_Register($src$$reg));
+ __ clzw(as_Register($dst$$reg), as_Register($dst$$reg));
+ %}
+
+ ins_pipe( pipe_class_default );
+%}
+
+instruct countTrailingZerosL(iRegI dst, iRegL src) %{
+ match(Set dst (CountTrailingZerosL src));
+
+ ins_cost(INSN_COST * 2);
+ format %{ "rbit $dst, $src\n\t"
+ "clz $dst, $dst" %}
+ ins_encode %{
+ __ rbit(as_Register($dst$$reg), as_Register($src$$reg));
+ __ clz(as_Register($dst$$reg), as_Register($dst$$reg));
+ %}
+
+ ins_pipe( pipe_class_default );
+%}
+
+// ============================================================================
// MemBar Instruction
instruct membar_acquire()
@@ -9693,6 +9748,32 @@
ins_pipe(pipe_class_default);
%}
+instruct sqrtD_reg(vRegD dst, vRegD src) %{
+ match(Set dst (SqrtD src));
+
+ ins_cost(INSN_COST * 50);
+ format %{ "fsqrtd $dst, $src" %}
+ ins_encode %{
+ __ fsqrtd(as_FloatRegister($dst$$reg),
+ as_FloatRegister($src$$reg));
+ %}
+
+ ins_pipe(pipe_class_default);
+%}
+
+instruct sqrtF_reg(vRegF dst, vRegF src) %{
+ match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
+
+ ins_cost(INSN_COST * 50);
+ format %{ "fsqrts $dst, $src" %}
+ ins_encode %{
+ __ fsqrts(as_FloatRegister($dst$$reg),
+ as_FloatRegister($src$$reg));
+ %}
+
+ ins_pipe(pipe_class_default);
+%}
+
// ============================================================================
// Logical Instructions
-------------- next part --------------
# HG changeset patch
# User aph
# Date 1404822591 14400
# Node ID 5ed1bb528b990f293f6abbef834f7c4bf0dea406
# Parent 2a489b2bb083062d3356ee6c470aaf4d2d0a481d
AArch64 C2 instruct for smull
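Reviewer note: this matches the ideal subtree MulL(ConvI2L src1, ConvI2L src2), i.e. the common Java idiom of widening two ints before a long multiply, and emits a single smull instead of two sign extensions followed by a 64-bit mul. A minimal C++ illustration of the pattern being reduced; the function name is a hypothetical example, not HotSpot code.

#include <cstdint>

int64_t widening_mul(int32_t a, int32_t b) {
  // In Java: (long) a * (long) b, i.e. MulL(ConvI2L a, ConvI2L b);
  // with this rule C2 emits one smull for the whole expression.
  return (int64_t)a * (int64_t)b;
}
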
diff -r 2a489b2bb083 -r 5ed1bb528b99 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Tue Jul 08 05:25:15 2014 -0400
+++ b/src/cpu/aarch64/vm/aarch64.ad Tue Jul 08 08:29:51 2014 -0400
@@ -7471,6 +7471,21 @@
ins_pipe(pipe_class_default);
%}
+instruct smulI(iRegLNoSp dst, iRegIorL2I src1, iRegIorL2I src2) %{
+ match(Set dst (MulL (ConvI2L src1) (ConvI2L src2)));
+
+ ins_cost(INSN_COST * 3);
+ format %{ "smull $dst, $src1, $src2" %}
+
+ ins_encode %{
+ __ smull(as_Register($dst$$reg),
+ as_Register($src1$$reg),
+ as_Register($src2$$reg));
+ %}
+
+ ins_pipe(pipe_class_default);
+%}
+
// Long Multiply
instruct mulL(iRegLNoSp dst, iRegL src1, iRegL src2) %{