[aarch64-port-dev ] System.arraycopy() intrinsics

Andrew Haley aph at redhat.com
Thu Sep 19 09:01:31 PDT 2013


We've been missing support for System.arraycopy intrinsics, so the
port has been falling back to native runtime calls.  This is a fairly
complete set: the only thing missing is checkcast_copy.  I have
written it, but it isn't fully tested yet, so it's not included here.

I haven't included any of the intrinsics that are only used by C2
because I don't have any way to test them right now.
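
The core of the patch is copy_memory(), which emits the actual copy
loops: a short head loop to align the source address, a bulk loop over
64-bit words, and a tail loop for the remainder; backwards copies do
the same thing starting from the end of the arrays.  Very roughly, and
only as a sketch in plain C++ with made-up names rather than the code
the stub actually emits, a forward byte copy amounts to:

#include <cstddef>
#include <cstdint>
#include <cstring>

// A rough picture of a generated forward byte-copy stub: align the
// source, copy whole 64-bit words, then the tail.  (Invented names;
// the real bulk loop is copy_longs, which moves four words per
// iteration with ldp/stp and prefetches ahead on forward copies.)
static void copy_forward_sketch(const std::uint8_t* s, std::uint8_t* d,
                                std::size_t count) {
  // Head: single bytes until the source is 8-byte aligned.
  while (count > 0 && (reinterpret_cast<std::uintptr_t>(s) & 7) != 0) {
    *d++ = *s++;
    --count;
  }
  // Bulk: whole 64-bit words; memcpy stands in for the word loads and
  // stores (the destination may still be unaligned, which the
  // hardware handles).
  while (count >= sizeof(std::uint64_t)) {
    std::uint64_t tmp;
    std::memcpy(&tmp, s, sizeof tmp);
    std::memcpy(d, &tmp, sizeof tmp);
    s += sizeof tmp;
    d += sizeof tmp;
    count -= sizeof tmp;
  }
  // Tail: whatever is left over.
  while (count > 0) {
    *d++ = *s++;
    --count;
  }
}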

Andrew.



diff -r 423577eb8f6e -r 7c900775ce48 src/cpu/aarch64/vm/assembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Fri Sep 13 18:22:52 2013 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Sep 19 11:18:32 2013 +0100
@@ -613,16 +613,25 @@
 #endif

 public:
+  enum { instruction_size = 4 };
+
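+  // Build a pre-indexed Address when preIncrement is true, otherwise a
+  // post-indexed one; this lets stub code choose the addressing mode
+  // from a flag computed at code-generation time.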
+  Address adjust(Register base, int offset, bool preIncrement) {
+    if (preIncrement)
+      return Address(Pre(base, offset));
+    else
+      return Address(Post(base, offset));
+  }
+
   Address pre(Register base, int offset) {
-    return Address(Pre(base, offset));
+    return adjust(base, offset, true);
   }

   Address post (Register base, int offset) {
-    return Address(Post(base, offset));
+    return adjust(base, offset, false);
   }

   Instruction_aarch64* current;
-public:
+
   void set_current(Instruction_aarch64* i) { current = i; }

   void f(unsigned val, int msb, int lsb) {
diff -r 423577eb8f6e -r 7c900775ce48 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Fri Sep 13 18:22:52 2013 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Thu Sep 19 11:18:32 2013 +0100
@@ -2177,7 +2177,8 @@
   if (basic_type == T_ARRAY) basic_type = T_OBJECT;

   // if we don't know anything, just go through the generic arraycopy
-  if (default_type == NULL || (basic_type == T_OBJECT && UseCompressedOops)) {
+  if (default_type == NULL // || basic_type == T_OBJECT
+      ) {
     Label done;
     assert(src == r1 && src_pos == r2, "mismatch in calling convention");

@@ -2488,7 +2489,13 @@
   bool aligned = (flags & LIR_OpArrayCopy::unaligned) == 0;
   const char *name;
   address entry = StubRoutines::select_arraycopy_function(basic_type, aligned, disjoint, name, false);
-  __ call_VM_leaf(entry, 3);
+
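+  // If the selected stub is in the code cache we can reach it with a
+  // direct branch-and-link; otherwise fall back to a leaf runtime call.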
+  CodeBlob *cb = CodeCache::find_blob(entry);
+  if (cb) {
+    __ bl(RuntimeAddress(entry));
+  } else {
+    __ call_VM_leaf(entry, 3);
+  }

   __ bind(*stub->continuation());
 }
diff -r 423577eb8f6e -r 7c900775ce48 src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Fri Sep 13 18:22:52 2013 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Sep 19 11:18:32 2013 +0100
@@ -1285,12 +1285,13 @@
   void adrp(Register reg1, const Address &dest, unsigned long &byte_offset);

   void tableswitch(Register index, jint lowbound, jint highbound,
-		   Label &jumptable, Label &jumptable_end) {
+		   Label &jumptable, Label &jumptable_end, int stride = 1) {
     adr(rscratch1, jumptable);
     subsw(rscratch2, index, lowbound);
     subsw(zr, rscratch2, highbound - lowbound);
     br(Assembler::HS, jumptable_end);
-    add(rscratch1, rscratch1, rscratch2, ext::sxtw, 2);
+    add(rscratch1, rscratch1, rscratch2,
+	ext::sxtw, exact_log2(stride * Assembler::instruction_size));
     br(rscratch1);
   }

diff -r 423577eb8f6e -r 7c900775ce48 src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Fri Sep 13 18:22:52 2013 +0100
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Sep 19 11:18:32 2013 +0100
@@ -49,9 +49,8 @@
 // For a more detailed description of the stub routine structure
 // see the comment in stubRoutines.hpp

+#undef __
 #define __ _masm->
-#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
-#define a__ ((Assembler*)_masm)->

 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) /* nothing */
@@ -826,7 +825,40 @@
   //
   //     Destroy no registers!
   //
-  void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) { Unimplemented(); }
+  void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
+    BarrierSet* bs = Universe::heap()->barrier_set();
+    switch (bs->kind()) {
+    case BarrierSet::G1SATBCT:
+    case BarrierSet::G1SATBCTLogging:
+      // With G1, don't generate the call if we statically know that the target is uninitialized
+      if (!dest_uninitialized) {
+	__ push(0x3fffffff, sp);         // integer registers except lr & sp
+	if (count == c_rarg0) {
+	  if (addr == c_rarg1) {
+	    // exactly backwards!!
+	    __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
+	    __ ldp(c_rarg1, c_rarg0, __ post(sp, 2 * wordSize));
+	  } else {
+	    __ mov(c_rarg1, count);
+	    __ mov(c_rarg0, addr);
+	  }
+	} else {
+	  __ mov(c_rarg0, addr);
+	  __ mov(c_rarg1, count);
+	}
+	__ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
+	__ pop(0x3fffffff, sp);          // integer registers except lr & sp
+      }
+      break;
+    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableExtension:
+    case BarrierSet::ModRef:
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+  }

   //
   // Generate code for an array write post barrier
@@ -838,38 +870,323 @@
   //
   //  The input registers are overwritten.
   //  The ending address is inclusive.
-  void  gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) { Unimplemented(); }
+  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
+    assert_different_registers(start, end, scratch);
+    BarrierSet* bs = Universe::heap()->barrier_set();
+    switch (bs->kind()) {
+      case BarrierSet::G1SATBCT:
+      case BarrierSet::G1SATBCTLogging:

+        {
+	  __ push(0x3fffffff, sp);         // integer registers except lr & sp
+          // must compute element count unless barrier set interface is changed (other platforms supply count)
+          assert_different_registers(start, end, scratch);
+          __ lea(scratch, Address(end, BytesPerHeapOop));
+          __ sub(scratch, scratch, start);               // subtract start to get #bytes
+          __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
+          __ mov(c_rarg0, start);
+          __ mov(c_rarg1, scratch);
+          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
+	  __ pop(0x3fffffff, sp);          // integer registers except lr & sp
+        }
+        break;
+      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableExtension:
+        {
+          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

-  // Copy big chunks forward
+          Label L_loop;
+
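+          // Convert the address range to a range of card indexes and
+          // dirty every card it covers.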
+           __ lsr(start, start, CardTableModRefBS::card_shift);
+           __ add(end, end, BytesPerHeapOop);
+           __ lsr(end, end, CardTableModRefBS::card_shift);
+           __ sub(end, end, start); // number of bytes to copy
+
+          const Register count = end; // 'end' register contains bytes count now
+	  __ mov(scratch, (address)ct->byte_map_base);
+          __ add(start, start, scratch);
+	  __ BIND(L_loop);
+	  __ strb(zr, Address(start, count));
+          __ subs(count, count, 1);
+          __ br(Assembler::HI, L_loop);
+        }
+        break;
+      default:
+        ShouldNotReachHere();
+
+    }
+  }
+
+  typedef enum {
+    copy_forwards = 1,
+    copy_backwards = -1
+  } copy_direction;
+
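+  // Copy count 64-bit words from s to d, one word at a time, adjusting
+  // s and d as we go; direction selects forwards or backwards.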
+  void copy_longs_small(Register s, Register d, Register count, copy_direction direction) {
+    Label again, around;
+    __ cbz(count, around);
+    __ bind(again);
+    __ ldr(r16, Address(__ adjust(s, wordSize * direction, direction == copy_backwards)));
+    __ str(r16, Address(__ adjust(d, wordSize * direction, direction == copy_backwards)));
+    __ sub(count, count, 1);
+    __ cbnz(count, again);
+    __ bind(around);
+  }
+
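+  // Bulk word copy: peel off count % 4 words with copy_longs_small, then
+  // copy four words per iteration with ldp/stp pairs, prefetching ahead
+  // on forward copies.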
+  void copy_longs(Register s, Register d, Register count, Register tmp, copy_direction direction) {
+    __ andr(tmp, count, 3);
+    copy_longs_small(s, d, tmp, direction);
+    __ andr(count, count, -4);
+
+    Label again, around;
+    __ cbz(count, around);
+    __ push(r18->bit() | r19->bit(), sp);
+    __ bind(again);
+    if (direction != copy_backwards) {
+      __ prfm(Address(s, direction == copy_forwards ? 4 * wordSize : -6 * wordSize));
+      __ prfm(Address(s, direction == copy_forwards ? 6 * wordSize : -8 * wordSize));
+    }
+    __ ldp(r16, r17, Address(__ adjust(s, 2 * wordSize * direction, direction == copy_backwards)));
+    __ ldp(r18, r19, Address(__ adjust(s, 2 * wordSize * direction, direction == copy_backwards)));
+    __ stp(r16, r17, Address(__ adjust(d, 2 * wordSize * direction, direction == copy_backwards)));
+    __ stp(r18, r19, Address(__ adjust(d, 2 * wordSize * direction, direction == copy_backwards)));
+    __ subs(count, count, 4);
+    __ cbnz(count, again);
+    __ pop(r18->bit() | r19->bit(), sp);
+    __ bind(around);
+  }
+
+  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step, Label &done) {
+    // Small copy: less than one word.
+    bool is_backwards = step < 0;
+    int granularity = abs(step);
+
+    __ cbz(count, done);
+    {
+      Label loop;
+      __ bind(loop);
+      switch (granularity) {
+      case 1:
+	__ ldrb(tmp, Address(__ adjust(s, step, is_backwards)));
+	__ strb(tmp, Address(__ adjust(d, step, is_backwards)));
+	break;
+      case 2:
+	__ ldrh(tmp, Address(__ adjust(s, step, is_backwards)));
+	__ strh(tmp, Address(__ adjust(d, step, is_backwards)));
+	break;
+      case 4:
+	__ ldrw(tmp, Address(__ adjust(s, step, is_backwards)));
+	__ strw(tmp, Address(__ adjust(d, step, is_backwards)));
+	break;
+      default:
+	assert(false, "copy_memory called with impossible step");
+      }
+      __ sub(count, count, 1);
+      __ cbnz(count, loop);
+      __ b(done);
+    }
+  }
+
+  // All-singing all-dancing memory copy.
+  //
+  // Copy count units of memory from s to d.  The size of a unit is
+  // step, which can be positive or negative depending on the direction
+  // of copy.  If is_aligned is false, we align the source address.
+  //
+
+  void copy_memory(bool is_aligned, Register s, Register d,
+		   Register count, Register tmp, int step) {
+    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
+    bool is_backwards = step < 0;
+    int granularity = abs(step);
+
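+    // A backwards copy starts at the end of the arrays and moves the
+    // pointers down as it copies.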
+    if (is_backwards) {
+      __ lea(s, Address(s, count, Address::uxtw(exact_log2(-step))));
+      __ lea(d, Address(d, count, Address::uxtw(exact_log2(-step))));
+    }
+
+    if (granularity == wordSize) {
+      copy_longs(c_rarg0, c_rarg1, c_rarg2, rscratch1, direction);
+      return;
+    }
+
+    Label done, large;
+
+    if (! is_aligned) {
+      __ cmp(count, wordSize/granularity);
+      __ br(Assembler::HS, large);
+      copy_memory_small(s, d, count, tmp, step, done);
+      __ bind(large);
+
+      // Now we've got the small case out of the way we can align the
+      // source address.
+      {
+	Label skip1, skip2, skip4;
+
+	switch (granularity) {
+	case 1:
+	  __ tst(s, 1);
+	  __ br(Assembler::EQ, skip1);
+	  __ ldrb(tmp, Address(__ adjust(s, direction, is_backwards)));
+	  __ strb(tmp, Address(__ adjust(d, direction, is_backwards)));
+	  __ sub(count, count, 1);
+	  __ bind(skip1);
+	  // fall through
+	case 2:
+	  __ tst(s, 2/granularity);
+	  __ br(Assembler::EQ, skip2);
+	  __ ldrh(tmp, Address(__ adjust(s, 2 * direction, is_backwards)));
+	  __ strh(tmp, Address(__ adjust(d, 2 * direction, is_backwards)));
+	  __ sub(count, count, 2/granularity);
+	  __ bind(skip2);
+	  // fall through
+	case 4:
+	  __ tst(s, 4/granularity);
+	  __ br(Assembler::EQ, skip4);
+	  __ ldrw(tmp, Address(__ adjust(s, 4 * direction, is_backwards)));
+	  __ strw(tmp, Address(__ adjust(d, 4 * direction, is_backwards)));
+	  __ sub(count, count, 4/granularity);
+	  __ bind(skip4);
+	}
+      }
+    }
+
+    // s is now word-aligned.
+
+    // We have a count of units and some trailing bytes.  Adjust the
+    // count and do a bulk copy of words.
+    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
+    __ sub(count, count, rscratch2, Assembler::LSL, exact_log2(wordSize/granularity));
+
+    copy_longs(s, d, rscratch2, rscratch1, direction);
+
+    // And the tail.
+
+    copy_memory_small(s, d, count, tmp, step, done);
+
+    __ bind(done);
+  }
+
+  // Scan over array at d for count oops, verifying each one.
+  // Preserves d and count, clobbers rscratch1 and rscratch2.
+  void verify_oop_array (size_t size, Register d, Register count, Register temp) {
+    Label loop, end;
+    __ mov(rscratch1, d);
+    __ mov(rscratch2, zr);
+    __ bind(loop);
+    __ cmp(rscratch2, count);
+    __ br(Assembler::HS, end);
+    if (size == (size_t)wordSize) {
+      __ ldr(temp, Address(d, rscratch2, Address::uxtw(exact_log2(size))));
+      __ verify_oop(temp);
+    } else {
+      __ ldrw(temp, Address(d, rscratch2, Address::uxtw(exact_log2(size))));
+      __ decode_heap_oop(temp); // calls verify_oop
+    }
+    __ add(rscratch2, rscratch2, 1);
+    __ b(loop);
+    __ bind(end);
+  }
+
+  // Arguments:
+  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+  //             ignored
+  //   is_oop  - true => oop array, so generate store check code
+  //   name    - stub name string
   //
   // Inputs:
-  //   end_from     - source arrays end address
-  //   end_to       - destination array end address
-  //   qword_count  - 64-bits element count, negative
-  //   to           - scratch
-  //   L_copy_32_bytes - entry label
-  //   L_copy_8_bytes  - exit  label
+  //   c_rarg0   - source array address
+  //   c_rarg1   - destination array address
+  //   c_rarg2   - element count, treated as ssize_t, can be zero
   //
-  void copy_32_bytes_forward(Register end_from, Register end_to,
-                             Register qword_count, Register to,
-                             Label& L_copy_32_bytes, Label& L_copy_8_bytes) { Unimplemented(); }
+  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
+  // the hardware handle it.  The two dwords within qwords that span
+  // cache line boundaries will still be loaded and stored atomically.
+  //
+  // Side Effects:
+  //   disjoint_int_copy_entry is set to the no-overlap entry point
+  //   used by generate_conjoint_int_oop_copy().
+  //
+  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
+				  const char *name, bool dest_uninitialized = false) {
+    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+    if (entry != NULL) {
+      *entry = __ pc();
+      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
+      BLOCK_COMMENT("Entry:");
+    }
+    __ enter();
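+    // r16 and r17 are used as temporaries by the copy code, so save them.
+    // r18 and r19 are saved and restored by copy_longs itself.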
+    __ push(r16->bit() | r17->bit(), sp);
+    if (is_oop) {
+      __ push(d->bit() | count->bit(), sp);
+      // no registers are destroyed by this call
+      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
+    }
+    copy_memory(aligned, s, d, count, rscratch1, size);
+    if (is_oop) {
+      __ pop(d->bit() | count->bit(), sp);
+      if (VerifyOops)
+        verify_oop_array(size, d, count, r16);
+      __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
+      gen_write_ref_array_post_barrier(d, count, rscratch1);
+    }
+    __ pop(r16->bit() | r17->bit(), sp);
+    __ leave();
+    __ ret(lr);
+    return start;
+  }

-
-  // Copy big chunks backward
+  // Arguments:
+  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+  //             ignored
+  //   is_oop  - true => oop array, so generate store check code
+  //   name    - stub name string
   //
   // Inputs:
-  //   from         - source arrays address
-  //   dest         - destination array address
-  //   qword_count  - 64-bits element count
-  //   to           - scratch
-  //   L_copy_32_bytes - entry label
-  //   L_copy_8_bytes  - exit  label
+  //   c_rarg0   - source array address
+  //   c_rarg1   - destination array address
+  //   c_rarg2   - element count, treated as ssize_t, can be zero
   //
-  void copy_32_bytes_backward(Register from, Register dest,
-                              Register qword_count, Register to,
-                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) { Unimplemented(); }
+  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
+  // the hardware handle it.  The two dwords within qwords that span
+  // cache line boundaries will still be loaded and stored atomically.
+  //
+  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
+				 address *entry, const char *name,
+				 bool dest_uninitialized = false) {
+    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    __ cmp(d, s);
+    __ br(Assembler::LS, nooverlap_target);
+
+    __ enter();
+    __ push(r16->bit() | r17->bit(), sp);
+    if (is_oop) {
+      __ push(d->bit() | count->bit(), sp);
+      // no registers are destroyed by this call
+      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
+    }
+    copy_memory(aligned, s, d, count, rscratch1, -size);
+    if (is_oop) {
+      __ pop(d->bit() | count->bit(), sp);
+      if (VerifyOops)
+        verify_oop_array(size, d, count, r16);
+      __ lea(c_rarg2, Address(c_rarg1, c_rarg2, Address::uxtw(exact_log2(size))));
+      gen_write_ref_array_post_barrier(c_rarg1, c_rarg2, rscratch1);
+    }
+    __ pop(r16->bit() | r17->bit(), sp);
+    __ leave();
+    __ ret(lr);
+
+    return start;
+  }

   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
@@ -887,10 +1204,19 @@
   // and stored atomically.
   //
   // Side Effects:
+  //   disjoint_byte_copy_entry is set to the no-overlap entry point.
+  //
+  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
+  // we let the hardware handle it.  The one to eight bytes within words,
+  // dwords or qwords that span cache line boundaries will still be loaded
+  // and stored atomically.
+  //
+  // Side Effects:
   //   disjoint_byte_copy_entry is set to the no-overlap entry point
   //   used by generate_conjoint_byte_copy().
   //
-  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { Unimplemented(); return 0; }
+  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
+    return generate_disjoint_copy(sizeof (jbyte), aligned, /*is_oop*/false, entry, name);
+  }

   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
@@ -908,7 +1234,9 @@
   // and stored atomically.
   //
   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
-                                      address* entry, const char *name) { Unimplemented(); return 0; }
+                                      address* entry, const char *name) {
+    return generate_conjoint_copy(sizeof (jbyte), aligned, /*is_oop*/false, nooverlap_target, entry, name);
+  }

   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
@@ -929,9 +1257,10 @@
   //   disjoint_short_copy_entry is set to the no-overlap entry point
   //   used by generate_conjoint_short_copy().
   //
-  address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) { Unimplemented(); return 0; }
-
-  address generate_fill(BasicType t, bool aligned, const char *name) { Unimplemented(); return 0; }
+  address generate_disjoint_short_copy(bool aligned,
+				       address* entry, const char *name) {
+    return generate_disjoint_copy(sizeof (jshort), aligned, /*is_oop*/false, entry, name);
+  }

   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
@@ -949,12 +1278,13 @@
   // and stored atomically.
   //
   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
-                                       address *entry, const char *name) { Unimplemented(); return 0; }
+                                       address *entry, const char *name) {
+    return generate_conjoint_copy(sizeof (jshort), aligned, /*is_oop*/false, nooverlap_target, entry, name);
+  }

   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   //             ignored
-  //   is_oop  - true => oop array, so generate store check code
   //   name    - stub name string
   //
   // Inputs:
@@ -970,13 +1300,15 @@
   //   disjoint_int_copy_entry is set to the no-overlap entry point
   //   used by generate_conjoint_int_oop_copy().
   //
-  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
-                                         const char *name, bool dest_uninitialized = false) { Unimplemented(); return 0; }
+  address generate_disjoint_int_copy(bool aligned, address *entry,
+					 const char *name, bool dest_uninitialized = false) {
+    const bool not_oop = false;
+    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
+  }

   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   //             ignored
-  //   is_oop  - true => oop array, so generate store check code
   //   name    - stub name string
   //
   // Inputs:
@@ -988,42 +1320,93 @@
   // the hardware handle it.  The two dwords within qwords that span
   // cache line boundaries will still be loaded and stored atomicly.
   //
-  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
-                                         address *entry, const char *name,
-                                         bool dest_uninitialized = false) { Unimplemented(); return 0; }
+  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
+				     address *entry, const char *name,
+				     bool dest_uninitialized = false) {
+    const bool not_oop = false;
+    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
+  }
+

   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
   //             ignored
-  //   is_oop  - true => oop array, so generate store check code
   //   name    - stub name string
   //
   // Inputs:
   //   c_rarg0   - source array address
   //   c_rarg1   - destination array address
-  //   c_rarg2   - element count, treated as ssize_t, can be zero
+  //   c_rarg2   - element count, treated as size_t, can be zero
   //
- // Side Effects:
+  // Side Effects:
   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
   //
-  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
-                                          const char *name, bool dest_uninitialized = false) { Unimplemented(); return 0; }
+  address generate_disjoint_long_copy(bool aligned, address *entry,
+                                          const char *name, bool dest_uninitialized = false) {
+    const bool not_oop = false;
+    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
+  }

   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
   //             ignored
-  //   is_oop  - true => oop array, so generate store check code
   //   name    - stub name string
   //
   // Inputs:
   //   c_rarg0   - source array address
   //   c_rarg1   - destination array address
-  //   c_rarg2   - element count, treated as ssize_t, can be zero
+  //   c_rarg2   - element count, treated as size_t, can be zero
   //
-  address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
-                                          address nooverlap_target, address *entry,
-                                          const char *name, bool dest_uninitialized = false) { Unimplemented(); return 0; }
+  address generate_conjoint_long_copy(bool aligned,
+				      address nooverlap_target, address *entry,
+				      const char *name, bool dest_uninitialized = false) {
+    const bool not_oop = false;
+    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
+  }
+
+  // Arguments:
+  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
+  //             ignored
+  //   name    - stub name string
+  //
+  // Inputs:
+  //   c_rarg0   - source array address
+  //   c_rarg1   - destination array address
+  //   c_rarg2   - element count, treated as size_t, can be zero
+  //
+  // Side Effects:
+  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
+  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
+  //
+  address generate_disjoint_oop_copy(bool aligned, address *entry,
+				     const char *name, bool dest_uninitialized = false) {
+    const bool is_oop = true;
+    if (UseCompressedOops)
+      return generate_disjoint_copy(sizeof (jint), aligned, is_oop, entry, name);
+    else
+      return generate_disjoint_copy(sizeof (jlong), aligned, is_oop, entry, name);
+  }
+
+  // Arguments:
+  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
+  //             ignored
+  //   name    - stub name string
+  //
+  // Inputs:
+  //   c_rarg0   - source array address
+  //   c_rarg1   - destination array address
+  //   c_rarg2   - element count, treated as size_t, can be zero
+  //
+  address generate_conjoint_oop_copy(bool aligned,
+				     address nooverlap_target, address *entry,
+				     const char *name, bool dest_uninitialized = false) {
+    const bool is_oop = true;
+    if (UseCompressedOops)
+      return generate_conjoint_copy(sizeof (jint), aligned, is_oop, nooverlap_target, entry, name);
+    else
+      return generate_conjoint_copy(sizeof (jlong), aligned, is_oop, nooverlap_target, entry, name);
+  }


   // Helper for generating a dynamic type check.
@@ -1110,60 +1493,91 @@
     assert(count == 0, "huh?");
   }

+
   void generate_arraycopy_stubs() {
-    // Call the conjoint generation methods immediately after
-    // the disjoint ones so that short branches from the former
-    // to the latter can be generated.
-#if 0
-    StubRoutines::_jbyte_disjoint_arraycopy  = (address) fake_arraycopy_stub;
-    StubRoutines::_jbyte_arraycopy           = (address) fake_arraycopy_stub;
+    address entry;
+    address entry_jbyte_arraycopy;
+    address entry_jshort_arraycopy;
+    address entry_jint_arraycopy;
+    address entry_oop_arraycopy;
+    address entry_jlong_arraycopy;
+    address entry_checkcast_arraycopy;
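+    // checkcast_arraycopy is not generated yet (it is in the #if 0 block
+    // below), so entry_checkcast_arraycopy is only declared here.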

-    StubRoutines::_jshort_disjoint_arraycopy = (address) fake_arraycopy_stub;
-    StubRoutines::_jshort_arraycopy          = (address) fake_arraycopy_stub;
+    //*** jbyte
+    // Always need aligned and unaligned versions
+    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
+                                                                                  "jbyte_disjoint_arraycopy");
+    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
+                                                                                  &entry_jbyte_arraycopy,
+                                                                                  "jbyte_arraycopy");
+    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
+                                                                                  "arrayof_jbyte_disjoint_arraycopy");
+    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
+                                                                                  "arrayof_jbyte_arraycopy");

-    StubRoutines::_jint_disjoint_arraycopy   = (address) fake_arraycopy_stub;
-    StubRoutines::_jint_arraycopy            = (address) fake_arraycopy_stub;
+    //*** jshort
+    // Always need aligned and unaligned versions
+    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
+                                                                                    "jshort_disjoint_arraycopy");
+    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
+                                                                                    &entry_jshort_arraycopy,
+                                                                                    "jshort_arraycopy");
+    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
+                                                                                    "arrayof_jshort_disjoint_arraycopy");
+    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
+                                                                                    "arrayof_jshort_arraycopy");

-    StubRoutines::_jlong_disjoint_arraycopy  = (address) fake_arraycopy_stub;
-    StubRoutines::_jlong_arraycopy           = (address) fake_arraycopy_stub;
-#endif
+    //*** jint
+    // Aligned versions
+    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
+										"arrayof_jint_disjoint_arraycopy");
+    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
+										"arrayof_jint_arraycopy");
+    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
+    // entry_jint_arraycopy always points to the unaligned version
+    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
+										"jint_disjoint_arraycopy");
+    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
+										&entry_jint_arraycopy,
+										"jint_arraycopy");
+
+    //*** jlong
+    // It is always aligned
+    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
+										  "arrayof_jlong_disjoint_arraycopy");
+    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
+										  "arrayof_jlong_arraycopy");
+    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
+    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
+
+    //*** oops
+    {
+      // With compressed oops we need unaligned versions; notice that
+      // we overwrite entry_oop_arraycopy.
+      bool aligned = !UseCompressedOops;
+
+      StubRoutines::_arrayof_oop_disjoint_arraycopy
+	= generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
+      StubRoutines::_arrayof_oop_arraycopy
+	= generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
+      // Aligned versions without pre-barriers
+      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
+	= generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
+				     /*dest_uninitialized*/true);
+      StubRoutines::_arrayof_oop_arraycopy_uninit
+	= generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
+				     /*dest_uninitialized*/true);
+    }
+
+    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
+    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
+    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
+    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

 #if 0
-    StubRoutines::_oop_disjoint_arraycopy    = ShouldNotCallThisStub();
-    StubRoutines::_oop_arraycopy             = ShouldNotCallThisStub();
-
-    StubRoutines::_checkcast_arraycopy       = ShouldNotCallThisStub();
-    StubRoutines::_unsafe_arraycopy          = ShouldNotCallThisStub();
-    StubRoutines::_generic_arraycopy         = ShouldNotCallThisStub();
-#endif
-
-#if 0
-    // We don't generate specialized code for HeapWord-aligned source
-    // arrays, so just use the code we've already generated
-    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
-      StubRoutines::_jbyte_disjoint_arraycopy;
-    StubRoutines::_arrayof_jbyte_arraycopy =
-      StubRoutines::_jbyte_arraycopy;
-
-    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
-      StubRoutines::_jshort_disjoint_arraycopy;
-    StubRoutines::_arrayof_jshort_arraycopy =
-      StubRoutines::_jshort_arraycopy;
-
-    StubRoutines::_arrayof_jint_disjoint_arraycopy =
-      StubRoutines::_jint_disjoint_arraycopy;
-    StubRoutines::_arrayof_jint_arraycopy =
-      StubRoutines::_jint_arraycopy;
-
-    StubRoutines::_arrayof_jlong_disjoint_arraycopy =
-      StubRoutines::_jlong_disjoint_arraycopy;
-    StubRoutines::_arrayof_jlong_arraycopy =
-      StubRoutines::_jlong_arraycopy;
-
-    StubRoutines::_arrayof_oop_disjoint_arraycopy =
-      StubRoutines::_oop_disjoint_arraycopy;
-    StubRoutines::_arrayof_oop_arraycopy =
-      StubRoutines::_oop_arraycopy;
+    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
+    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
+                                                                        /*dest_uninitialized*/true);
 #endif
   }



