[aarch64-port-dev ] Rewrite object initialization

Andrew Haley aph at redhat.com
Tue Nov 26 10:16:24 PST 2013


This replaces the object initialization code in C1 with the much
slicker code used by C2.

Andrew.


# HG changeset patch
# User aph
# Date 1385485750 0
# Node ID e9f4f09746dcd90b945525a3ed0abb9115dcca02
# Parent  3390e44a940f942ebdd245831951b09148248c51
Rewrite object initialization.

diff -r 3390e44a940f -r e9f4f09746dc src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp	Tue Nov 26 11:16:12 2013 +0000
+++ b/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp	Tue Nov 26 17:09:10 2013 +0000
@@ -221,57 +221,46 @@
   block_comment("zero memory");
 #endif

-  Label finished;
+  Label loop;
+  Label entry;
+
+//  Algorithm:
+//
+//    scratch1 = cnt & 7;
+//    cnt -= scratch1;
+//    p += scratch1;
+//    switch (scratch1) {
+//      do {
+//        cnt -= 8; p[-8] = 0;
+//        case 7:
+//          p[-7] = 0;
+//        case 6:
+//          p[-6] = 0;
+//          // ...
+//        case 1:
+//          p[-1] = 0;
+//        case 0:
+//          p += 8;
+//      } while (cnt);
+//    }
+
+  const int unroll = 8; // Number of str(zr) instructions we'll unroll

   lsr(len, len, LogBytesPerWord);
-  mov(rscratch1, addr);
-
-  // The algorithm first zeroes words until the number of words
-  // remaining is a multiple of 8, then enters a loop that writes 8
-  // words at a time.  The idea is to get small arrays done quickly
-  // because these are the common case.  Also, we don't want to bloat
-  // the VM with a lot of code: it's a compromise between speed and
-  // size.  We should really emit the large block out of line.
-
-  Label is_even;
-  tst(len, 1);
-  br(Assembler::EQ, is_even);
-  str(zr, Address(post(rscratch1, wordSize)));
-  sub(len, len, 1);
-  bind(is_even);
-
-  // len is now a multiple of 2
-
-  {
-    // Initialize the first few words.  This loop iterates at most 3
-    // times.
-
-    Label bottom, loop;
-    mov(t1, zr);
-    b(bottom);
-
-    bind(loop);
-    stp(zr, t1, Address(post(rscratch1, 2 * wordSize)));
-    sub(len, len, 2);
-    bind(bottom);
-    tst(len, 7);
-    br(NE, loop);
-  }
-
-  // len is now a multiple of 8
-
-  cbz(len, finished);
-
-  Label top;
-  bind(top);
-  stp(zr, t1, Address(post(rscratch1, 2 * wordSize)));
-  stp(zr, t1, Address(post(rscratch1, 2 * wordSize)));
-  stp(zr, t1, Address(post(rscratch1, 2 * wordSize)));
-  stp(zr, t1, Address(post(rscratch1, 2 * wordSize)));
-  sub(len, len, 8);
-  cbnz(len, top);
-
-  bind(finished);
+  andr(rscratch1, len, unroll - 1);  // scratch1 = cnt % unroll
+  sub(len, len, rscratch1);      // cnt -= scratch1
+  // t1 always points to the end of the region we're about to zero
+  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
+  adr(rscratch2, entry);
+  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
+  br(rscratch2);
+  bind(loop);
+  sub(len, len, unroll);
+  for (int i = -unroll; i < 0; i++)
+    str(zr, Address(t1, i * wordSize));
+  bind(entry);
+  add(t1, t1, unroll * wordSize);
+  cbnz(len, loop);
 }

 // preserves obj, destroys len_in_bytes
@@ -323,7 +312,7 @@

   // clear rest of allocated space
   const Register index = t2;
-  const int threshold = 8 * BytesPerWord;   // approximate break even point for code size (see comments below)
+  const int threshold = 16 * BytesPerWord;   // approximate break even point for code size (see comments below)
   if (var_size_in_bytes != noreg) {
     mov(index, var_size_in_bytes);
     initialize_body(obj, index, hdr_size_in_bytes, t1);
@@ -334,44 +323,34 @@
       str(zr, Address(obj, i));
       i += BytesPerWord;
     }
-    if (i < con_size_in_bytes) {
-      mov(t1, zr);
-    }
     for (; i < con_size_in_bytes; i += 2 * BytesPerWord)
-      stp(zr, t1, Address(obj, i));
+      stp(zr, zr, Address(obj, i));
   } else if (con_size_in_bytes > hdr_size_in_bytes) {
     block_comment("zero memory");
     // use loop to null out the fields
-    // initialize last object field first if odd number of fields
+
     int words = (con_size_in_bytes - hdr_size_in_bytes) / BytesPerWord;
     mov(index,  words / 8);
-    lea(rscratch1, Address(obj, (con_size_in_bytes & - ((2 * BytesPerWord)-1))));
-    // initialize last object field if constant size is odd
-    if ((words % 2) != 0) {
-      str(zr, Address(obj, con_size_in_bytes - (1*BytesPerWord)));
-      words--;
+
+    const int unroll = 8; // Number of str(zr) instructions we'll unroll
+    int remainder = words % unroll;
+    lea(rscratch1, Address(obj, hdr_size_in_bytes + remainder * BytesPerWord));
+
+    Label entry_point, loop;
+    b(entry_point);
+
+    bind(loop);
+    sub(index, index, 1);
+    for (int i = -unroll; i < 0; i++) {
+      if (-i == remainder)
+	bind(entry_point);
+      str(zr, Address(rscratch1, i * wordSize));
     }
-    // initialize remaining object fields
-    mov(t1, zr);
-    {
-      Label top, entry_point;
+    if (remainder == 0)
+      bind(entry_point);
+    add(rscratch1, rscratch1, unroll * wordSize);
+    cbnz(index, loop);

-      int remainder = words % 8;
-      if (remainder != 0)
-	b(entry_point);
-
-      bind(top);
-      sub(index, index, 1);
-      stp(zr, t1, pre(rscratch1, -2 * BytesPerWord));
-      if (remainder == 6) bind(entry_point);
-      stp(zr, t1, pre(rscratch1, -2 * BytesPerWord));
-      if (remainder == 4) bind(entry_point);
-      stp(zr, t1, pre(rscratch1, -2 * BytesPerWord));
-      if (remainder == 2) bind(entry_point);
-      stp(zr, t1, pre(rscratch1, -2 * BytesPerWord));
-
-      cbnz(index, top);
-    }
   }

   if (CURRENT_ENV->dtrace_alloc_probes()) {



More information about the aarch64-port-dev mailing list