[aarch64-port-dev ] Optimize pushes and pops

Andrew Haley aph at redhat.com
Mon May 12 16:33:10 UTC 2014


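This patch converts the old sequences of pre-decremented stores and
post-incremented loads into sequences that adjust the stack pointer
once and otherwise use stores and loads without writeback.  This avoids
pipeline stalls caused by the address dependency that each writeback
creates for the instruction that follows it.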

After this patch we get:

  0x00007fffd10d1ff8: stp	x1, x2, [sp,#-48]!
  0x00007fffd10d1ffc: stp	x3, x4, [sp,#16]
  0x00007fffd10d2000: stp	x5, x6, [sp,#32]

and

  0x00007fffd10d202c: ldp	xscratch1, xmethod, [sp],#16
  0x00007fffd10d2030: ldp	x1, x2, [sp]
  0x00007fffd10d2034: ldp	x3, x4, [sp,#16]
  0x00007fffd10d2038: ldp	x5, x6, [sp,#32]
  0x00007fffd10d203c: add	sp, sp, #0x30

Andrew.


# HG changeset patch
# User aph
# Date 1399908399 -3600
#      Mon May 12 16:26:39 2014 +0100
# Node ID 3852a506a19bb79e0a77d8474978f09484fc3fed
# Parent  ac30fdebd5f5811d768d493d58d40852cff0886c
Tidy up stack frame handling.

diff -r ac30fdebd5f5 -r 3852a506a19b src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad	Mon May 12 14:34:00 2014 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad	Mon May 12 16:26:39 2014 +0100
@@ -911,19 +911,16 @@
   if (C->need_stack_bang(framesize))
     __ generate_stack_overflow_check(framesize);

-  // push lr and rfp to create a frame
-  __ stp(rfp, lr, Address(__ pre(sp, -2 * wordSize)));
-
-  // allow for already pushed values
-  framesize -= 2 * wordSize;
-
-  if (framesize) {
-    if (Assembler::operand_valid_for_add_sub_immediate(framesize)) {
-      __ sub(sp, sp, framesize);
-    } else {
-      __ mov(rscratch1, framesize);
-      __ sub(sp, sp, rscratch1);
-    }
+  if (framesize == 0) {
+    // Is this even possible?
+    __ stp(rfp, lr, Address(__ pre(sp, -2 * wordSize)));
+  } else if (framesize < (1 << 12)) {
+    __ sub(sp, sp, framesize);
+    __ stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
+  } else {
+    __ stp(rfp, lr, Address(__ pre(sp, -2 * wordSize)));
+    __ mov(rscratch1, framesize - 2 * wordSize);
+    __ sub(sp, sp, rscratch1);
   }

   if (NotifySimulator) {
@@ -993,19 +990,17 @@
   MacroAssembler _masm(&cbuf);
   int framesize = C->frame_slots() << LogBytesPerInt;

-  framesize -= 2 * wordSize;
-
-  if (framesize) {
-    if (Assembler::operand_valid_for_add_sub_immediate(framesize)) {
-      __ add(sp, sp, framesize);
-    } else {
-      __ mov(rscratch1, framesize);
-      __ add(sp, sp, rscratch1);
-    }
+  if (framesize == 0) {
+    __ ldp(rfp, lr, Address(__ post(sp, 2 * wordSize)));
+  } else if (framesize < (1 << 12)) {
+    __ ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
+    __ add(sp, sp, framesize);
+  } else {
+    __ mov(rscratch1, framesize - 2 * wordSize);
+    __ add(sp, sp, rscratch1);
+    __ ldp(rfp, lr, Address(__ post(sp, 2 * wordSize)));
   }

-  __ ldp(rfp, lr, Address(__ post(sp, 2 * wordSize)));
-
   if (NotifySimulator) {
     __ notify(Assembler::method_reentry);
   }
diff -r ac30fdebd5f5 -r 3852a506a19b src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon May 12 14:34:00 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon May 12 16:26:39 2014 +0100
@@ -1665,7 +1665,7 @@

   // Scan bitset to accumulate register pairs
   unsigned char regs[32];
-  unsigned count = 0;
+  int count = 0;
   for (int reg = 0; reg <= 30; reg++) {
     if (1 & bitset)
       regs[count++] = reg;
@@ -1674,11 +1674,16 @@
   regs[count++] = zr->encoding_nocheck();
   count &= ~1;  // Only push an even nuber of regs

-  for (int i = count - 2; i >= 0; i-= 2) {
-    stp(as_Register(regs[i]), as_Register(regs[i+1]),
-	Address(pre(stack, -2 * wordSize)));
+  if (count) {
+    stp(as_Register(regs[0]), as_Register(regs[1]),
+	Address(pre(stack, -count * wordSize)));
     words_pushed += 2;
   }
+  for (int i = 2; i < count; i += 2) {
+    stp(as_Register(regs[i]), as_Register(regs[i+1]),
+	Address(stack, i * wordSize));
+    words_pushed += 2;
+  }

   return words_pushed;
 }
@@ -1688,7 +1693,7 @@

   // Scan bitset to accumulate register pairs
   unsigned char regs[32];
-  unsigned count = 0;
+  int count = 0;
   for (int reg = 0; reg <= 30; reg++) {
     if (1 & bitset)
       regs[count++] = reg;
@@ -1697,10 +1702,19 @@
   regs[count++] = zr->encoding_nocheck();
   count &= ~1;

-  for (unsigned i = 0; i < count; i+= 2) {
-    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
-	Address(post(stack, 2 * wordSize)));
-    words_pushed += 2;
+  if (count <= 4) {
+    for (int i = 0; i < count; i+= 2) {
+      ldp(as_Register(regs[i]), as_Register(regs[i+1]),
+	  Address(post(stack, 2 * wordSize)));
+      words_pushed += 2;
+    }
+  } else {
+    for (int i = 0; i < count; i+= 2) {
+      ldp(as_Register(regs[i]), as_Register(regs[i+1]),
+	  Address(stack, i * wordSize));
+      words_pushed += 2;
+    }
+    add(stack, stack, words_pushed * wordSize);
   }

   return words_pushed;


More information about the aarch64-port-dev mailing list