[aarch64-port-dev] Optimize pushes and pops
Andrew Haley
aph at redhat.com
Mon May 12 16:33:10 UTC 2014
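This patch converts the old sequences of pre-decremented stores and
post-incremented loads into stores and loads at fixed offsets from the
stack pointer, without writeback. This avoids pipeline stalls caused by
the address dependency that each writeback creates for the next access.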
After this patch, the code generated for a multi-register push is:
0x00007fffd10d1ff8: stp x1, x2, [sp,#-48]!
0x00007fffd10d1ffc: stp x3, x4, [sp,#16]
0x00007fffd10d2000: stp x5, x6, [sp,#32]
and the code generated for pops is:
0x00007fffd10d202c: ldp xscratch1, xmethod, [sp],#16
0x00007fffd10d2030: ldp x1, x2, [sp]
0x00007fffd10d2034: ldp x3, x4, [sp,#16]
0x00007fffd10d2038: ldp x5, x6, [sp,#32]
0x00007fffd10d203c: add sp, sp, #0x30
Andrew.
# HG changeset patch
# User aph
# Date 1399908399 -3600
# Mon May 12 16:26:39 2014 +0100
# Node ID 3852a506a19bb79e0a77d8474978f09484fc3fed
# Parent ac30fdebd5f5811d768d493d58d40852cff0886c
Tidy up stack frame handling.
diff -r ac30fdebd5f5 -r 3852a506a19b src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Mon May 12 14:34:00 2014 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad Mon May 12 16:26:39 2014 +0100
@@ -911,19 +911,16 @@
if (C->need_stack_bang(framesize))
__ generate_stack_overflow_check(framesize);
- // push lr and rfp to create a frame
- __ stp(rfp, lr, Address(__ pre(sp, -2 * wordSize)));
-
- // allow for already pushed values
- framesize -= 2 * wordSize;
-
- if (framesize) {
- if (Assembler::operand_valid_for_add_sub_immediate(framesize)) {
- __ sub(sp, sp, framesize);
- } else {
- __ mov(rscratch1, framesize);
- __ sub(sp, sp, rscratch1);
- }
+ if (framesize == 0) {
+ // Is this even possible?
+ __ stp(rfp, lr, Address(__ pre(sp, -2 * wordSize)));
+ } else if (framesize < (1 << 12)) {
+ __ sub(sp, sp, framesize);
+ __ stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
+ } else {
+ __ stp(rfp, lr, Address(__ pre(sp, -2 * wordSize)));
+ __ mov(rscratch1, framesize - 2 * wordSize);
+ __ sub(sp, sp, rscratch1);
}
if (NotifySimulator) {
@@ -993,19 +990,17 @@
MacroAssembler _masm(&cbuf);
int framesize = C->frame_slots() << LogBytesPerInt;
- framesize -= 2 * wordSize;
-
- if (framesize) {
- if (Assembler::operand_valid_for_add_sub_immediate(framesize)) {
- __ add(sp, sp, framesize);
- } else {
- __ mov(rscratch1, framesize);
- __ add(sp, sp, rscratch1);
- }
+ if (framesize == 0) {
+ __ ldp(rfp, lr, Address(__ post(sp, 2 * wordSize)));
+ } else if (framesize < (1 << 12)) {
+ __ ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
+ __ add(sp, sp, framesize);
+ } else {
+ __ mov(rscratch1, framesize - 2 * wordSize);
+ __ add(sp, sp, rscratch1);
+ __ ldp(rfp, lr, Address(__ post(sp, 2 * wordSize)));
}
- __ ldp(rfp, lr, Address(__ post(sp, 2 * wordSize)));
-
if (NotifySimulator) {
__ notify(Assembler::method_reentry);
}
diff -r ac30fdebd5f5 -r 3852a506a19b src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon May 12 14:34:00 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon May 12 16:26:39 2014 +0100
@@ -1665,7 +1665,7 @@
// Scan bitset to accumulate register pairs
unsigned char regs[32];
- unsigned count = 0;
+ int count = 0;
for (int reg = 0; reg <= 30; reg++) {
if (1 & bitset)
regs[count++] = reg;
@@ -1674,11 +1674,16 @@
regs[count++] = zr->encoding_nocheck();
count &= ~1; // Only push an even nuber of regs
- for (int i = count - 2; i >= 0; i-= 2) {
- stp(as_Register(regs[i]), as_Register(regs[i+1]),
- Address(pre(stack, -2 * wordSize)));
+ if (count) {
+ stp(as_Register(regs[0]), as_Register(regs[1]),
+ Address(pre(stack, -count * wordSize)));
words_pushed += 2;
}
+ for (int i = 2; i < count; i += 2) {
+ stp(as_Register(regs[i]), as_Register(regs[i+1]),
+ Address(stack, i * wordSize));
+ words_pushed += 2;
+ }
return words_pushed;
}
@@ -1688,7 +1693,7 @@
// Scan bitset to accumulate register pairs
unsigned char regs[32];
- unsigned count = 0;
+ int count = 0;
for (int reg = 0; reg <= 30; reg++) {
if (1 & bitset)
regs[count++] = reg;
@@ -1697,10 +1702,19 @@
regs[count++] = zr->encoding_nocheck();
count &= ~1;
- for (unsigned i = 0; i < count; i+= 2) {
- ldp(as_Register(regs[i]), as_Register(regs[i+1]),
- Address(post(stack, 2 * wordSize)));
- words_pushed += 2;
+ if (count <= 4) {
+ for (int i = 0; i < count; i+= 2) {
+ ldp(as_Register(regs[i]), as_Register(regs[i+1]),
+ Address(post(stack, 2 * wordSize)));
+ words_pushed += 2;
+ }
+ } else {
+ for (int i = 0; i < count; i+= 2) {
+ ldp(as_Register(regs[i]), as_Register(regs[i+1]),
+ Address(stack, i * wordSize));
+ words_pushed += 2;
+ }
+ add(stack, stack, words_pushed * wordSize);
}
return words_pushed;
More information about the aarch64-port-dev
mailing list