[aarch64-port-dev ] C1 Patch: fix tableswitch
Andrew Haley
aph at redhat.com
Mon Jul 22 05:17:36 PDT 2013
Running some tests reveals that we are generating execrable code for
tableswitch:
36: tableswitch low=0, high=255, default=6
0: 1076
1: 1423
2: 1606
...
[2052] 0x00007fffee58fb7c: cmp w4, #0x0
[2052] ;; 106 branch [EQ] [B4]
0x00007fffee58fb80: b.eq 0x00007fffee5a0cd0
[2052] 0x00007fffee58fb84: cmp w4, #0x1
[2052] ;; 110 branch [EQ] [B5]
0x00007fffee58fb88: b.eq 0x00007fffee5a0bd4
[2052] 0x00007fffee58fb8c: cmp w4, #0x2
[2052] ;; 114 branch [EQ] [B6]
0x00007fffee58fb90: b.eq 0x00007fffee5a0acc
... ad nauseam
It turns out that there is no really straightforward fix for this
because there is no representation of tableswitch in the IR. So, I
have written a peephole that runs after all of the data-flow and
control-flow analysis and turns this into:
0x00007fffee22e120: adr xscratch1, 0x00007fffee22e138
0x00007fffee22e124: sub wscratch2, w0, #0x0
0x00007fffee22e128: cmp wscratch2, #0x100
0x00007fffee22e12c: b.cs 0x00007fffee22e538
0x00007fffee22e130: add xscratch1, xscratch1, wscratch2, sxtw #2
0x00007fffee22e134: br xscratch1
;; branch [AL] [label:0xbc06fb28]
0x00007fffee22e138: b 0x00007fffee231e78
;; branch [AL] [label:0xbc06fd38]
0x00007fffee22e13c: b 0x00007fffee231e3c
;; branch [AL] [label:0xbc06ff48]
0x00007fffee22e140: b 0x00007fffee231df4
;; branch [AL] [label:0xbc070178]
0x00007fffee22e144: b 0x00007fffee231db0
;; branch [AL] [label:0xbc070388]
0x00007fffee22e148: b 0x00007fffee231d7c
Some curmudgeons might argue that this kind of thing shouldn't be
done in C1, but I would reply that this peephole is of the same complexity
as that of the delay slot recognizer on SPARC, and this code crops up a lot
in OpenJDK.
We can disable this when we get C2.
Andrew.
# HG changeset patch
# User aph
# Date 1374323576 -3600
# Node ID 2b48247088e4cdd3a0d176345054e8a853770b05
# Parent 88c01e23c81857f61ce87daad4864dd12c0474db
C1: Optimized tableswitch
diff -r 88c01e23c818 -r 2b48247088e4 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Fri Jul 19 12:41:02 2013 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp Sat Jul 20 13:32:56 2013 +0100
@@ -1810,7 +1810,12 @@
void LIR_Assembler::comp_op(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Op2* op) {
- if (opr1->is_single_cpu() || opr1->is_double_cpu()) {
+ if (opr1->is_constant() && opr2->is_single_cpu()) {
+ // tableswitch
+ Register reg = as_reg(opr2);
+ struct tableswitch &table = switches[opr1->as_constant_ptr()->as_jint()];
+ __ tableswitch(reg, table._first_key, table._last_key, table._branches, table._after);
+ } else if (opr1->is_single_cpu() || opr1->is_double_cpu()) {
Register reg1 = as_reg(opr1);
if (opr2->is_single_cpu()) {
// cpu register - cpu register
@@ -2799,7 +2804,131 @@
void LIR_Assembler::get_thread(LIR_Opr result_reg) { Unimplemented(); }
-void LIR_Assembler::peephole(LIR_List*) {
+// Peephole pass over the finished LIR: find the ladders of
+// "cmp reg, #k; beq Lk" over consecutive int keys that C1 emits for a
+// tableswitch bytecode, and collapse each ladder into a marker op plus
+// a compact table of unconditional branches. The marker (a lir_cmp
+// whose first operand is an int constant and whose second is the
+// switch register) is recognized in comp_op(), which emits
+// MacroAssembler::tableswitch() for it.
+void LIR_Assembler::peephole(LIR_List *lir) {
+ // Stop once the fixed-size per-assembler descriptor table is full.
+ if (tableswitch_count >= max_tableswitches)
+ return;
+
+ /*
+ This finite-state automaton recognizes sequences of compare-and-
+ branch instructions. We will turn them into a tableswitch. You
+ could argue that C1 really shouldn't be doing this sort of
+ optimization, but without it the code is really horrible.
+ */
+
+ // NOTE(review): cmp1_s is declared but never entered.
+ enum { start_s, cmp1_s, beq_s, cmp_s } state;
+ // -2147483648 acts as the "no run recorded" sentinel for last_key.
+ // NOTE(review): prefer min_jint -- the bare literal parses as
+ // -(2147483648), and 2147483648 does not fit in int.
+ int first_key, last_key = -2147483648;
+ int next_key = 0;
+ int start_insn = -1;
+ int last_insn = -1;
+ Register reg = noreg;
+ LIR_Opr reg_opr;
+ state = start_s;
+
+ LIR_OpList* inst = lir->instructions_list();
+ for (int i = 0; i < inst->length(); i++) {
+ LIR_Op* op = inst->at(i);
+ switch (state) {
+ case start_s:
+ // Looking for the first "cmp reg, #int-constant" of a ladder.
+ first_key = -1;
+ start_insn = i;
+ switch (op->code()) {
+ case lir_cmp:
+ // NOTE(review): declarations directly after a case label with no
+ // braces; the equivalent case in the beq_s state below is braced
+ // -- make these consistent.
+ LIR_Opr opr1 = op->as_Op2()->in_opr1();
+ LIR_Opr opr2 = op->as_Op2()->in_opr2();
+ if (opr1->is_cpu_register() && opr1->is_single_cpu()
+ && opr2->is_constant()
+ && opr2->type() == T_INT) {
+ reg_opr = opr1;
+ reg = opr1->as_register();
+ first_key = opr2->as_constant_ptr()->as_jint();
+ next_key = first_key + 1;
+ state = cmp_s;
+ goto next_state;
+ }
+ break;
+ }
+ break;
+ case cmp_s:
+ // Just matched "cmp reg, #key"; the ladder continues only with a
+ // branch-if-equal.
+ switch (op->code()) {
+ case lir_branch:
+ if (op->as_OpBranch()->cond() == lir_cond_equal) {
+ state = beq_s;
+ // last_insn tracks the branch of the latest matched pair.
+ last_insn = i;
+ goto next_state;
+ }
+ }
+ state = start_s;
+ break;
+ case beq_s:
+ // After a beq: the ladder continues only with a compare of the
+ // same register against the next consecutive key.
+ switch (op->code()) {
+ case lir_cmp: {
+ LIR_Opr opr1 = op->as_Op2()->in_opr1();
+ LIR_Opr opr2 = op->as_Op2()->in_opr2();
+ if (opr1->is_cpu_register() && opr1->is_single_cpu()
+ && opr1->as_register() == reg
+ && opr2->is_constant()
+ && opr2->type() == T_INT
+ && opr2->as_constant_ptr()->as_jint() == next_key) {
+ last_key = next_key;
+ next_key++;
+ state = cmp_s;
+ goto next_state;
+ }
+ }
+ }
+ // Run ended: the pairs matched cover keys [first_key, next_key),
+ // so record the exclusive upper bound before rewriting below.
+ last_key = next_key;
+ state = start_s;
+ break;
+ default:
+ assert(false, "impossible state");
+ }
+ if (state == start_s) {
+ // Only rewrite runs of more than five keys. The 5L forces long
+ // arithmetic so the INT_MIN sentinel in last_key cannot wrap.
+ if (first_key < last_key - 5L && reg != noreg) {
+ {
+ // printf("found run register %d starting at insn %d low value %d high value %d\n",
+ // reg->encoding(),
+ // start_insn, first_key, last_key);
+ // for (int i = 0; i < inst->length(); i++) {
+ // inst->at(i)->print();
+ // tty->print("\n");
+ // }
+ // tty->print("\n");
+ }
+
+ // Record the run; the _after/_branches labels in the descriptor
+ // are bound by the LIR_OpLabels inserted below and consumed when
+ // comp_op() meets the marker op.
+ struct tableswitch *sw = &switches[tableswitch_count];
+ sw->_insn_index = start_insn, sw->_first_key = first_key,
+ sw->_last_key = last_key, sw->_reg = reg;
+ inst->insert_before(last_insn + 1, new LIR_OpLabel(&sw->_after));
+ {
+ // Insert the new table of branches
+ // Each insert_before() at the same index pushes earlier inserts
+ // later, while offset walks backwards through the old beq ops
+ // (two LIR ops per key), so the finished table comes out in
+ // ascending key order. i is bumped past every inserted op.
+ int offset = last_insn;
+ for (int n = first_key; n < last_key; n++) {
+ inst->insert_before
+ (last_insn + 1,
+ new LIR_OpBranch(lir_cond_always, T_ILLEGAL,
+ inst->at(offset)->as_OpBranch()->label()));
+ offset -= 2, i++;
+ }
+ }
+ // Delete all the old compare-and-branch instructions
+ // (one cmp plus one branch per key in the run).
+ for (int n = first_key; n < last_key; n++) {
+ inst->remove_at(start_insn);
+ inst->remove_at(start_insn);
+ }
+ // Insert the tableswitch instruction
+ // Marker op: the descriptor index as a constant opr1 and the
+ // switch register as opr2 -- the shape comp_op() special-cases.
+ inst->insert_before(start_insn,
+ new LIR_Op2(lir_cmp, lir_cond_always,
+ LIR_OprFact::intConst(tableswitch_count),
+ reg_opr));
+ inst->insert_before(start_insn + 1, new LIR_OpLabel(&sw->_branches));
+ tableswitch_count++;
+ }
+ // Reset the sentinels for the next potential run.
+ reg = noreg;
+ last_key = -2147483648;
+ }
+ next_state:
+ ;
+ }
}
void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp) { Unimplemented(); }
diff -r 88c01e23c818 -r 2b48247088e4 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.hpp Fri Jul 19 12:41:02 2013 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.hpp Sat Jul 20 13:32:56 2013 +0100
@@ -58,6 +58,12 @@
void poll_for_safepoint(relocInfo::relocType rtype, CodeEmitInfo* info = NULL);
+ // Fixed-size pool of tableswitch descriptors collected by peephole();
+ // a descriptor is looked up by the int constant carried in the marker
+ // op when comp_op() emits the actual table jump.
+ static const int max_tableswitches = 20;
+ struct tableswitch switches[max_tableswitches];
+ // Number of entries of switches[] in use for the current compile.
+ int tableswitch_count;
+
+ // Target-dependent initialization; resets the descriptor count.
+ void init() { tableswitch_count = 0; }
+
public:
void store_parameter(Register r, int offset_from_esp_in_words);
diff -r 88c01e23c818 -r 2b48247088e4 src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Fri Jul 19 12:41:02 2013 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Sat Jul 20 13:32:56 2013 +0100
@@ -1218,6 +1218,16 @@
void repne_scanw(Register addr, Register value, Register count,
Register scratch);
+ // Emit a bounds-checked computed jump for keys in [lowbound,
+ // highbound): rebase the key by lowbound, then compare unsigned so a
+ // single HS test rejects both keys below lowbound (which wrap to
+ // large unsigned values) and keys >= highbound; misses branch to
+ // jumptable_end. In-range keys jump into a table of 4-byte branch
+ // instructions at `jumptable` -- hence the sxtw shift-by-2 scaling.
+ // Clobbers rscratch1 and rscratch2.
+ // NOTE(review): adr has a +/-1MB PC-relative reach; the peephole
+ // emits the table right after this sequence so that holds here --
+ // confirm before reusing with a distant jumptable.
+ void tableswitch(Register index, jint lowbound, jint highbound,
+ Label &jumptable, Label &jumptable_end) {
+ adr(rscratch1, jumptable);
+ subw(rscratch2, index, lowbound);
+ cmpw(rscratch2, highbound - lowbound);
+ br(Assembler::HS, jumptable_end);
+ add(rscratch1, rscratch1, rscratch2, ext::sxtw, 2);
+ br(rscratch1);
+ }
+
// Prolog generator routines to support switch between x86 code and
// generated ARM code
@@ -1306,4 +1316,11 @@
~SkipIfEqual();
};
+// Descriptor for one compare/branch ladder that LIR_Assembler::peephole()
+// turned into a table jump; consumed when comp_op() meets the marker op.
+struct tableswitch {
+ Register _reg; // register holding the switch key
+ // _first_key inclusive, _last_key exclusive; _insn_index records
+ // where the ladder started in the LIR instruction list.
+ int _insn_index; jint _first_key; jint _last_key;
+ Label _after; // bound just past the branch table; range-miss target
+ Label _branches; // bound at the first branch-table entry
+};
+
#endif // CPU_AARCH64_VM_MACROASSEMBLER_AARCH64_HPP
diff -r 88c01e23c818 -r 2b48247088e4 src/share/vm/c1/c1_LIRAssembler.cpp
--- a/src/share/vm/c1/c1_LIRAssembler.cpp Fri Jul 19 12:41:02 2013 +0100
+++ b/src/share/vm/c1/c1_LIRAssembler.cpp Sat Jul 20 13:32:56 2013 +0100
@@ -117,6 +117,9 @@
, _pending_non_safepoint_offset(0)
{
_slow_case_stubs = new CodeStubList();
+#ifdef TARGET_ARCH_aarch64
+ init(); // Target-dependent initialization
+#endif
}
More information about the aarch64-port-dev
mailing list