[aarch64-port-dev ] C1 Patch: fix tableswitch

Andrew Haley aph at redhat.com
Mon Jul 22 05:17:36 PDT 2013


Running some tests reveals that we are generating execrable code for
tableswitch:

 36: tableswitch low=0, high=255, default=6
         0: 1076
         1: 1423
         2: 1606
         ...

[2052]   0x00007fffee58fb7c: cmp	w4, #0x0
[2052]   ;;  106 branch [EQ] [B4]
  0x00007fffee58fb80: b.eq	0x00007fffee5a0cd0
[2052]   0x00007fffee58fb84: cmp	w4, #0x1
[2052]   ;;  110 branch [EQ] [B5]
  0x00007fffee58fb88: b.eq	0x00007fffee5a0bd4
[2052]   0x00007fffee58fb8c: cmp	w4, #0x2
[2052]   ;;  114 branch [EQ] [B6]
  0x00007fffee58fb90: b.eq	0x00007fffee5a0acc
... ad nauseam

It turns out that there is no really straightforward fix for this
because there is no representation of tableswitch in the IR.  So, I
have written a peephole that runs after all of the data-flow and
control-flow analysis and turns this into:

  0x00007fffee22e120: adr	xscratch1, 0x00007fffee22e138
  0x00007fffee22e124: sub	wscratch2, w0, #0x0
  0x00007fffee22e128: cmp	wscratch2, #0x100
  0x00007fffee22e12c: b.cs	0x00007fffee22e538
  0x00007fffee22e130: add	xscratch1, xscratch1, wscratch2, sxtw #2
  0x00007fffee22e134: br	xscratch1
  ;;      branch [AL] [label:0xbc06fb28]
  0x00007fffee22e138: b	0x00007fffee231e78
  ;;      branch [AL] [label:0xbc06fd38]
  0x00007fffee22e13c: b	0x00007fffee231e3c
  ;;      branch [AL] [label:0xbc06ff48]
  0x00007fffee22e140: b	0x00007fffee231df4
  ;;      branch [AL] [label:0xbc070178]
  0x00007fffee22e144: b	0x00007fffee231db0
  ;;      branch [AL] [label:0xbc070388]
  0x00007fffee22e148: b	0x00007fffee231d7c

Some curmudgeons might argue that this kind of thing shouldn't be
done in C1, but I would reply that this peephole is of the same complexity
as that of the delay slot recognizer on SPARC, and this code crops up a lot
in OpenJDK.

We can disable this when we get C2.

Andrew.


# HG changeset patch
# User aph
# Date 1374323576 -3600
# Node ID 2b48247088e4cdd3a0d176345054e8a853770b05
# Parent  88c01e23c81857f61ce87daad4864dd12c0474db
C1: Optimized tableswitch

diff -r 88c01e23c818 -r 2b48247088e4 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Fri Jul 19 12:41:02 2013 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Sat Jul 20 13:32:56 2013 +0100
@@ -1810,7 +1810,12 @@


 void LIR_Assembler::comp_op(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Op2* op) {
-  if (opr1->is_single_cpu() || opr1->is_double_cpu()) {
+  if (opr1->is_constant() && opr2->is_single_cpu()) {
+    // tableswitch
+    Register reg = as_reg(opr2);
+    struct tableswitch &table = switches[opr1->as_constant_ptr()->as_jint()];
+    __ tableswitch(reg, table._first_key, table._last_key, table._branches, table._after);
+  } else if (opr1->is_single_cpu() || opr1->is_double_cpu()) {
     Register reg1 = as_reg(opr1);
     if (opr2->is_single_cpu()) {
       // cpu register - cpu register
@@ -2799,7 +2804,131 @@
 void LIR_Assembler::get_thread(LIR_Opr result_reg) { Unimplemented(); }


-void LIR_Assembler::peephole(LIR_List*) {
+void LIR_Assembler::peephole(LIR_List *lir) {
+  if (tableswitch_count >= max_tableswitches)
+    return;
+
+  /*
+    This finite-state automaton recognizes sequences of compare-and-
+    branch instructions.  We will turn them into a tableswitch.  You
+    could argue that C1 really shouldn't be doing this sort of
+    optimization, but without it the code is really horrible.
+  */
+
+  enum { start_s, cmp1_s, beq_s, cmp_s } state;
+  int first_key, last_key = -2147483648;
+  int next_key = 0;
+  int start_insn = -1;
+  int last_insn = -1;
+  Register reg = noreg;
+  LIR_Opr reg_opr;
+  state = start_s;
+
+  LIR_OpList* inst = lir->instructions_list();
+  for (int i = 0; i < inst->length(); i++) {
+    LIR_Op* op = inst->at(i);
+    switch (state) {
+    case start_s:
+      first_key = -1;
+      start_insn = i;
+      switch (op->code()) {
+      case lir_cmp:
+	LIR_Opr opr1 = op->as_Op2()->in_opr1();
+	LIR_Opr opr2 = op->as_Op2()->in_opr2();
+	if (opr1->is_cpu_register() && opr1->is_single_cpu()
+	    && opr2->is_constant()
+	    && opr2->type() == T_INT) {
+	  reg_opr = opr1;
+	  reg = opr1->as_register();
+	  first_key = opr2->as_constant_ptr()->as_jint();
+	  next_key = first_key + 1;
+	  state = cmp_s;
+	  goto next_state;
+	}
+	break;
+      }
+      break;
+    case cmp_s:
+      switch (op->code()) {
+      case lir_branch:
+	if (op->as_OpBranch()->cond() == lir_cond_equal) {
+	  state = beq_s;
+	  last_insn = i;
+	  goto next_state;
+	}
+      }
+      state = start_s;
+      break;
+    case beq_s:
+      switch (op->code()) {
+      case lir_cmp: {
+	LIR_Opr opr1 = op->as_Op2()->in_opr1();
+	LIR_Opr opr2 = op->as_Op2()->in_opr2();
+	if (opr1->is_cpu_register() && opr1->is_single_cpu()
+	    && opr1->as_register() == reg
+	    && opr2->is_constant()
+	    && opr2->type() == T_INT
+	    && opr2->as_constant_ptr()->as_jint() == next_key) {
+	  last_key = next_key;
+	  next_key++;
+	  state = cmp_s;
+	  goto next_state;
+	}
+      }
+      }
+      last_key = next_key;
+      state = start_s;
+      break;
+    default:
+      assert(false, "impossible state");
+    }
+    if (state == start_s) {
+      if (first_key < last_key - 5L && reg != noreg) {
+	{
+	  // printf("found run register %d starting at insn %d low value %d high value %d\n",
+	  //        reg->encoding(),
+	  //        start_insn, first_key, last_key);
+	  //   for (int i = 0; i < inst->length(); i++) {
+	  //     inst->at(i)->print();
+	  //     tty->print("\n");
+	  //   }
+	  //   tty->print("\n");
+	}
+
+	struct tableswitch *sw = &switches[tableswitch_count];
+	sw->_insn_index = start_insn, sw->_first_key = first_key,
+	  sw->_last_key = last_key, sw->_reg = reg;
+	inst->insert_before(last_insn + 1, new LIR_OpLabel(&sw->_after));
+	{
+	  // Insert the new table of branches
+	  int offset = last_insn;
+	  for (int n = first_key; n < last_key; n++) {
+	    inst->insert_before
+	      (last_insn + 1,
+	       new LIR_OpBranch(lir_cond_always, T_ILLEGAL,
+				inst->at(offset)->as_OpBranch()->label()));
+	    offset -= 2, i++;
+	  }
+	}
+	// Delete all the old compare-and-branch instructions
+	for (int n = first_key; n < last_key; n++) {
+	  inst->remove_at(start_insn);
+	  inst->remove_at(start_insn);
+	}
+	// Insert the tableswitch instruction
+	inst->insert_before(start_insn,
+			    new LIR_Op2(lir_cmp, lir_cond_always,
+					LIR_OprFact::intConst(tableswitch_count),
+					reg_opr));
+	inst->insert_before(start_insn + 1, new LIR_OpLabel(&sw->_branches));
+	tableswitch_count++;
+      }
+      reg = noreg;
+      last_key = -2147483648;
+    }
+  next_state:
+    ;
+  }
 }

 void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp) { Unimplemented(); }
diff -r 88c01e23c818 -r 2b48247088e4 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.hpp	Fri Jul 19 12:41:02 2013 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.hpp	Sat Jul 20 13:32:56 2013 +0100
@@ -58,6 +58,12 @@

   void poll_for_safepoint(relocInfo::relocType rtype, CodeEmitInfo* info = NULL);

+  static const int max_tableswitches = 20;
+  struct tableswitch switches[max_tableswitches];
+  int tableswitch_count;
+
+  void init() { tableswitch_count = 0; }
+
 public:

   void store_parameter(Register r, int offset_from_esp_in_words);
diff -r 88c01e23c818 -r 2b48247088e4 src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Fri Jul 19 12:41:02 2013 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Sat Jul 20 13:32:56 2013 +0100
@@ -1218,6 +1218,16 @@
   void repne_scanw(Register addr, Register value, Register count,
 		   Register scratch);

+  void tableswitch(Register index, jint lowbound, jint highbound,
+		   Label &jumptable, Label &jumptable_end) {
+    adr(rscratch1, jumptable);
+    subw(rscratch2, index, lowbound);
+    cmpw(rscratch2, highbound - lowbound);
+    br(Assembler::HS, jumptable_end);
+    add(rscratch1, rscratch1, rscratch2, ext::sxtw, 2);
+    br(rscratch1);
+  }
+
   // Prolog generator routines to support switch between x86 code and
   // generated ARM code

@@ -1306,4 +1316,11 @@
    ~SkipIfEqual();
 };

+struct tableswitch {
+  Register _reg;
+  int _insn_index; jint _first_key; jint _last_key;
+  Label _after;
+  Label _branches;
+};
+
 #endif // CPU_AARCH64_VM_MACROASSEMBLER_AARCH64_HPP
diff -r 88c01e23c818 -r 2b48247088e4 src/share/vm/c1/c1_LIRAssembler.cpp
--- a/src/share/vm/c1/c1_LIRAssembler.cpp	Fri Jul 19 12:41:02 2013 +0100
+++ b/src/share/vm/c1/c1_LIRAssembler.cpp	Sat Jul 20 13:32:56 2013 +0100
@@ -117,6 +117,9 @@
  , _pending_non_safepoint_offset(0)
 {
   _slow_case_stubs = new CodeStubList();
+#ifdef TARGET_ARCH_aarch64
+  init(); // Target-dependent initialization
+#endif
 }





More information about the aarch64-port-dev mailing list