[aarch64-port-dev ] RFR: Add support for encode_iso_array

Mon Aug 4 17:15:04 UTC 2014

Hi,

The following patch adds support for encode_iso_array.

I have kept the patch simple to keep it small and optimised it for the case that there are no top byte set characters.

One problem is that I could not see a way of persuading C2 register allocater to allocate the 4 temp V regs needed as Vn, Vn+1, VN+2, Vn+3 as is required for the ld1 {...}, [..] instruction.

Instead I have simply bound them to V0-V3. I do not think this will actually make any difference to performance as the outer Java loop (encodeArrayLoop) has no FP usage anyway.

Benchmarking it shows between 1.8x and 2.1x performance improvement.

All the best,
Ed.

--- CUT HERE ---
# HG changeset patch
# User Edward Nevill edward.nevill at linaro.org
# Date 1407171833 -3600
#      Mon Aug 04 18:03:53 2014 +0100
# Node ID 1a507fdf6de65e387fe15c382a31f3353878b820
# Parent  91a0340a6eb3b0d0bc992b4b989455145cdd8eb9
Add encode_iso_array intrinsic

diff -r 91a0340a6eb3 -r 1a507fdf6de6 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad	Mon Aug 04 12:00:05 2014 -0400
+++ b/src/cpu/aarch64/vm/aarch64.ad	Mon Aug 04 18:03:53 2014 +0100
@@ -381,6 +381,9 @@
 // Singleton class for R2 int register
 reg_class int_r2_reg(R2);
 
+// Singleton class for R3 int register
+reg_class int_r3_reg(R3);
+
 // Singleton class for R4 int register
 reg_class int_r4_reg(R4);
 
@@ -698,6 +701,26 @@
     V31, V31_H
 );
 
+// Class for 128 bit register v0
+reg_class v0_reg(
+    V0, V0_H
+);
+
+// Class for 128 bit register v1
+reg_class v1_reg(
+    V1, V1_H
+);
+
+// Class for 128 bit register v2
+reg_class v2_reg(
+    V2, V2_H
+);
+
+// Class for 128 bit register v3
+reg_class v3_reg(
+    V3, V3_H
+);
+
 // Singleton class for condition codes
 reg_class int_flags(RFLAGS);
 
@@ -4116,6 +4139,18 @@
   interface(REG_INTER);
 %}
 
+// Register R3 only
+operand iRegI_R3()
+%{
+  constraint(ALLOC_IN_RC(int_r3_reg));
+  match(RegI);
+  match(iRegINoSp);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+
 // Register R2 only
 operand iRegI_R4()
 %{
@@ -4185,6 +4220,42 @@
   interface(REG_INTER);
 %}
 
+operand vRegD_V0()
+%{
+  constraint(ALLOC_IN_RC(v0_reg));
+  match(RegD);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vRegD_V1()
+%{
+  constraint(ALLOC_IN_RC(v1_reg));
+  match(RegD);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vRegD_V2()
+%{
+  constraint(ALLOC_IN_RC(v2_reg));
+  match(RegD);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vRegD_V3()
+%{
+  constraint(ALLOC_IN_RC(v3_reg));
+  match(RegD);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 // Flags register, used as output of signed compare instructions
 
 // note that on AArch64 we also use this register as the output for
@@ -11691,6 +11762,25 @@
   ins_pipe(pipe_class_memory);
 %}
 
+// encode char[] to byte[] in ISO_8859_1
+instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
+                          vRegD_V0 Vtmp1, vRegD_V1 Vtmp2,
+                          vRegD_V2 Vtmp3, vRegD_V3 Vtmp4,
+                          iRegI_R0 result, rFlagsReg cr)
+%{
+  match(Set result (EncodeISOArray src (Binary dst len)));
+  effect(USE_KILL src, USE_KILL dst, USE_KILL len,
+         KILL Vtmp1, KILL Vtmp2, KILL Vtmp3, KILL Vtmp4, KILL cr);
+
+  format %{ "Encode array $src,$dst,$len -> $result" %}
+  ins_encode %{
+    __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register,
+         $result$$Register, $Vtmp1$$FloatRegister,  $Vtmp2$$FloatRegister,
+         $Vtmp3$$FloatRegister,  $Vtmp4$$FloatRegister);
+  %}
+  ins_pipe( pipe_class_memory );
+%}
+
 // ============================================================================
 // This name is KNOWN by the ADLC and cannot be changed.
 // The ADLC forces a 'TypeRawPtr::BOTTOM' output type
diff -r 91a0340a6eb3 -r 1a507fdf6de6 src/cpu/aarch64/vm/assembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Aug 04 12:00:05 2014 -0400
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Aug 04 18:03:53 2014 +0100
@@ -2073,6 +2073,15 @@
     pmull(Vd, Ta, Vn, Vm, Tb);
   }
 
+  void uqxtn(FloatRegister Vd, SIMD_Arrangement Tb, FloatRegister Vn, SIMD_Arrangement Ta) {
+    starti;
+    int size_b = (int)Tb >> 1;
+    int size_a = (int)Ta >> 1;
+    assert(size_b < 3 && size_b == size_a - 1, "Invalid size specifier");
+    f(0, 31), f(Tb & 1, 30), f(0b101110, 29, 24), f(size_b, 23, 22);
+    f(0b100001010010, 21, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
   void rev32(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn)
   {
     starti;
diff -r 91a0340a6eb3 -r 1a507fdf6de6 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Aug 04 12:00:05 2014 -0400
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Aug 04 18:03:53 2014 +0100
@@ -3613,3 +3613,64 @@
   
   BLOCK_COMMENT("} char_arrays_equals");
 }
+
+// encode char[] to byte[] in ISO_8859_1
+void MacroAssembler::encode_iso_array(Register src, Register dst,
+		      Register len, Register result,
+		      FloatRegister Vtmp1, FloatRegister Vtmp2,
+                      FloatRegister Vtmp3, FloatRegister Vtmp4)
+{
+    Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
+    Register tmp1 = rscratch1;
+
+      mov(result, len);	// Save initial len
+
+      subs(len, len, 32);
+      br(LT, LOOP_8);
+
+// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
+// to convert chars to bytes. These set the 'QC' bit in the FPSR if
+// any char could not fit in a byte, so clear the FPSR so we can test it.
+      clear_fpsr();
+
+    BIND(NEXT_32);
+      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
+      uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
+      uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
+      uqxtn(Vtmp2, T8B, Vtmp3, T8H);
+      uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
+      get_fpsr(tmp1);
+      cbnzw(tmp1, LOOP_8);
+      st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
+      subs(len, len, 32);
+      add(src, src, 64);
+      br(GE, NEXT_32);
+
+    BIND(LOOP_8);
+      adds(len, len, 32-8);
+      br(LT, LOOP_1);
+      clear_fpsr(); // QC may be set from loop above, clear again
+    BIND(NEXT_8);
+      ld1(Vtmp1, T8H, src);
+      uqxtn(Vtmp1, T8B, Vtmp1, T8H);
+      get_fpsr(tmp1);
+      cbnzw(tmp1, LOOP_1);
+      st1(Vtmp1, T8B, post(dst, 8));
+      subs(len, len, 8);
+      add(src, src, 16);
+      br(GE, NEXT_8);
+
+    BIND(LOOP_1);
+      adds(len, len, 8);
+      br(LE, DONE);
+    BIND(NEXT_1);
+      ldrh(tmp1, Address(post(src, 2)));
+      tst(tmp1, 0xff00);
+      br(NE, DONE);
+      strb(tmp1, Address(post(dst, 1)));
+      subs(len, len, 1);
+      br(GT, NEXT_1);
+
+    BIND(DONE);
+      sub(result, result, len); // Return index where we stopped
+}
diff -r 91a0340a6eb3 -r 1a507fdf6de6 src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Aug 04 12:00:05 2014 -0400
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Aug 04 18:03:53 2014 +0100
@@ -1087,6 +1087,10 @@
 		     Register tmp1);
   void char_arrays_equals(Register ary1, Register ary2,
                           Register result, Register tmp1);
+  void encode_iso_array(Register src, Register dst,
+                        Register len, Register result,
+                        FloatRegister Vtmp1, FloatRegister Vtmp2,
+                        FloatRegister Vtmp3, FloatRegister Vtmp4);
 };
 
 // Used by aarch64.ad to control code generation
--- CUT HERE ---