[aarch64-port-dev ] HasNegatives

Stuart Monteith stuart.monteith at linaro.org
Wed Jul 19 22:57:31 UTC 2017


Hello,
  Please find below my latest version of the has negatives implementation.
It passes JTregs, and the HasNegatives regression test looks adequate
for finding errors.
I understand the Bellsoft guys have a patch in the works too - so
it'll be best to compare their version once it is uploaded, and we can
perhaps compare how they run.

The results I get in JMH are:

Platform A:

Benchmark                       (length)  Mode  Cnt       Score     Error  Units
HasNegatives.loopingFastMethod         4  avgt   50    3823.895 ±   0.264  ns/op
HasNegatives.loopingFastMethod        31  avgt   50    7977.361 ± 141.724  ns/op
HasNegatives.loopingFastMethod        65  avgt   50   12303.588 ± 100.645  ns/op
HasNegatives.loopingFastMethod       101  avgt   50   14464.835 ± 126.982  ns/op
HasNegatives.loopingFastMethod       256  avgt   50   38142.723 ±   3.266  ns/op
HasNegatives.steamFastMethod           4  avgt   50    6208.206 ±   0.401  ns/op
HasNegatives.steamFastMethod          31  avgt   50   23370.868 ±   1.337  ns/op
HasNegatives.steamFastMethod          65  avgt   50   52450.499 ±   6.449  ns/op
HasNegatives.steamFastMethod         101  avgt   50   61013.218 ±  73.249  ns/op
HasNegatives.steamFastMethod         256  avgt   50  159738.530 ±  12.301  ns/op

Platform B:

Benchmark                       (length)  Mode  Cnt       Score
Error  Units
HasNegatives.loopingFastMethod         4  avgt   50    6509.262 ±
0.058  ns/op
HasNegatives.loopingFastMethod        31  avgt   50   19014.390 ±
0.209  ns/op
HasNegatives.loopingFastMethod        65  avgt   50   26918.376 ±
150.078  ns/op
HasNegatives.loopingFastMethod       101  avgt   50   33021.181 ±
0.306  ns/op
HasNegatives.loopingFastMethod       256  avgt   50   58033.081 ±
0.349  ns/op
HasNegatives.steamFastMethod           4  avgt   50   11511.223 ±
0.142  ns/op
HasNegatives.steamFastMethod          31  avgt   50   56736.881 ±
419.531  ns/op
HasNegatives.steamFastMethod          65  avgt   50   77044.199 ±
0.720  ns/op
HasNegatives.steamFastMethod         101  avgt   50  119524.822 ±
1042.359  ns/op
HasNegatives.steamFastMethod         256  avgt   50  276267.518 ±
2.961  ns/op

Platform C:

Benchmark                       (length)  Mode  Cnt       Score     Error  Units
HasNegatives.loopingFastMethod         4  avgt   50    5013.519 ±   0.229  ns/op
HasNegatives.loopingFastMethod        31  avgt   50    9186.669 ±   0.482  ns/op
HasNegatives.loopingFastMethod        65  avgt   50   12938.151 ±   0.558  ns/op
HasNegatives.loopingFastMethod       101  avgt   50   15689.530 ± 101.980  ns/op
HasNegatives.loopingFastMethod       256  avgt   50   26903.405 ± 104.391  ns/op
HasNegatives.steamFastMethod           4  avgt   50    9185.878 ±   0.326  ns/op
HasNegatives.steamFastMethod          31  avgt   50   45037.950 ±   1.993  ns/op
HasNegatives.steamFastMethod          65  avgt   50   87140.886 ±   4.777  ns/op
HasNegatives.steamFastMethod         101  avgt   50  119742.041 ±  71.852  ns/op
HasNegatives.steamFastMethod         256  avgt   50  291104.970 ±  18.563  ns/op

The patch:


# HG changeset patch
# User smonteith
# Date 1499083520 -3600
#      Mon Jul 03 13:05:20 2017 +0100
# Node ID f138dbd8e5da70b4b4a1b384d6cae7954ab91f38
# Parent  38018e6e25b3180591c01d80c623f60c17009a9e
[mq]: has_neg

diff -r 38018e6e25b3 -r f138dbd8e5da src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Thu Jul 13 01:28:24 2017 +0000
+++ b/src/cpu/aarch64/vm/aarch64.ad Mon Jul 03 13:05:20 2017 +0100
@@ -15747,6 +15747,20 @@
   ins_pipe(pipe_class_memory);
 %}

+// fast detection of a negative byte in byte[]
+instruct has_negatives(iRegP_R1 array, iRegI_R2 len, iRegI_R0 result,
iRegI tmp1,
+  iRegI tmp2, iRegI tmp3, iRegI tmp4, rFlagsReg cr)
+%{
+  match(Set result (HasNegatives array len));
+  effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL array,
USE_KILL len, KILL cr);
+
+  format %{ "has negatives byte[] $array,$len -> $result    // KILL
$array, $len" %}
+  ins_encode %{
+    __ has_negatives($array$$Register, $len$$Register, $result$$Register,
+        $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, $tmp4$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}

 // fast char[] to byte[] compression
 instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
diff -r 38018e6e25b3 -r f138dbd8e5da
src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Jul 13
01:28:24 2017 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon Jul 03
13:05:20 2017 +0100
@@ -4829,6 +4829,129 @@
   BLOCK_COMMENT("} string_compare");
 }

+// java.lang.StringCoding.hasNegatives intrinsic.
+// Returns 1 in "result" when one or more bytes in "array" of length "len" has
+// its top bit set.
+void MacroAssembler::has_negatives(Register array, Register len,
Register result,
+                                   Register tmp1, Register tmp2, Register tmp3,
+                                   Register tmp4)
+{
+  // The top bits of every byte in a 64-bit constant.
+  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
+  Label EXIT_REAL, EXIT_FALSE, EXIT_TRUE, LONGER, LOOP16, LAST8, TAIL;
+
+  BLOCK_COMMENT("has_negatives {");
+
+  // If there are 0 or less bytes, no negative bytes were found.
+  cmpw(len, 0);
+  br(Assembler::LE, EXIT_FALSE);
+
+  // Unaligned load of first 64-bits. Used in LONGER and for testing 8 bytes
+  // or less on fallthrough.
+  ldr(tmp1, Address(array));
+
+  cmp(len, 8);
+  br(Assembler::GT, LONGER);
+
+  // Test 8 bytes or less
+
+  // tmp1 loaded 8 bytes from the array. Shift off bytes that are past the
+  // end of the string.
+  mov(tmp2, 8);
+  sub(tmp3, tmp2, len);
+  lsl(tmp3, tmp3, 3);
+  lslv(tmp4, tmp1, tmp3);
+
+  tst(tmp4, UPPER_BIT_MASK);
+  br(Assembler::NE, EXIT_TRUE);
+  b(EXIT_FALSE);
+
+  // Test against any string over 8 bytes in length.
+  BIND(LONGER);
+
+  // Unconditionally test the first 64-bits.
+  tst(tmp1, UPPER_BIT_MASK);
+  br(Assembler::NE, EXIT_TRUE);
+
+  // Align start of array to next 8-byte aligned boundary and set the
+  // remaining length accordingly.
+  mov(tmp2, 8);
+  andw(tmp4, array, 7);
+  sub(tmp3, tmp2, tmp4);
+
+  add(array, array, tmp3);
+  sub(len, len, tmp3);
+
+  cmpw(len, 16);
+  br(Assembler::LT, LAST8);
+
+  // Align array on 16 byte boundary, if necessary.
+  tst(array, 8);
+  br(Assembler::EQ, LOOP16);
+
+  ldr(tmp1, Address(post(array, 8)));
+  tst(tmp1, UPPER_BIT_MASK);
+  br(Assembler::NE, EXIT_TRUE);
+
+  sub(len, len, 8);
+
+  cmpw(len, 16);
+  br(Assembler::LT, LAST8);
+
+  // Loop to test 16-byte chunks.
+  BIND(LOOP16);
+
+  ldp(tmp1, tmp2, Address(post(array, 16)));
+  orr(tmp3, tmp1, tmp2);
+
+  tst(tmp3, UPPER_BIT_MASK);
+  br(Assembler::NE, EXIT_TRUE);
+
+  sub(len, len, 16);
+
+  cmpw(len, 16);
+  br(Assembler::GE, LOOP16);
+
+  // Handle the last whole 8 bytes.
+  BIND(LAST8);
+
+  cmpw(len, 8);
+  br(Assembler::LE, TAIL);
+  ldr(tmp1, Address(post(array,8)));
+
+  tst(tmp1, UPPER_BIT_MASK);
+  br(Assembler::NE, EXIT_TRUE);
+
+  sub(len, len, 8);
+
+  // Handle the last bytes - may be 8 or less bytes.
+  bind(TAIL);
+
+  cbz(len, EXIT_FALSE);
+
+  // Align end of the 64-bit load with the last byte.
+  mov(tmp1, 8);
+  subw(tmp3, tmp1, len);
+  sub(array, array, tmp3);
+
+  ldr(tmp1, Address(array));
+  tst(tmp1, UPPER_BIT_MASK);
+  br(Assembler::NE, EXIT_TRUE);
+
+  BIND(EXIT_FALSE);
+  mov(result, 0);
+
+  b(EXIT_REAL);
+
+  BIND(EXIT_TRUE);
+
+  mov(result, 1);
+
+  BIND(EXIT_REAL);
+
+  BLOCK_COMMENT("} has_negatives");
+}
+
 // Compare Strings or char/byte arrays.

 // is_string is true iff this is a string comparison.
diff -r 38018e6e25b3 -r f138dbd8e5da
src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Jul 13
01:28:24 2017 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Mon Jul 03
13:05:20 2017 +0100
@@ -1209,6 +1209,9 @@
                       Register tmp1,
                       FloatRegister vtmp, FloatRegister vtmpZ, int ae);

+  void has_negatives(Register array, Register len, Register result,
+                     Register tmp1, Register tmp2, Register tmp3,
Register tmp4);
+
   void arrays_equals(Register a1, Register a2,
                      Register result, Register cnt1,
                      int elem_size, bool is_string);


More information about the aarch64-port-dev mailing list