[aarch64-port-dev ] [PATCH] 8217561 : X86: Add floating-point Math.min/max intrinsics, approval request

Fri Mar 1 19:58:19 UTC 2019

On Fri, 1 Mar 2019 at 17:39, B. Blaser <bsrbnd at gmail.com> wrote:
>
> On Fri, 1 Mar 2019 at 10:44, Andrew Dinn <adinn at redhat.com> wrote:
> >
> > If there is some other way to avoid the slowdown on x86 (whether that
> > comes with use of the intrinsic or with use of reduction) without
> > clobbering the gains to be had on AArch64 then that would be preferable.
>
> Then I'd like to suggest another alternative, below.
>
> This is an optimization of the API generated code using only one
> 'ucomis[s/d]' vs five before.
> There's a gain of between 5% and 10% with the current benchmark [1] for
> predictable, unpredictable and reduction scenarios.
>
> Before:
>
> Benchmark                      Mode  Cnt     Score   Error  Units
> FpMinMaxIntrinsics.dMax        avgt       8633.782          ns/op
> FpMinMaxIntrinsics.dMin        avgt       8694.123          ns/op
> FpMinMaxIntrinsics.dMinReduce  avgt        710.493          ns/op
> FpMinMaxIntrinsics.fMax        avgt       8578.784          ns/op
> FpMinMaxIntrinsics.fMin        avgt       8734.432          ns/op
> FpMinMaxIntrinsics.fMinReduce  avgt        719.532          ns/op
>
>
> After:
>
> Benchmark                      Mode  Cnt     Score   Error  Units
> FpMinMaxIntrinsics.dMax        avgt       8050.014          ns/op
> FpMinMaxIntrinsics.dMin        avgt       8027.534          ns/op
> FpMinMaxIntrinsics.dMinReduce  avgt        675.791          ns/op
> FpMinMaxIntrinsics.fMax        avgt       8022.847          ns/op
> FpMinMaxIntrinsics.fMin        avgt       7945.885          ns/op
> FpMinMaxIntrinsics.fMinReduce  avgt        659.173          ns/op
>
>
> I haven't observed any regression so far, so statistics aren't
> necessary any more.
> There's no need to deal with legacy 'xmm' registers and only one
> temporary integer register is necessary.
>
> Any feedback would be more than welcome (hotspot:tier1 is OK on x86_64 xeon).

Small correction below as equivalent fp values might have different
representations, see JLS §4.2.3.
The gain is still roughly 5-10%.

Bernard

diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -808,6 +808,81 @@
   __ bind(done);
 }

+// Emit Math.min/max of a and b into dst with a single ucomis[s/d] and one
+// branch per outcome ('single'/'min' pick the instructions at emit time).
+// outcome of ucomis[s/d] a, b    Math.min()  # Math.max()
+// -------------------------------------------------------
+// ja   (a > b)                -> b           # a
+// jp   (a or b is NaN)        -> NaN         # NaN
+// jb   (a < b)                -> a           # b
+// else (a == b): a is +/-0.0  -> a | b       # a & b
+//                otherwise    -> a           # a
+void emit_fp_min_max(MacroAssembler& _masm, XMMRegister dst,
+                     XMMRegister a, XMMRegister b, Register tmp, // tmp: scratch integer register
+                     bool min, bool single) { // min: true = Math.min, false = Math.max; single: true = float, false = double
+
+  Label nan, zero, below, above, done;
+
+  if (single)
+    __ ucomiss(a, b);
+  else
+    __ ucomisd(a, b);
+
+  __ jccb(Assembler::above, above); // CF=0 & ZF=0: a > b
+  __ jccb(Assembler::parity, nan);  // PF=1: unordered, a or b is NaN
+  __ jccb(Assembler::below, below); // CF=1: a < b
+
+  // equal: only +/-0.0 needs care, the two zeros compare equal but differ in sign
+  if (single) {
+    __ movdl(tmp, a); // copy the bit pattern of a into tmp
+    __ shll(tmp, 1); // skip sign bit
+    __ testl(tmp, tmp); // all remaining bits clear <=> a is +0.0 or -0.0
+    __ jccb(Assembler::zero, zero);
+    __ movflt(dst, a); // a == b and not a zero: return a
+    __ jmp(done);
+  }
+  else {
+    __ movdq(tmp, a); // copy the bit pattern of a into tmp
+    __ shlq(tmp, 1); // skip sign bit
+    __ testq(tmp, tmp); // all remaining bits clear <=> a is +0.0 or -0.0
+    __ jccb(Assembler::zero, zero);
+    __ movdbl(dst, a); // a == b and not a zero: return a
+    __ jmp(done);
+  }
+
+  __ bind(zero); // both operands are zeros here, so they can differ only in the sign bit
+  if (min)
+    __ vpor(dst, a, b, Assembler::AVX_128bit); // OR of the sign bits: -0.0 if either operand is -0.0
+  else
+    __ vpand(dst, a, b, Assembler::AVX_128bit); // AND of the sign bits: +0.0 if either operand is +0.0
+  __ jmp(done);
+
+  __ bind(above); // a > b
+  if (single)
+    __ movflt(dst, min ? b : a);
+  else
+    __ movdbl(dst, min ? b : a);
+  __ jmp(done);
+
+  __ bind(nan); // the result is a fixed NaN bit pattern, not a copy of the NaN operand
+  if (single) {
+    __ movl(tmp, 0x7fc00000); // Float.NaN
+    __ movdl(dst, tmp);
+  }
+  else {
+    __ mov64(tmp, 0x7ff8000000000000L); // Double.NaN
+    __ movdq(dst, tmp);
+  }
+  __ jmp(done);
+
+  __ bind(below); // a < b; last case, falls through to done
+  if (single)
+    __ movflt(dst, min ? a : b);
+  else
+    __ movdbl(dst, min ? a : b);
+
+  __ bind(done);
+}

 //=============================================================================
 const RegMask& MachConstantBaseNode::_out_RegMask = RegMask::Empty;
@@ -5470,6 +5545,63 @@
   ins_pipe( fpu_reg_reg );
 %}

+// max = java.lang.Math.max(float a, float b)
+instruct maxF_reg(regF dst, regF a, regF b, rRegI tmp) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MaxF a b));
+  effect(USE a, USE b, TEMP tmp);
+
+  format %{ "$dst = max($a, $b)\t# intrinsic (float)" %}
+  ins_encode %{
+    emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$Register,
+                    false /*min*/, true /*single*/);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// max = java.lang.Math.max(double a, double b)
+instruct maxD_reg(regD dst, regD a, regD b, rRegL tmp) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MaxD a b));
+  effect(USE a, USE b, TEMP tmp);
+
+  format %{ "$dst = max($a, $b)\t# intrinsic (double)" %}
+  ins_encode %{
+    emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$Register,
+                    false /*min*/, false /*single*/);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+
+// min = java.lang.Math.min(float a, float b)
+instruct minF_reg(regF dst, regF a, regF b, rRegI tmp) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MinF a b));
+  effect(USE a, USE b, TEMP tmp);
+
+  format %{ "$dst = min($a, $b)\t# intrinsic (float)" %}
+  ins_encode %{
+    emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$Register,
+                    true /*min*/, true /*single*/);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// min = java.lang.Math.min(double a, double b)
+instruct minD_reg(regD dst, regD a, regD b, rRegL tmp) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MinD a b));
+  effect(USE a, USE b, TEMP tmp);
+
+  format %{ "$dst = min($a, $b)\t# intrinsic (double)" %}
+  ins_encode %{
+    emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$Register,
+                    true /*min*/, false /*single*/);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Load Effective Address
 instruct leaP8(rRegP dst, indOffset8 mem)
 %{