[aarch64-port-dev ] [PATCH] 8217561 : X86: Add floating-point Math.min/max intrinsics, approval request

Fri Mar 1 16:39:03 UTC 2019

On Fri, 1 Mar 2019 at 10:44, Andrew Dinn <adinn at redhat.com> wrote:
>
> If there is some other way to avoid the slowdown on x86 (whether that
> comes with use of the intrinsic or with use of reduction) without
> clobbering the gains to be had on AArch64 then that would be preferable.

In that case, I'd like to suggest another alternative, described below.

This optimizes the code generated for the API so that it uses only one
'ucomis[s/d]' instruction instead of the five used before.
There's a gain of between 5% and 10% with the current benchmark [1] for
the predictable, unpredictable and reduction scenarios.

Before:

Benchmark                      Mode  Cnt     Score   Error  Units
FpMinMaxIntrinsics.dMax        avgt       8633.782          ns/op
FpMinMaxIntrinsics.dMin        avgt       8694.123          ns/op
FpMinMaxIntrinsics.dMinReduce  avgt        710.493          ns/op
FpMinMaxIntrinsics.fMax        avgt       8578.784          ns/op
FpMinMaxIntrinsics.fMin        avgt       8734.432          ns/op
FpMinMaxIntrinsics.fMinReduce  avgt        719.532          ns/op


After:

Benchmark                      Mode  Cnt     Score   Error  Units
FpMinMaxIntrinsics.dMax        avgt       8050.014          ns/op
FpMinMaxIntrinsics.dMin        avgt       8027.534          ns/op
FpMinMaxIntrinsics.dMinReduce  avgt        675.791          ns/op
FpMinMaxIntrinsics.fMax        avgt       8022.847          ns/op
FpMinMaxIntrinsics.fMin        avgt       7945.885          ns/op
FpMinMaxIntrinsics.fMinReduce  avgt        659.173          ns/op


I haven't observed any regression so far, so statistics are no longer
necessary.
There's no need to deal with legacy 'xmm' registers and only one
temporary integer register is necessary.

Any feedback would be more than welcome (hotspot:tier1 is OK on x86_64 xeon).

Thanks,
Bernard

[1] http://hg.openjdk.java.net/jdk/submit/rev/ab2b1418f0db

diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -808,6 +808,57 @@
   __ bind(done);
 }

+// Emit fp min/max of a and b into dst. Result per branch is listed as: min # max
+// A single ucomis[s/d] a, b sets the flags; the branches below select the result:
+// ja (a > b)     -> b     # a
+// jp (unordered) -> NaN   # NaN   (a fixed quiet-NaN constant is loaded via tmp)
+// je (a == b)    -> a | b # a & b (bitwise, so +0.0/-0.0 resolve to -0.0 # +0.0)
+// jb (a < b)     -> a     # b     (fall-through case)
+void emit_fp_min_max(MacroAssembler& _masm, XMMRegister dst,
XMMRegister a, XMMRegister b, Register tmp, bool min, bool single) {
+  Label nan, equal, above, done;
+
+  if (single) // 'single' selects float instructions, otherwise double
+    __ ucomiss(a, b);
+  else
+    __ ucomisd(a, b);
+
+  __ jccb(Assembler::above, above); // CF=0 & ZF=0: a > b
+  __ jccb(Assembler::parity, nan);  // PF=1: unordered; tested before 'equal' since an unordered compare also sets ZF
+  __ jccb(Assembler::equal, equal); // ZF=1: a == b
+
+  // below (a < b): fall through, min takes a and max takes b
+  if (single)
+    __ movflt(dst, min ? a : b);
+  else
+    __ movdbl(dst, min ? a : b);
+  __ jmp(done);
+
+  __ bind(nan); // at least one operand is NaN: result is a constant NaN, not an input
+  if (single) {
+    __ movl(tmp, 0x7fc00000); // Float.NaN bit pattern
+    __ movdl(dst, tmp); // move the 32-bit pattern from the integer temp into dst
+  }
+  else {
+    __ mov64(tmp, 0x7ff8000000000000L); // Double.NaN bit pattern
+    __ movdq(dst, tmp); // move the 64-bit pattern from the integer temp into dst
+  }
+  __ jmp(done);
+
+  __ bind(equal); // a == b: the bit patterns can differ only for +0.0 vs -0.0
+  if (min)
+    __ vpor(dst, a, b, Assembler::AVX_128bit); // OR keeps a set sign bit, so -0.0 wins for min
+  else
+    __ vpand(dst, a, b, Assembler::AVX_128bit); // AND clears the sign bit unless both set, so +0.0 wins for max
+  __ jmp(done);
+
+  __ bind(above); // a > b: min takes b and max takes a
+  if (single)
+    __ movflt(dst, min ? b : a);
+  else
+    __ movdbl(dst, min ? b : a);
+
+  __ bind(done); // common exit: dst holds the result
+}

 //=============================================================================
 const RegMask& MachConstantBaseNode::_out_RegMask = RegMask::Empty;
@@ -5470,6 +5521,63 @@
   ins_pipe( fpu_reg_reg );
 %}

+// max = java.lang.Math.max(float a, float b); matches MaxF and expands via emit_fp_min_max
+instruct maxF_reg(regF dst, regF a, regF b, rRegI tmp) %{
+  predicate(UseAVX > 0); // presumably required because emit_fp_min_max emits vpor/vpand
+  match(Set dst (MaxF a b));
+  effect(USE a, USE b, TEMP tmp); // tmp: integer scratch register for the NaN constant
+
+  format %{ "$dst = max($a, $b)\t# intrinsic (float)" %}
+  ins_encode %{
+    emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister,
$b$$XMMRegister, $tmp$$Register,
+                    false /*min: false selects max*/, true /*single: float*/);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// max = java.lang.Math.max(double a, double b); matches MaxD and expands via emit_fp_min_max
+instruct maxD_reg(regD dst, regD a, regD b, rRegL tmp) %{
+  predicate(UseAVX > 0); // presumably required because emit_fp_min_max emits vpor/vpand
+  match(Set dst (MaxD a b));
+  effect(USE a, USE b, TEMP tmp); // tmp: 64-bit integer scratch register for the NaN constant
+
+  format %{ "$dst = max($a, $b)\t# intrinsic (double)" %}
+  ins_encode %{
+    emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister,
$b$$XMMRegister, $tmp$$Register,
+                    false /*min: false selects max*/, false /*single: false selects double*/);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+
+// min = java.lang.Math.min(float a, float b); matches MinF and expands via emit_fp_min_max
+instruct minF_reg(regF dst, regF a, regF b, rRegI tmp) %{
+  predicate(UseAVX > 0); // presumably required because emit_fp_min_max emits vpor/vpand
+  match(Set dst (MinF a b));
+  effect(USE a, USE b, TEMP tmp); // tmp: integer scratch register for the NaN constant
+
+  format %{ "$dst = min($a, $b)\t# intrinsic (float)" %}
+  ins_encode %{
+    emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister,
$b$$XMMRegister, $tmp$$Register,
+                    true /*min: true selects min*/, true /*single: float*/);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// min = java.lang.Math.min(double a, double b); matches MinD and expands via emit_fp_min_max
+instruct minD_reg(regD dst, regD a, regD b, rRegL tmp) %{
+  predicate(UseAVX > 0); // presumably required because emit_fp_min_max emits vpor/vpand
+  match(Set dst (MinD a b));
+  effect(USE a, USE b, TEMP tmp); // tmp: 64-bit integer scratch register for the NaN constant
+
+  format %{ "$dst = min($a, $b)\t# intrinsic (double)" %}
+  ins_encode %{
+    emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister,
$b$$XMMRegister, $tmp$$Register,
+                    true /*min: true selects min*/, false /*single: false selects double*/);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Load Effective Address
 instruct leaP8(rRegP dst, indOffset8 mem)
 %{