[aarch64-port-dev ] [PATCH] 8217561 : X86: Add floating-point Math.min/max intrinsics, approval request
B. Blaser
bsrbnd at gmail.com
Fri Mar 1 19:58:19 UTC 2019
On Fri, 1 Mar 2019 at 17:39, B. Blaser <bsrbnd at gmail.com> wrote:
>
> On Fri, 1 Mar 2019 at 10:44, Andrew Dinn <adinn at redhat.com> wrote:
> >
> > If there is some other way to avoid the slowdown on x86 (whether that
> > comes with use of the intrinsic or with use of reduction) without
> > clobbering the gains to be had on AArch64 then that would be preferable.
>
> In that case, I'd like to suggest another alternative, shown below.
>
> This is an optimization of the code generated for the API, using only one
> 'ucomis[s/d]' instead of the five used before.
> There's a gain of between 5% and 10% with the current benchmark [1] for the
> predictable, unpredictable and reduction scenarios.
>
> Before:
>
> Benchmark Mode Cnt Score Error Units
> FpMinMaxIntrinsics.dMax avgt 8633.782 ns/op
> FpMinMaxIntrinsics.dMin avgt 8694.123 ns/op
> FpMinMaxIntrinsics.dMinReduce avgt 710.493 ns/op
> FpMinMaxIntrinsics.fMax avgt 8578.784 ns/op
> FpMinMaxIntrinsics.fMin avgt 8734.432 ns/op
> FpMinMaxIntrinsics.fMinReduce avgt 719.532 ns/op
>
>
> After:
>
> Benchmark Mode Cnt Score Error Units
> FpMinMaxIntrinsics.dMax avgt 8050.014 ns/op
> FpMinMaxIntrinsics.dMin avgt 8027.534 ns/op
> FpMinMaxIntrinsics.dMinReduce avgt 675.791 ns/op
> FpMinMaxIntrinsics.fMax avgt 8022.847 ns/op
> FpMinMaxIntrinsics.fMin avgt 7945.885 ns/op
> FpMinMaxIntrinsics.fMinReduce avgt 659.173 ns/op
>
>
> I haven't observed any regression so far, so statistics are no longer
> necessary.
> There's no need to deal with legacy 'xmm' registers and only one
> temporary integer register is necessary.
>
> Any feedback would be more than welcome (hotspot:tier1 is OK on x86_64 xeon).
A small correction is included below, since equivalent fp values might have
different representations (see JLS §4.2.3).
The gain is still roughly 5-10%.
Bernard
diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -808,6 +808,81 @@
__ bind(done);
}
+// Emits Math.min() (min == true) or Math.max() of a and b into dst. Columns below: Math.min() # Math.max()
+// ----------------------------------------------------------------------------------------------------
+// ucomis[s/d] a, b (ucomiss if single, else ucomisd); tmp is an integer scratch register #
+// ja (a > b) -> b # a
+// jp (unordered compare) -> NaN # NaN
+// jb (a < b) -> a # b
+// je (a == b, fall-through) #
+// |-jz (bits of a are all zero once the sign bit is shifted out) -> a | b # a & b
+// | (otherwise) -> a # a
+void emit_fp_min_max(MacroAssembler& _masm, XMMRegister dst,
+ XMMRegister a, XMMRegister b, Register tmp,
+ bool min, bool single) {
+
+ Label nan, zero, below, above, done;
+
+ if (single)
+ __ ucomiss(a, b); // the only compare emitted; the three branches below all read its flags
+ else
+ __ ucomisd(a, b); // double-precision variant of the same compare
+
+ __ jccb(Assembler::above, above); // CF=0 & ZF=0: a > b
+ __ jccb(Assembler::parity, nan); // PF=1: unordered compare
+ __ jccb(Assembler::below, below); // CF=1: a < b
+
+ // equal: none of the three branches above was taken, so a == b
+ if (single) {
+ __ movdl(tmp, a); // copy the 32 raw bits of a into the integer temp
+ __ shll(tmp, 1); // skip sign bit
+ __ testl(tmp, tmp); // ZF=1 iff every bit except the sign bit was zero
+ __ jccb(Assembler::zero, zero); // a is +0.0 or -0.0: the signs must be merged
+ __ movflt(dst, a); // equal and non-zero: a is the result for both min and max
+ __ jmp(done);
+ }
+ else {
+ __ movdq(tmp, a); // copy the 64 raw bits of a into the integer temp
+ __ shlq(tmp, 1); // skip sign bit
+ __ testq(tmp, tmp); // ZF=1 iff every bit except the sign bit was zero
+ __ jccb(Assembler::zero, zero); // a is +0.0 or -0.0: the signs must be merged
+ __ movdbl(dst, a); // equal and non-zero: a is the result for both min and max
+ __ jmp(done);
+ }
+
+ __ bind(zero);
+ if (min)
+ __ vpor(dst, a, b, Assembler::AVX_128bit); // min: OR keeps a sign bit set in either operand
+ else
+ __ vpand(dst, a, b, Assembler::AVX_128bit); // max: AND keeps the sign bit only if set in both operands
+ __ jmp(done);
+
+ __ bind(above); // a > b: min selects b, max selects a
+ if (single)
+ __ movflt(dst, min ? b : a);
+ else
+ __ movdbl(dst, min ? b : a);
+ __ jmp(done);
+
+ __ bind(nan); // unordered: load a NaN constant into dst through the integer temp
+ if (single) {
+ __ movl(tmp, 0x7fc00000); // Float.NaN
+ __ movdl(dst, tmp);
+ }
+ else {
+ __ mov64(tmp, 0x7ff8000000000000L); // Double.NaN
+ __ movdq(dst, tmp);
+ }
+ __ jmp(done);
+
+ __ bind(below); // a < b: min selects a, max selects b; falls through to done
+ if (single)
+ __ movflt(dst, min ? a : b);
+ else
+ __ movdbl(dst, min ? a : b);
+
+ __ bind(done);
+}
//=============================================================================
const RegMask& MachConstantBaseNode::_out_RegMask = RegMask::Empty;
@@ -5470,6 +5545,63 @@
ins_pipe( fpu_reg_reg );
%}
+// max = java.lang.Math.max(float a, float b); matched only when UseAVX > 0, tmp is an integer temp for emit_fp_min_max
+instruct maxF_reg(regF dst, regF a, regF b, rRegI tmp) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MaxF a b));
+ effect(USE a, USE b, TEMP tmp);
+
+ format %{ "$dst = max($a, $b)\t# intrinsic (float)" %}
+ ins_encode %{
+ emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$Register,
+ false /*min*/, true /*single*/);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// max = java.lang.Math.max(double a, double b); matched only when UseAVX > 0, tmp is a long temp for emit_fp_min_max
+instruct maxD_reg(regD dst, regD a, regD b, rRegL tmp) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MaxD a b));
+ effect(USE a, USE b, TEMP tmp);
+
+ format %{ "$dst = max($a, $b)\t# intrinsic (double)" %}
+ ins_encode %{
+ emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$Register,
+ false /*min*/, false /*single*/);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+
+// min = java.lang.Math.min(float a, float b); matched only when UseAVX > 0, tmp is an integer temp for emit_fp_min_max
+instruct minF_reg(regF dst, regF a, regF b, rRegI tmp) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MinF a b));
+ effect(USE a, USE b, TEMP tmp);
+
+ format %{ "$dst = min($a, $b)\t# intrinsic (float)" %}
+ ins_encode %{
+ emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$Register,
+ true /*min*/, true /*single*/);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// min = java.lang.Math.min(double a, double b); matched only when UseAVX > 0, tmp is a long temp for emit_fp_min_max
+instruct minD_reg(regD dst, regD a, regD b, rRegL tmp) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MinD a b));
+ effect(USE a, USE b, TEMP tmp);
+
+ format %{ "$dst = min($a, $b)\t# intrinsic (double)" %}
+ ins_encode %{
+ emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$Register,
+ true /*min*/, false /*single*/);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
// Load Effective Address
instruct leaP8(rRegP dst, indOffset8 mem)
%{
More information about the hotspot-compiler-dev
mailing list