[aarch64-port-dev ] [PATCH] 8217561 : X86: Add floating-point Math.min/max intrinsics, approval request
B. Blaser
bsrbnd at gmail.com
Fri Mar 1 16:39:03 UTC 2019
On Fri, 1 Mar 2019 at 10:44, Andrew Dinn <adinn at redhat.com> wrote:
>
> If there is some other way to avoid the slowdown on x86 (whether that
> comes with use of the intrinsic or with use of reduction) without
> clobbering the gains to be had on AArch64 then that would be preferable.
In that case, I'd like to suggest another alternative, given below.
This is an optimization of the code generated for the API, using only one
'ucomis[s/d]' instead of five as before.
There is a gain of between 5% and 10% with the current benchmark [1] for
the predictable, unpredictable and reduction scenarios.
Before:
Benchmark Mode Cnt Score Error Units
FpMinMaxIntrinsics.dMax avgt 8633.782 ns/op
FpMinMaxIntrinsics.dMin avgt 8694.123 ns/op
FpMinMaxIntrinsics.dMinReduce avgt 710.493 ns/op
FpMinMaxIntrinsics.fMax avgt 8578.784 ns/op
FpMinMaxIntrinsics.fMin avgt 8734.432 ns/op
FpMinMaxIntrinsics.fMinReduce avgt 719.532 ns/op
After:
Benchmark Mode Cnt Score Error Units
FpMinMaxIntrinsics.dMax avgt 8050.014 ns/op
FpMinMaxIntrinsics.dMin avgt 8027.534 ns/op
FpMinMaxIntrinsics.dMinReduce avgt 675.791 ns/op
FpMinMaxIntrinsics.fMax avgt 8022.847 ns/op
FpMinMaxIntrinsics.fMin avgt 7945.885 ns/op
FpMinMaxIntrinsics.fMinReduce avgt 659.173 ns/op
I haven't observed any regression so far, so statistics are no longer
necessary.
There's no need to deal with legacy 'xmm' registers and only one
temporary integer register is necessary.
Any feedback would be more than welcome (hotspot:tier1 is OK on x86_64 xeon).
Thanks,
Bernard
[1] http://hg.openjdk.java.net/jdk/submit/rev/ab2b1418f0db
diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -808,6 +808,57 @@
__ bind(done);
}
+// fp min # max -- emits dst = min ? min(a, b) : max(a, b); 'single' selects float vs double. Result per branch:
+// ucomis[s/d] a, b -- the only compare emitted; every conditional jump below reads its flags
+// ja -> b # a (a > b)
+// jp -> NaN # NaN (unordered compare; the NaN bit pattern is built in 'tmp' and moved to dst)
+// je -> a | b # a & b (bitwise OR/AND of equal operands; presumably to order -0.0 below +0.0 -- confirm)
+// jb -> a # b (a < b; this is the fall-through path, no jump is emitted for it)
+void emit_fp_min_max(MacroAssembler& _masm, XMMRegister dst,
XMMRegister a, XMMRegister b, Register tmp, bool min, bool single) {
+ Label nan, equal, above, done;
+
+ if (single) // float operands
+ __ ucomiss(a, b);
+ else // double operands
+ __ ucomisd(a, b);
+
+ __ jccb(Assembler::above, above); // CF=0 & ZF=0, i.e. a > b
+ __ jccb(Assembler::parity, nan); // PF=1, i.e. unordered; tested before 'equal' since an unordered compare sets ZF as well
+ __ jccb(Assembler::equal, equal); // ZF=1, i.e. a == b
+
+ // below: a < b, so min takes a and max takes b
+ if (single)
+ __ movflt(dst, min ? a : b);
+ else
+ __ movdbl(dst, min ? a : b);
+ __ jmp(done);
+
+ __ bind(nan); // result is NaN for both min and max
+ if (single) {
+ __ movl(tmp, 0x7fc00000); // Float.NaN
+ __ movdl(dst, tmp); // copy the 32-bit pattern from the integer temp into dst
+ }
+ else {
+ __ mov64(tmp, 0x7ff8000000000000L); // Double.NaN
+ __ movdq(dst, tmp); // copy the 64-bit pattern from the integer temp into dst
+ }
+ __ jmp(done);
+
+ __ bind(equal); // operands compared equal: combine their bit patterns
+ if (min)
+ __ vpor(dst, a, b, Assembler::AVX_128bit); // min: a | b
+ else
+ __ vpand(dst, a, b, Assembler::AVX_128bit); // max: a & b
+ __ jmp(done);
+
+ __ bind(above); // a > b, so min takes b and max takes a
+ if (single)
+ __ movflt(dst, min ? b : a);
+ else
+ __ movdbl(dst, min ? b : a);
+
+ __ bind(done); // all paths join here with the result in dst
+}
//=============================================================================
const RegMask& MachConstantBaseNode::_out_RegMask = RegMask::Empty;
@@ -5470,6 +5521,63 @@
ins_pipe( fpu_reg_reg );
%}
+// max = java.lang.Math.max(float a, float b); encoded via emit_fp_min_max (ucomiss + branches), needs an int temp and kills the flags
+instruct maxF_reg(regF dst, regF a, regF b, rRegI tmp, rFlagsReg cr) %{
+ predicate(UseAVX > 0); // the equal-operands path emits vpor/vpand with AVX_128bit
+ match(Set dst (MaxF a b));
+ effect(USE a, USE b, TEMP tmp, KILL cr); // tmp holds the NaN bit pattern; ucomiss overwrites the flags
+
+ format %{ "$dst = max($a, $b)\t# intrinsic (float)" %}
+ ins_encode %{
+ emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister,
$b$$XMMRegister, $tmp$$Register,
+ false /*min*/, true /*single*/);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// max = java.lang.Math.max(double a, double b); encoded via emit_fp_min_max (ucomisd + branches), needs a long temp and kills the flags
+instruct maxD_reg(regD dst, regD a, regD b, rRegL tmp, rFlagsReg cr) %{
+ predicate(UseAVX > 0); // the equal-operands path emits vpor/vpand with AVX_128bit
+ match(Set dst (MaxD a b));
+ effect(USE a, USE b, TEMP tmp, KILL cr); // tmp holds the NaN bit pattern; ucomisd overwrites the flags
+
+ format %{ "$dst = max($a, $b)\t# intrinsic (double)" %}
+ ins_encode %{
+ emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister,
$b$$XMMRegister, $tmp$$Register,
+ false /*min*/, false /*single*/);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+
+// min = java.lang.Math.min(float a, float b); encoded via emit_fp_min_max (ucomiss + branches), needs an int temp and kills the flags
+instruct minF_reg(regF dst, regF a, regF b, rRegI tmp, rFlagsReg cr) %{
+ predicate(UseAVX > 0); // the equal-operands path emits vpor/vpand with AVX_128bit
+ match(Set dst (MinF a b));
+ effect(USE a, USE b, TEMP tmp, KILL cr); // tmp holds the NaN bit pattern; ucomiss overwrites the flags
+
+ format %{ "$dst = min($a, $b)\t# intrinsic (float)" %}
+ ins_encode %{
+ emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister,
$b$$XMMRegister, $tmp$$Register,
+ true /*min*/, true /*single*/);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// min = java.lang.Math.min(double a, double b); encoded via emit_fp_min_max (ucomisd + branches), needs a long temp and kills the flags
+instruct minD_reg(regD dst, regD a, regD b, rRegL tmp, rFlagsReg cr) %{
+ predicate(UseAVX > 0); // the equal-operands path emits vpor/vpand with AVX_128bit
+ match(Set dst (MinD a b));
+ effect(USE a, USE b, TEMP tmp, KILL cr); // tmp holds the NaN bit pattern; ucomisd overwrites the flags
+
+ format %{ "$dst = min($a, $b)\t# intrinsic (double)" %}
+ ins_encode %{
+ emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister,
$b$$XMMRegister, $tmp$$Register,
+ true /*min*/, false /*single*/);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
// Load Effective Address
instruct leaP8(rRegP dst, indOffset8 mem)
%{
More information about the hotspot-compiler-dev
mailing list