RFR: aarch64: add support for vectorizing floating-point fabs & fneg
Felix Yang
felix.yang at linaro.org
Thu Sep 24 13:22:44 UTC 2015
Hi,
I would like to contribute support for vectorizing the floating-point
fabs & fneg instructions on the aarch64 architecture.
The patch follows the same approach that was used to add support for
vectorizing double precision sqrt on Intel architectures. Two new tests
are also added.
With LoopMaxUnroll=16, the performance gain for the "sumReductionImplement"
function is 30% for single precision and 10% for double precision.
I have tested this with JTreg hotspot+langtools+jdk with no
regressions. Is it OK?
As this patch involves changes to shared code, I also need an Oracle
sponsor for it if approved.
Thanks,
Felix
PATCH:
diff -r 66d90f141fd8 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad Tue Sep 22 13:42:09 2015 +0200
+++ b/src/cpu/aarch64/vm/aarch64.ad Wed Sep 23 12:27:37 2015 +0800
@@ -15190,6 +15190,88 @@
ins_pipe(pipe_class_default);
%}
+// --------------------------------- ABS --------------------------------------
+
+instruct vabs2F(vecD dst, vecD src)
+%{
+ predicate(n->as_Vector()->length() == 2);
+ match(Set dst (AbsVF src));
+ ins_cost(INSN_COST * 3);
+ format %{ "fabs $dst,$src\t# vector (2S)" %}
+ ins_encode %{
+ __ fabs(as_FloatRegister($dst$$reg), __ T2S,
+ as_FloatRegister($src$$reg));
+ %}
+ ins_pipe(pipe_class_default);
+%}
+
+instruct vabs4F(vecX dst, vecX src)
+%{
+ predicate(n->as_Vector()->length() == 4);
+ match(Set dst (AbsVF src));
+ ins_cost(INSN_COST * 3);
+ format %{ "fabs $dst,$src\t# vector (4S)" %}
+ ins_encode %{
+ __ fabs(as_FloatRegister($dst$$reg), __ T4S,
+ as_FloatRegister($src$$reg));
+ %}
+ ins_pipe(pipe_class_default);
+%}
+
+instruct vabs2D(vecX dst, vecX src)
+%{
+ predicate(n->as_Vector()->length() == 2);
+ match(Set dst (AbsVD src));
+ ins_cost(INSN_COST * 3);
+ format %{ "fabs $dst,$src\t# vector (2D)" %}
+ ins_encode %{
+ __ fabs(as_FloatRegister($dst$$reg), __ T2D,
+ as_FloatRegister($src$$reg));
+ %}
+ ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- NEG --------------------------------------
+
+instruct vneg2F(vecD dst, vecD src)
+%{
+ predicate(n->as_Vector()->length() == 2);
+ match(Set dst (NegVF src));
+ ins_cost(INSN_COST * 3);
+ format %{ "fneg $dst,$src\t# vector (2S)" %}
+ ins_encode %{
+ __ fneg(as_FloatRegister($dst$$reg), __ T2S,
+ as_FloatRegister($src$$reg));
+ %}
+ ins_pipe(pipe_class_default);
+%}
+
+instruct vneg4F(vecX dst, vecX src)
+%{
+ predicate(n->as_Vector()->length() == 4);
+ match(Set dst (NegVF src));
+ ins_cost(INSN_COST * 3);
+ format %{ "fneg $dst,$src\t# vector (4S)" %}
+ ins_encode %{
+ __ fneg(as_FloatRegister($dst$$reg), __ T4S,
+ as_FloatRegister($src$$reg));
+ %}
+ ins_pipe(pipe_class_default);
+%}
+
+instruct vneg2D(vecX dst, vecX src)
+%{
+ predicate(n->as_Vector()->length() == 2);
+ match(Set dst (NegVD src));
+ ins_cost(INSN_COST * 3);
+ format %{ "fneg $dst,$src\t# vector (2D)" %}
+ ins_encode %{
+ __ fneg(as_FloatRegister($dst$$reg), __ T2D,
+ as_FloatRegister($src$$reg));
+ %}
+ ins_pipe(pipe_class_default);
+%}
+
// --------------------------------- AND --------------------------------------
instruct vand8B(vecD dst, vecD src1, vecD src2)
diff -r 66d90f141fd8 src/cpu/aarch64/vm/assembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp Tue Sep 22 13:42:09 2015 +0200
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp Wed Sep 23 12:27:37 2015 +0800
@@ -2106,6 +2106,19 @@
#undef INSN
+#define INSN(NAME, opc, opc2) \
+ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) { \
+ starti; \
+ assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); \
+ f(0, 31), f((int)T & 1, 30), f(opc, 29), f(0b01110, 28, 24), f(1, 23); \
+ f(T==T2D ? 1:0, 22), f(opc2, 21, 10), rf(Vn, 5), rf(Vd, 0); \
+ }
+
+ INSN(fabs, 0, 0b100000111110);
+ INSN(fneg, 1, 0b100000111110);
+
+#undef INSN
+
#define INSN(NAME, opc) \
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
starti; \
diff -r 66d90f141fd8 src/share/vm/adlc/formssel.cpp
--- a/src/share/vm/adlc/formssel.cpp Tue Sep 22 13:42:09 2015 +0200
+++ b/src/share/vm/adlc/formssel.cpp Wed Sep 23 12:27:37 2015 +0800
@@ -4143,6 +4143,8 @@
"SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
"MulVS","MulVI","MulVL","MulVF","MulVD",
"DivVF","DivVD",
+ "AbsVF","AbsVD",
+ "NegVF","NegVD",
"SqrtVD",
"AndV" ,"XorV" ,"OrV",
"AddReductionVI", "AddReductionVL",
diff -r 66d90f141fd8 src/share/vm/opto/classes.hpp
--- a/src/share/vm/opto/classes.hpp Tue Sep 22 13:42:09 2015 +0200
+++ b/src/share/vm/opto/classes.hpp Wed Sep 23 12:27:37 2015 +0800
@@ -290,6 +290,10 @@
macro(MulReductionVD)
macro(DivVF)
macro(DivVD)
+macro(AbsVF)
+macro(AbsVD)
+macro(NegVF)
+macro(NegVD)
macro(SqrtVD)
macro(LShiftCntV)
macro(RShiftCntV)
diff -r 66d90f141fd8 src/share/vm/opto/superword.cpp
--- a/src/share/vm/opto/superword.cpp Tue Sep 22 13:42:09 2015 +0200
+++ b/src/share/vm/opto/superword.cpp Wed Sep 23 12:27:37 2015 +0800
@@ -1858,8 +1858,8 @@
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
}
- } else if (opc == Op_SqrtD) {
- // Promote operand to vector (Sqrt is a 2 address instruction)
+ } else if (opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
+ // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
Node* in = vector_opd(p, 1);
vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
diff -r 66d90f141fd8 src/share/vm/opto/vectornode.cpp
--- a/src/share/vm/opto/vectornode.cpp Tue Sep 22 13:42:09 2015 +0200
+++ b/src/share/vm/opto/vectornode.cpp Wed Sep 23 12:27:37 2015 +0800
@@ -92,6 +92,18 @@
case Op_DivD:
assert(bt == T_DOUBLE, "must be");
return Op_DivVD;
+ case Op_AbsF:
+ assert(bt == T_FLOAT, "must be");
+ return Op_AbsVF;
+ case Op_AbsD:
+ assert(bt == T_DOUBLE, "must be");
+ return Op_AbsVD;
+ case Op_NegF:
+ assert(bt == T_FLOAT, "must be");
+ return Op_NegVF;
+ case Op_NegD:
+ assert(bt == T_DOUBLE, "must be");
+ return Op_NegVD;
case Op_SqrtD:
assert(bt == T_DOUBLE, "must be");
return Op_SqrtVD;
@@ -280,6 +292,12 @@
case Op_DivVF: return new DivVFNode(n1, n2, vt);
case Op_DivVD: return new DivVDNode(n1, n2, vt);
+ case Op_AbsVF: return new AbsVFNode(n1, vt);
+ case Op_AbsVD: return new AbsVDNode(n1, vt);
+
+ case Op_NegVF: return new NegVFNode(n1, vt);
+ case Op_NegVD: return new NegVDNode(n1, vt);
+
// Currently only supports double precision sqrt
case Op_SqrtVD: return new SqrtVDNode(n1, vt);
diff -r 66d90f141fd8 src/share/vm/opto/vectornode.hpp
--- a/src/share/vm/opto/vectornode.hpp Tue Sep 22 13:42:09 2015 +0200
+++ b/src/share/vm/opto/vectornode.hpp Wed Sep 23 12:27:37 2015 +0800
@@ -309,6 +309,38 @@
virtual int Opcode() const;
};
+//------------------------------AbsVFNode--------------------------------------
+// Vector Abs float
+class AbsVFNode : public VectorNode {
+ public:
+ AbsVFNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
+ virtual int Opcode() const;
+};
+
+//------------------------------AbsVDNode--------------------------------------
+// Vector Abs double
+class AbsVDNode : public VectorNode {
+ public:
+ AbsVDNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
+ virtual int Opcode() const;
+};
+
+//------------------------------NegVFNode--------------------------------------
+// Vector Neg float
+class NegVFNode : public VectorNode {
+ public:
+ NegVFNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
+ virtual int Opcode() const;
+};
+
+//------------------------------NegVDNode--------------------------------------
+// Vector Neg double
+class NegVDNode : public VectorNode {
+ public:
+ NegVDNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
+ virtual int Opcode() const;
+};
+
//------------------------------SqrtVDNode--------------------------------------
// Vector Sqrt double
class SqrtVDNode : public VectorNode {
diff -r 66d90f141fd8 test/compiler/loopopts/superword/SumRedAbsNeg_Double.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/loopopts/superword/SumRedAbsNeg_Double.java Wed Sep 23 12:27:37 2015 +0800
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @summary Add C2 AArch64 Superword support for scalar sum reduction optimizations : double abs & neg test
+ * @requires os.arch=="aarch64"
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Double
+ */
+
+public class SumRedAbsNeg_Double
+{
+ public static void main(String[] args) throws Exception {
+ double[] a = new double[256*1024];
+ double[] b = new double[256*1024];
+ double[] c = new double[256*1024];
+ double[] d = new double[256*1024];
+ sumReductionInit(a,b,c);
+ double total = 0;
+ double valid = 3.6028590866691944E19;
+
+ for(int j = 0; j < 2000; j++) {
+ total = sumReductionImplement(a,b,c,d,total);
+ }
+
+ if(total == valid) {
+ System.out.println("Success");
+ } else {
+ System.out.println("Invalid sum of elements variable in total: " + total);
+ System.out.println("Expected value = " + valid);
+ throw new Exception("Failed");
+ }
+ }
+
+ public static void sumReductionInit(
+ double[] a,
+ double[] b,
+ double[] c)
+ {
+ for(int j = 0; j < 1; j++)
+ {
+ for(int i = 0; i < a.length; i++)
+ {
+ a[i] = i * 1 + j;
+ b[i] = i * 1 - j;
+ c[i] = i + j;
+ }
+ }
+ }
+
+ public static double sumReductionImplement(
+ double[] a,
+ double[] b,
+ double[] c,
+ double[] d,
+ double total)
+ {
+ for(int i = 0; i < a.length; i++)
+ {
+ d[i] = Math.abs(-a[i] * -b[i]) + Math.abs(-a[i] * -c[i]) + Math.abs(-b[i] * -c[i]);
+ total += d[i];
+ }
+ return total;
+ }
+
+}
diff -r 66d90f141fd8 test/compiler/loopopts/superword/SumRedAbsNeg_Float.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/loopopts/superword/SumRedAbsNeg_Float.java Wed Sep 23 12:27:37 2015 +0800
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @summary Add C2 AArch64 Superword support for scalar sum reduction optimizations : float abs & neg test
+ * @requires os.arch=="aarch64"
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedAbsNeg_Float
+ */
+
+public class SumRedAbsNeg_Float
+{
+ public static void main(String[] args) throws Exception {
+ float[] a = new float[256*1024];
+ float[] b = new float[256*1024];
+ float[] c = new float[256*1024];
+ float[] d = new float[256*1024];
+ sumReductionInit(a,b,c);
+ float total = 0;
+ float valid = (float)4.611686E18;
+
+ for(int j = 0; j < 2000; j++) {
+ total = sumReductionImplement(a,b,c,d,total);
+ }
+
+ if(total == valid) {
+ System.out.println("Success");
+ } else {
+ System.out.println("Invalid sum of elements variable in total: " + total);
+ System.out.println("Expected value = " + valid);
+ throw new Exception("Failed");
+ }
+ }
+
+ public static void sumReductionInit(
+ float[] a,
+ float[] b,
+ float[] c)
+ {
+ for(int j = 0; j < 1; j++)
+ {
+ for(int i = 0; i < a.length; i++)
+ {
+ a[i] = i * 1 + j;
+ b[i] = i * 1 - j;
+ c[i] = i + j;
+ }
+ }
+ }
+
+ public static float sumReductionImplement(
+ float[] a,
+ float[] b,
+ float[] c,
+ float[] d,
+ float total)
+ {
+ for(int i = 0; i < a.length; i++)
+ {
+ d[i] = Math.abs(-a[i] * -b[i]) + Math.abs(-a[i] * -c[i]) + Math.abs(-b[i] * -c[i]);
+ total += d[i];
+ }
+ return total;
+ }
+
+}
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.openjdk.java.net/pipermail/hotspot-compiler-dev/attachments/20150924/2a215796/attachment-0001.html>
More information about the hotspot-compiler-dev
mailing list