[aarch64-port-dev ] RFR: 8171410: aarch64: long multiplyExact shifts by 31 instead of 63

Edward Nevill edward.nevill at gmail.com
Mon Dec 19 10:09:22 UTC 2016


Hi,

Please review the following webrev

http://cr.openjdk.java.net/~enevill/8171410/webrev

(changeset is included inline below because of issues with cr.openjdk.java.net)

Bug report here

https://bugs.openjdk.java.net/browse/JDK-8171410

The patterns for long multiplyExact in aarch64.ad shift by 31 instead of 63:

  0x000003ff64b29aa8: mul       x8, x19, x20
  0x000003ff64b29aac: smulh     x9, x19, x20
  0x000003ff64b29ab0: cmp       x9, x8, asr #31  <<<<<<< should be 63
  0x000003ff64b29ab4: b.ne      0x000003ff64b29b4c

The effect is that the overflow branch is taken in cases where the multiply has not overflowed. For example, 0x5a5a5a5aL * 0xa5a5a5a5L does not overflow (the product is roughly 4.2e18, well within the range of a signed 64-bit long), but the overflow branch above is still taken.
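
To see why, recall what the generated sequence computes: mul produces bits 0..63 of the 128-bit product and smulh produces bits 64..127. The multiply has overflowed a signed 64-bit result exactly when the high half differs from the sign extension of the low half, i.e. the low half shifted right arithmetically by 63. Below is a minimal Java sketch of the correct check next to the buggy one (class and method names are illustrative; Math.multiplyHigh, which mirrors smulh, exists from JDK 9):

class OverflowCheckDemo {
  // Overflow iff the high 64 bits of the 128-bit product differ from
  // the sign extension of the low 64 bits.
  static boolean overflows(long a, long b) {
    long lo = a * b;                   // mul   -> result bits 0..63
    long hi = Math.multiplyHigh(a, b); // smulh -> result bits 64..127
    return hi != (lo >> 63);           // cmp rscratch2, rscratch1, ASR #63
  }

  // The check as generated before the fix: shifts by 31 instead of 63.
  static boolean buggyOverflows(long a, long b) {
    long lo = a * b;
    long hi = Math.multiplyHigh(a, b);
    return hi != (lo >> 31);           // cmp rscratch2, rscratch1, ASR #31
  }

  public static void main(String[] args) {
    long a = 0x5a5a5a5aL, b = 0xa5a5a5a5L;
    System.out.println(overflows(a, b));      // false: product fits in a long
    System.out.println(buggyOverflows(a, b)); // true: spurious overflow
  }
}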

This was not detected in testing because, when the overflow branch is taken, C2 falls back to a call to the real (non-intrinsic) multiplyExact, which then executes correctly.

The effect is a severe performance regression. Using the following test case:

class Mul {
  static long multest(long a, long b) {
    long res = a;
    for (int i = 0; i < 100000000; i++) {
      res += Math.multiplyExact(a, b);
      a ^= 1L; b ^= 1L; // Stop loop invariant hoisting
    }
    return res;
  }

  public static void main(String argv[]) {
    long a = 0x5a5a5a5aL;
    long b = 0xa5a5a5a5L;
    System.out.println("res " + a + ", " + b + " = " + multest(a, b));
  }
}
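
The timings below can be reproduced with something like the following (commands are illustrative, not from the original run):

  javac Mul.java
  time java -XX:-UseMathExactIntrinsics Mul
  time java -XX:+UseMathExactIntrinsics Mul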

With -XX:-UseMathExactIntrinsics this takes 1.95 s
With -XX:+UseMathExactIntrinsics this takes 13.42 s

With the following webrev applied

http://cr.openjdk.java.net/~enevill/8171410/webrev

-XX:-UseMathExactIntrinsics takes 1.98 s
-XX:+UseMathExactIntrinsics takes 0.95 s

--- CUT HERE ---
# HG changeset patch
# User enevill
# Date 1482100004 18000
#      Sun Dec 18 17:26:44 2016 -0500
# Node ID c0e2c655e28064fd01b54a47915037d8f2423f81
# Parent  66e2100be052e42af8064e519658185112be6dc3
8171410: aarch64: long multiplyExact shifts by 31 instead of 63
Reviewed-by: aph

diff --git a/src/cpu/aarch64/vm/aarch64.ad b/src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad
+++ b/src/cpu/aarch64/vm/aarch64.ad
@@ -14086,7 +14086,7 @@
 
   format %{ "mul   rscratch1, $op1, $op2\t#overflow check long\n\t"
             "smulh rscratch2, $op1, $op2\n\t"
-            "cmp   rscratch2, rscratch1, ASR #31\n\t"
+            "cmp   rscratch2, rscratch1, ASR #63\n\t"
             "movw  rscratch1, #0x80000000\n\t"
             "cselw rscratch1, rscratch1, zr, NE\n\t"
             "cmpw  rscratch1, #1" %}
@@ -14094,7 +14094,7 @@
   ins_encode %{
     __ mul(rscratch1, $op1$$Register, $op2$$Register);   // Result bits 0..63
     __ smulh(rscratch2, $op1$$Register, $op2$$Register); // Result bits 64..127
-    __ cmp(rscratch2, rscratch1, Assembler::ASR, 31);    // Top is pure sign ext
+    __ cmp(rscratch2, rscratch1, Assembler::ASR, 63);    // Top is pure sign ext
     __ movw(rscratch1, 0x80000000);                    // Develop 0 (EQ),
     __ cselw(rscratch1, rscratch1, zr, Assembler::NE); // or 0x80000000 (NE)
     __ cmpw(rscratch1, 1);                             // 0x80000000 - 1 => VS
@@ -14112,7 +14112,7 @@
 
   format %{ "mul   rscratch1, $op1, $op2\t#overflow check long\n\t"
             "smulh rscratch2, $op1, $op2\n\t"
-            "cmp   rscratch2, rscratch1, ASR #31\n\t"
+            "cmp   rscratch2, rscratch1, ASR #63\n\t"
             "b$cmp $labl" %}
   ins_cost(4 * INSN_COST); // Branch is rare so treat as INSN_COST
   ins_encode %{
@@ -14120,7 +14120,7 @@
     Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
     __ mul(rscratch1, $op1$$Register, $op2$$Register);   // Result bits 0..63
     __ smulh(rscratch2, $op1$$Register, $op2$$Register); // Result bits 64..127
-    __ cmp(rscratch2, rscratch1, Assembler::ASR, 31);    // Top is pure sign ext
+    __ cmp(rscratch2, rscratch1, Assembler::ASR, 63);    // Top is pure sign ext
     __ br(cond == Assembler::VS ? Assembler::NE : Assembler::EQ, *L);
   %}
 
--- CUT HERE ---
