/hg/icedtea8-forest/hotspot: 6 new changesets

andrew at icedtea.classpath.org andrew at icedtea.classpath.org
Tue Oct 17 02:41:42 UTC 2017


changeset 92f0dbe76a13 in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=92f0dbe76a13
author: mdoerr
date: Wed Oct 11 20:39:39 2017 +0100

	8145913, PR3466, RH1498309: PPC64: add Montgomery multiply intrinsic
	Reviewed-by: aph, goetz


changeset 9b9d9e11c04d in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=9b9d9e11c04d
author: goetz
date: Thu Oct 27 12:22:28 2016 +0200

	8168318, PR3466, RH1498320: PPC64: Use cmpldi instead of li/cmpld
	Reviewed-by: goetz
	Contributed-by: igor.nunes at eldorado.org.br


changeset 3c499a0ba92b in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=3c499a0ba92b
author: mdoerr
date: Fri Nov 25 11:15:12 2016 -0200

	8170328, PR3466, RH1498321: PPC64: Use andis instead of lis/and
	Reviewed-by: goetz, mdoerr
	Contributed-by: Igor Nunes <igor.nunes at eldorado.org.br>


changeset 5c00d5cd7677 in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=5c00d5cd7677
author: mdoerr
date: Wed Oct 11 21:04:26 2017 +0100

	8181810, PR3466, RH1498319: PPC64: Leverage extrdi for bitfield extract
	Reviewed-by: mdoerr, simonis
	Contributed-by: Matthew Brandyberry <mbrandy at linux.vnet.ibm.com>


changeset 302eb515bf52 in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=302eb515bf52
author: dbuck
date: Fri Aug 11 23:51:07 2017 -0400

	8185164, PR3438: GetOwnedMonitorInfo() returns incorrect owned monitor
	Summary: The GetOwnedMonitorInfo() should not return a pending monitor
	Reviewed-by: dcubed


changeset 542f4e30fdff in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=542f4e30fdff
author: roland
date: Wed Sep 27 16:17:47 2017 +0200

	8187822, PR8187822, RH1494230: C2 conditonal move optimization might create broken graph
	Reviewed-by: kvn


diffstat:

 src/cpu/ppc/vm/assembler_ppc.hpp                 |    2 +
 src/cpu/ppc/vm/assembler_ppc.inline.hpp          |    2 +
 src/cpu/ppc/vm/c2_init_ppc.cpp                   |    6 +
 src/cpu/ppc/vm/ppc.ad                            |   64 +++++-
 src/cpu/ppc/vm/sharedRuntime_ppc.cpp             |  244 +++++++++++++++++++++++
 src/cpu/ppc/vm/stubGenerator_ppc.cpp             |    8 +
 src/cpu/ppc/vm/templateInterpreter_ppc.cpp       |    8 +-
 src/cpu/ppc/vm/vm_version_ppc.cpp                |    7 +
 src/share/vm/opto/library_call.cpp               |   41 ++-
 src/share/vm/opto/loopopts.cpp                   |   24 +-
 src/share/vm/opto/runtime.cpp                    |   20 +-
 src/share/vm/runtime/objectMonitor.cpp           |    4 +-
 test/compiler/loopopts/TestCMovSplitThruPhi.java |   67 ++++++
 13 files changed, 469 insertions(+), 28 deletions(-)

diffs (truncated from 699 to 500 lines):

diff -r 542c122b1d7d -r 542f4e30fdff src/cpu/ppc/vm/assembler_ppc.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.hpp	Mon Jul 31 04:13:32 2017 +0100
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Wed Sep 27 16:17:47 2017 +0200
@@ -1179,6 +1179,8 @@
   inline void mullw_( Register d, Register a, Register b);
   inline void mulhw(  Register d, Register a, Register b);
   inline void mulhw_( Register d, Register a, Register b);
+  inline void mulhwu( Register d, Register a, Register b);
+  inline void mulhwu_(Register d, Register a, Register b);
   inline void mulhd(  Register d, Register a, Register b);
   inline void mulhd_( Register d, Register a, Register b);
   inline void mulhdu( Register d, Register a, Register b);
diff -r 542c122b1d7d -r 542f4e30fdff src/cpu/ppc/vm/assembler_ppc.inline.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Mon Jul 31 04:13:32 2017 +0100
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Wed Sep 27 16:17:47 2017 +0200
@@ -109,6 +109,8 @@
 inline void Assembler::mullw_( Register d, Register a, Register b) { emit_int32(MULLW_OPCODE  | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); }
 inline void Assembler::mulhw(  Register d, Register a, Register b) { emit_int32(MULHW_OPCODE  | rt(d) | ra(a) | rb(b) | rc(0)); }
 inline void Assembler::mulhw_( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE  | rt(d) | ra(a) | rb(b) | rc(1)); }
+inline void Assembler::mulhwu( Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
+inline void Assembler::mulhwu_(Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
 inline void Assembler::mulhd(  Register d, Register a, Register b) { emit_int32(MULHD_OPCODE  | rt(d) | ra(a) | rb(b) | rc(0)); }
 inline void Assembler::mulhd_( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE  | rt(d) | ra(a) | rb(b) | rc(1)); }
 inline void Assembler::mulhdu( Register d, Register a, Register b) { emit_int32(MULHDU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
diff -r 542c122b1d7d -r 542f4e30fdff src/cpu/ppc/vm/c2_init_ppc.cpp
--- a/src/cpu/ppc/vm/c2_init_ppc.cpp	Mon Jul 31 04:13:32 2017 +0100
+++ b/src/cpu/ppc/vm/c2_init_ppc.cpp	Wed Sep 27 16:17:47 2017 +0200
@@ -45,4 +45,10 @@
       FLAG_SET_ERGO(bool, InsertEndGroupPPC64, true);
     }
   }
+
+  if (OptimizeFill) {
+    warning("OptimizeFill is not supported on this CPU.");
+    FLAG_SET_DEFAULT(OptimizeFill, false);
+  }
+
 }
diff -r 542c122b1d7d -r 542f4e30fdff src/cpu/ppc/vm/ppc.ad
--- a/src/cpu/ppc/vm/ppc.ad	Mon Jul 31 04:13:32 2017 +0100
+++ b/src/cpu/ppc/vm/ppc.ad	Wed Sep 27 16:17:47 2017 +0200
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2011, 2014, Oracle and/or its affiliates. All rights reserved.
-// Copyright 2012, 2014 SAP AG. All rights reserved.
+// Copyright (c) 2012, 2017 SAP SE. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -8610,6 +8610,44 @@
   ins_pipe(pipe_class_default);
 %}
 
+// Bitfield Extract: URShiftI + AndI
+instruct andI_urShiftI_regI_immI_immIpow2minus1(iRegIdst dst, iRegIsrc src1, immI src2, immIpow2minus1 src3) %{
+  match(Set dst (AndI (URShiftI src1 src2) src3));
+
+  format %{ "EXTRDI  $dst, $src1, shift=$src2, mask=$src3 \t// int bitfield extract" %}
+  size(4);
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_rldicl);
+    int rshift = ($src2$$constant) & 0x1f;
+    int length = log2_long(((jlong) $src3$$constant) + 1);
+    if (rshift + length > 32) {
+      // if necessary, adjust mask to omit rotated bits.
+      length = 32 - rshift;
+    }
+    __ extrdi($dst$$Register, $src1$$Register, length, 64 - (rshift + length));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// Bitfield Extract: URShiftL + AndL
+instruct andL_urShiftL_regL_immI_immLpow2minus1(iRegLdst dst, iRegLsrc src1, immI src2, immLpow2minus1 src3) %{
+  match(Set dst (AndL (URShiftL src1 src2) src3));
+
+  format %{ "EXTRDI  $dst, $src1, shift=$src2, mask=$src3 \t// long bitfield extract" %}
+  size(4);
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_rldicl);
+    int rshift  = ($src2$$constant) & 0x3f;
+    int length = log2_long(((jlong) $src3$$constant) + 1);
+    if (rshift + length > 64) {
+      // if necessary, adjust mask to omit rotated bits.
+      length = 64 - rshift;
+    }
+    __ extrdi($dst$$Register, $src1$$Register, length, 64 - (rshift + length));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct sxtI_reg(iRegIdst dst, iRegIsrc src) %{
   match(Set dst (ConvL2I (ConvI2L src)));
 
@@ -8889,6 +8927,19 @@
   ins_pipe(pipe_class_default);
 %}
 
+// Left shifted Immediate And
+instruct andI_reg_immIhi16(iRegIdst dst, iRegIsrc src1, immIhi16  src2, flagsRegCR0 cr0) %{
+  match(Set dst (AndI src1 src2));
+  effect(KILL cr0);
+  format %{ "ANDIS   $dst, $src1, $src2.hi" %}
+  size(4);
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_andis_);
+    __ andis_($dst$$Register, $src1$$Register, (int)((unsigned short)(($src2$$constant & 0xFFFF0000) >> 16)));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 // Immediate And
 instruct andI_reg_uimm16(iRegIdst dst, iRegIsrc src1, uimmI16 src2, flagsRegCR0 cr0) %{
   match(Set dst (AndI src1 src2));
@@ -10571,6 +10622,17 @@
   ins_pipe(pipe_class_compare);
 %}
 
+instruct cmpP_reg_null(flagsReg crx, iRegP_N2P src1, immP_0or1 src2) %{
+  match(Set crx (CmpP src1 src2));
+  format %{ "CMPLDI   $crx, $src1, $src2 \t// ptr" %}
+  size(4);
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_cmpl);
+    __ cmpldi($crx$$CondRegister, $src1$$Register, (int)((short)($src2$$constant & 0xFFFF)));
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
 // Used in postalloc expand.
 instruct cmpP_reg_imm16(flagsReg crx, iRegPsrc src1, immL16 src2) %{
   // This match rule prevents reordering of node before a safepoint.
diff -r 542c122b1d7d -r 542f4e30fdff src/cpu/ppc/vm/sharedRuntime_ppc.cpp
--- a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Mon Jul 31 04:13:32 2017 +0100
+++ b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Wed Sep 27 16:17:47 2017 +0200
@@ -42,6 +42,8 @@
 #include "opto/runtime.hpp"
 #endif
 
+#include <alloca.h>
+
 #define __ masm->
 
 #ifdef PRODUCT
@@ -3269,3 +3271,245 @@
   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize,
                                        oop_maps, true);
 }
+
+
+//------------------------------Montgomery multiplication------------------------
+//
+
+// Subtract 0:b from carry:a. Return carry.
+static unsigned long
+sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
+  long i = 0;
+  unsigned long tmp, tmp2;
+  __asm__ __volatile__ (
+    "subfc  %[tmp], %[tmp], %[tmp]   \n" // pre-set CA
+    "mtctr  %[len]                   \n"
+    "0:                              \n"
+    "ldx    %[tmp], %[i], %[a]       \n"
+    "ldx    %[tmp2], %[i], %[b]      \n"
+    "subfe  %[tmp], %[tmp2], %[tmp]  \n" // subtract extended
+    "stdx   %[tmp], %[i], %[a]       \n"
+    "addi   %[i], %[i], 8            \n"
+    "bdnz   0b                       \n"
+    "addme  %[tmp], %[carry]         \n" // carry + CA - 1
+    : [i]"+b"(i), [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2)
+    : [a]"r"(a), [b]"r"(b), [carry]"r"(carry), [len]"r"(len)
+    : "ctr", "xer", "memory"
+  );
+  return tmp;
+}
+
+// Multiply (unsigned) Long A by Long B, accumulating the double-
+// length result into the accumulator formed of T0, T1, and T2.
+inline void MACC(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
+  unsigned long hi, lo;
+  __asm__ __volatile__ (
+    "mulld  %[lo], %[A], %[B]    \n"
+    "mulhdu %[hi], %[A], %[B]    \n"
+    "addc   %[T0], %[T0], %[lo]  \n"
+    "adde   %[T1], %[T1], %[hi]  \n"
+    "addze  %[T2], %[T2]         \n"
+    : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
+    : [A]"r"(A), [B]"r"(B)
+    : "xer"
+  );
+}
+
+// As above, but add twice the double-length result into the
+// accumulator.
+inline void MACC2(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
+  unsigned long hi, lo;
+  __asm__ __volatile__ (
+    "mulld  %[lo], %[A], %[B]    \n"
+    "mulhdu %[hi], %[A], %[B]    \n"
+    "addc   %[T0], %[T0], %[lo]  \n"
+    "adde   %[T1], %[T1], %[hi]  \n"
+    "addze  %[T2], %[T2]         \n"
+    "addc   %[T0], %[T0], %[lo]  \n"
+    "adde   %[T1], %[T1], %[hi]  \n"
+    "addze  %[T2], %[T2]         \n"
+    : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
+    : [A]"r"(A), [B]"r"(B)
+    : "xer"
+  );
+}
+
+// Fast Montgomery multiplication. The derivation of the algorithm is
+// in "A Cryptographic Library for the Motorola DSP56000,
+// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237".
+static void
+montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
+                    unsigned long m[], unsigned long inv, int len) {
+  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+  int i;
+
+  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+
+  for (i = 0; i < len; i++) {
+    int j;
+    for (j = 0; j < i; j++) {
+      MACC(a[j], b[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    MACC(a[i], b[0], t0, t1, t2);
+    m[i] = t0 * inv;
+    MACC(m[i], n[0], t0, t1, t2);
+
+    assert(t0 == 0, "broken Montgomery multiply");
+
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  for (i = len; i < 2*len; i++) {
+    int j;
+    for (j = i-len+1; j < len; j++) {
+      MACC(a[j], b[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    m[i-len] = t0;
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  while (t0) {
+    t0 = sub(m, n, t0, len);
+  }
+}
+
+// Fast Montgomery squaring. This uses asymptotically 25% fewer
+// multiplies so it should be up to 25% faster than Montgomery
+// multiplication. However, its loop control is more complex and it
+// may actually run slower on some machines.
+static void
+montgomery_square(unsigned long a[], unsigned long n[],
+                  unsigned long m[], unsigned long inv, int len) {
+  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+  int i;
+
+  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+
+  for (i = 0; i < len; i++) {
+    int j;
+    int end = (i+1)/2;
+    for (j = 0; j < end; j++) {
+      MACC2(a[j], a[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    if ((i & 1) == 0) {
+      MACC(a[j], a[j], t0, t1, t2);
+    }
+    for (; j < i; j++) {
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    m[i] = t0 * inv;
+    MACC(m[i], n[0], t0, t1, t2);
+
+    assert(t0 == 0, "broken Montgomery square");
+
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  for (i = len; i < 2*len; i++) {
+    int start = i-len+1;
+    int end = start + (len - start)/2;
+    int j;
+    for (j = start; j < end; j++) {
+      MACC2(a[j], a[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    if ((i & 1) == 0) {
+      MACC(a[j], a[j], t0, t1, t2);
+    }
+    for (; j < len; j++) {
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    m[i-len] = t0;
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  while (t0) {
+    t0 = sub(m, n, t0, len);
+  }
+}
+
+// The threshold at which squaring is advantageous was determined
+// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
+// Doesn't seem to be relevant for Power8 so we use the same value.
+#define MONTGOMERY_SQUARING_THRESHOLD 64
+
+// Copy len longwords from s to d, word-swapping as we go. The
+// destination array is reversed.
+static void reverse_words(unsigned long *s, unsigned long *d, int len) {
+  d += len;
+  while(len-- > 0) {
+    d--;
+    unsigned long s_val = *s;
+    // Swap words in a longword on little endian machines.
+#ifdef VM_LITTLE_ENDIAN
+     s_val = (s_val << 32) | (s_val >> 32);
+#endif
+    *d = s_val;
+    s++;
+  }
+}
+
+void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
+                                        jint len, jlong inv,
+                                        jint *m_ints) {
+  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
+  int longwords = len/2;
+  assert(longwords > 0, "unsupported");
+
+  // Make very sure we don't use so much space that the stack might
+  // overflow. 512 jints corresponds to an 16384-bit integer and
+  // will use here a total of 8k bytes of stack space.
+  int total_allocation = longwords * sizeof (unsigned long) * 4;
+  guarantee(total_allocation <= 8192, "must be");
+  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+
+  // Local scratch arrays
+  unsigned long
+    *a = scratch + 0 * longwords,
+    *b = scratch + 1 * longwords,
+    *n = scratch + 2 * longwords,
+    *m = scratch + 3 * longwords;
+
+  reverse_words((unsigned long *)a_ints, a, longwords);
+  reverse_words((unsigned long *)b_ints, b, longwords);
+  reverse_words((unsigned long *)n_ints, n, longwords);
+
+  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
+
+  reverse_words(m, (unsigned long *)m_ints, longwords);
+}
+
+void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
+                                      jint len, jlong inv,
+                                      jint *m_ints) {
+  assert(len % 2 == 0, "array length in montgomery_square must be even");
+  int longwords = len/2;
+  assert(longwords > 0, "unsupported");
+
+  // Make very sure we don't use so much space that the stack might
+  // overflow. 512 jints corresponds to an 16384-bit integer and
+  // will use here a total of 6k bytes of stack space.
+  int total_allocation = longwords * sizeof (unsigned long) * 3;
+  guarantee(total_allocation <= 8192, "must be");
+  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+
+  // Local scratch arrays
+  unsigned long
+    *a = scratch + 0 * longwords,
+    *n = scratch + 1 * longwords,
+    *m = scratch + 2 * longwords;
+
+  reverse_words((unsigned long *)a_ints, a, longwords);
+  reverse_words((unsigned long *)n_ints, n, longwords);
+
+  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
+    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
+  } else {
+    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
+  }
+
+  reverse_words(m, (unsigned long *)m_ints, longwords);
+}
diff -r 542c122b1d7d -r 542f4e30fdff src/cpu/ppc/vm/stubGenerator_ppc.cpp
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Jul 31 04:13:32 2017 +0100
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Wed Sep 27 16:17:47 2017 +0200
@@ -2094,6 +2094,14 @@
     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);
+    if (UseMontgomeryMultiplyIntrinsic) {
+      StubRoutines::_montgomeryMultiply
+        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
+    }
+    if (UseMontgomerySquareIntrinsic) {
+      StubRoutines::_montgomerySquare
+        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
+    }
   }
 
  public:
diff -r 542c122b1d7d -r 542f4e30fdff src/cpu/ppc/vm/templateInterpreter_ppc.cpp
--- a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp	Mon Jul 31 04:13:32 2017 +0100
+++ b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp	Wed Sep 27 16:17:47 2017 +0200
@@ -265,7 +265,7 @@
       __ cmpdi(CCR0, Rmdo, 0);
       __ beq(CCR0, no_mdo);
 
-      // Increment backedge counter in the MDO.
+      // Increment invocation counter in the MDO.
       const int mdo_bc_offs = in_bytes(MethodData::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
       __ lwz(Rscratch2, mdo_bc_offs, Rmdo);
       __ addi(Rscratch2, Rscratch2, increment);
@@ -277,12 +277,12 @@
     }
 
     // Increment counter in MethodCounters*.
-    const int mo_bc_offs = in_bytes(MethodCounters::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
+    const int mo_ic_offs = in_bytes(MethodCounters::invocation_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
     __ bind(no_mdo);
     __ get_method_counters(R19_method, R3_counters, done);
-    __ lwz(Rscratch2, mo_bc_offs, R3_counters);
+    __ lwz(Rscratch2, mo_ic_offs, R3_counters);
     __ addi(Rscratch2, Rscratch2, increment);
-    __ stw(Rscratch2, mo_bc_offs, R3_counters);
+    __ stw(Rscratch2, mo_ic_offs, R3_counters);
     __ load_const_optimized(Rscratch1, mask, R0);
     __ and_(Rscratch1, Rscratch2, Rscratch1);
     __ beq(CCR0, *overflow);
diff -r 542c122b1d7d -r 542f4e30fdff src/cpu/ppc/vm/vm_version_ppc.cpp
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Jul 31 04:13:32 2017 +0100
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Wed Sep 27 16:17:47 2017 +0200
@@ -181,6 +181,13 @@
   if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) {
     FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
   }
+
+  if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
+    UseMontgomeryMultiplyIntrinsic = true;
+  }
+  if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
+    UseMontgomerySquareIntrinsic = true;
+  }
 }
 
 void VM_Version::print_features() {
diff -r 542c122b1d7d -r 542f4e30fdff src/share/vm/opto/library_call.cpp
--- a/src/share/vm/opto/library_call.cpp	Mon Jul 31 04:13:32 2017 +0100
+++ b/src/share/vm/opto/library_call.cpp	Wed Sep 27 16:17:47 2017 +0200
@@ -6031,11 +6031,21 @@
     Node* n_start = array_element_address(n, intcon(0), n_elem);
     Node* m_start = array_element_address(m, intcon(0), m_elem);
 
-    Node* call = make_runtime_call(RC_LEAF,
-                                   OptoRuntime::montgomeryMultiply_Type(),
-                                   stubAddr, stubName, TypePtr::BOTTOM,
-                                   a_start, b_start, n_start, len, inv, top(),
-                                   m_start);
+    Node* call = NULL;
+    if (CCallingConventionRequiresIntsAsLongs) {
+      Node* len_I2L = ConvI2L(len);
+      call = make_runtime_call(RC_LEAF,
+                               OptoRuntime::montgomeryMultiply_Type(),
+                               stubAddr, stubName, TypePtr::BOTTOM,
+                               a_start, b_start, n_start, len_I2L XTOP, inv,
+                               top(), m_start);
+    } else {
+      call = make_runtime_call(RC_LEAF,
+                               OptoRuntime::montgomeryMultiply_Type(),
+                               stubAddr, stubName, TypePtr::BOTTOM,
+                               a_start, b_start, n_start, len, inv, top(),
+                               m_start);
+    }
     set_result(m);
   }
 
@@ -6085,11 +6095,22 @@
     Node* n_start = array_element_address(n, intcon(0), n_elem);
     Node* m_start = array_element_address(m, intcon(0), m_elem);
 
-    Node* call = make_runtime_call(RC_LEAF,
-                                   OptoRuntime::montgomerySquare_Type(),
-                                   stubAddr, stubName, TypePtr::BOTTOM,
-                                   a_start, n_start, len, inv, top(),
-                                   m_start);
+    Node* call = NULL;
+    if (CCallingConventionRequiresIntsAsLongs) {
+      Node* len_I2L = ConvI2L(len);
+      call = make_runtime_call(RC_LEAF,
+                               OptoRuntime::montgomerySquare_Type(),
+                               stubAddr, stubName, TypePtr::BOTTOM,
+                               a_start, n_start, len_I2L XTOP, inv, top(),
+                               m_start);


More information about the distro-pkg-dev mailing list