RFR: Polymorphic Guarded Inlining in C2

Ludovic Henry luhenry at microsoft.com
Thu Feb 6 17:17:46 UTC 2020


Hello,

In our ongoing effort to improve performance, I've looked at inlining and, more specifically, at polymorphic guarded inlining. Today in HotSpot, the maximum number of type guards at any call site is two (bimorphic guarded inlining). However, Graal and Zing have observed great results from increasing that limit.

Below is a patch that makes the number of type guards configurable through the `TypeProfileWidth` global.

Testing:
Passing tier1 on Linux and Windows, plus other large applications (through the Adopt testing scripts)

Benchmarking:
To gather data, we ran a benchmark against Apache Pinot and observed the following results:

[cid:image001.png at 01D5D2DB.F5165550]

We observe an improvement of close to 20% on this sample benchmark with a morphism (= width) of 3 or 4. We are currently validating these numbers on a more extensive set of benchmarks and platforms, and I'll share them as soon as we have them.

I am happy to provide more information; just let me know if you have any questions.

Thank you,

--
Ludovic

diff --git a/src/hotspot/share/ci/ciCallProfile.hpp b/src/hotspot/share/ci/ciCallProfile.hpp
index 73854806ed..845070fbe1 100644
--- a/src/hotspot/share/ci/ciCallProfile.hpp
+++ b/src/hotspot/share/ci/ciCallProfile.hpp
@@ -38,7 +38,7 @@ private:
   friend class ciMethod;
   friend class ciMethodHandle;

-  enum { MorphismLimit = 2 }; // Max call site's morphism we care about
+  enum { MorphismLimit = 8 }; // Max call site's morphism we care about
   int  _limit;                // number of receivers have been determined
   int  _morphism;             // determined call site's morphism
   int  _count;                // # times has this call been executed
@@ -47,6 +47,7 @@ private:
   ciKlass*  _receiver[MorphismLimit + 1];  // receivers (exact)

   ciCallProfile() {
+    guarantee(MorphismLimit >= TypeProfileWidth, "MorphismLimit can't be smaller than TypeProfileWidth");
     _limit = 0;
     _morphism    = 0;
     _count = -1;
diff --git a/src/hotspot/share/ci/ciMethod.cpp b/src/hotspot/share/ci/ciMethod.cpp
index d771be8dac..8e4ecc8597 100644
--- a/src/hotspot/share/ci/ciMethod.cpp
+++ b/src/hotspot/share/ci/ciMethod.cpp
@@ -496,9 +496,7 @@ ciCallProfile ciMethod::call_profile_at_bci(int bci) {
       // Every profiled call site has a counter.
       int count = check_overflow(data->as_CounterData()->count(), java_code_at_bci(bci));

-      if (!data->is_ReceiverTypeData()) {
-        result._receiver_count[0] = 0;  // that's a definite zero
-      } else { // ReceiverTypeData is a subclass of CounterData
+      if (data->is_ReceiverTypeData()) {
         ciReceiverTypeData* call = (ciReceiverTypeData*)data->as_ReceiverTypeData();
         // In addition, virtual call sites have receiver type information
         int receivers_count_total = 0;
@@ -515,7 +513,7 @@ ciCallProfile ciMethod::call_profile_at_bci(int bci) {
           // is recorded or an associated counter is incremented, but not both. With
           // tiered compilation, however, both can happen due to the interpreter and
           // C1 profiling invocations differently. Address that inconsistency here.
-          if (morphism == 1 && count > 0) {
+          if (morphism >= 1 && count > 0) {
             epsilon = count;
             count = 0;
           }
@@ -531,25 +529,26 @@ ciCallProfile ciMethod::call_profile_at_bci(int bci) {
          // If we extend profiling to record methods,
           // we will set result._method also.
         }
+        result._morphism = morphism;
         // Determine call site's morphism.
         // The call site count is 0 with known morphism (only 1 or 2 receivers)
         // or < 0 in the case of a type check failure for checkcast, aastore, instanceof.
         // The call site count is > 0 in the case of a polymorphic virtual call.
-        if (morphism > 0 && morphism == result._limit) {
-           // The morphism <= MorphismLimit.
-           if ((morphism <  ciCallProfile::MorphismLimit) ||
-               (morphism == ciCallProfile::MorphismLimit && count == 0)) {
+        assert(result._morphism == result._limit, "");
#ifdef ASSERT
+        if (result._morphism > 0) {
+           // The morphism <= TypeProfileWidth.
+           if ((result._morphism <  TypeProfileWidth) ||
+               (result._morphism == TypeProfileWidth && count == 0)) {
              if (count > 0) {
                this->print_short_name(tty);
                tty->print_cr(" @ bci:%d", bci);
                this->print_codes();
                assert(false, "this call site should not be polymorphic");
              }
-#endif
-             result._morphism = morphism;
            }
         }
+#endif
         // Make the count consistent if this is a call profile. If count is
         // zero or less, presume that this is a typecheck profile and
         // do nothing.  Otherwise, increase count to be the sum of all
@@ -578,7 +577,7 @@ void ciCallProfile::add_receiver(ciKlass* receiver, int receiver_count) {
   }
   _receiver[i] = receiver;
   _receiver_count[i] = receiver_count;
-  if (_limit < MorphismLimit) _limit++;
+  if (_limit < TypeProfileWidth) _limit++;
}


diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp
index d605bdb7bd..7a8dee43e5 100644
--- a/src/hotspot/share/opto/c2_globals.hpp
+++ b/src/hotspot/share/opto/c2_globals.hpp
@@ -389,9 +389,16 @@
   product(bool, UseBimorphicInlining, true,                                 \
           "Profiling based inlining for two receivers")                     \
                                                                             \
+  product(bool, UsePolymorphicInlining, true,                               \
+          "Profiling based inlining for two or more receivers")             \
+                                                                            \
   product(bool, UseOnlyInlinedBimorphic, true,                              \
           "Don't use BimorphicInlining if can't inline a second method")    \
                                                                             \
+  product(bool, UseOnlyInlinedPolymorphic, true,                            \
+          "Don't use PolymorphicInlining if can't inline a non-major "      \
+          "receiver's method")                                              \
+                                                                            \
   product(bool, InsertMemBarAfterArraycopy, true,                           \
           "Insert memory barrier after arraycopy call")                     \
                                                                             \
diff --git a/src/hotspot/share/opto/doCall.cpp b/src/hotspot/share/opto/doCall.cpp
index 44ab387ac8..6f940209ce 100644
--- a/src/hotspot/share/opto/doCall.cpp
+++ b/src/hotspot/share/opto/doCall.cpp
@@ -83,25 +83,23 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool

   // See how many times this site has been invoked.
   int site_count = profile.count();
-  int receiver_count = -1;
-  if (call_does_dispatch && UseTypeProfile && profile.has_receiver(0)) {
-    // Receivers in the profile structure are ordered by call counts
-    // so that the most called (major) receiver is profile.receiver(0).
-    receiver_count = profile.receiver_count(0);
-  }

   CompileLog* log = this->log();
   if (log != NULL) {
-    int rid = (receiver_count >= 0)? log->identify(profile.receiver(0)): -1;
-    int r2id = (rid != -1 && profile.has_receiver(1))? log->identify(profile.receiver(1)):-1;
+    ResourceMark rm;
+    int* rids = NEW_RESOURCE_ARRAY(int, TypeProfileWidth);
+    for (int i = 0; i < TypeProfileWidth && profile.has_receiver(i); i++) {
+      rids[i] = log->identify(profile.receiver(i));
+    }
     log->begin_elem("call method='%d' count='%d' prof_factor='%f'",
                     log->identify(callee), site_count, prof_factor);
     if (call_does_dispatch)  log->print(" virtual='1'");
     if (allow_inline)     log->print(" inline='1'");
-    if (receiver_count >= 0) {
-      log->print(" receiver='%d' receiver_count='%d'", rid, receiver_count);
-      if (profile.has_receiver(1)) {
-        log->print(" receiver2='%d' receiver2_count='%d'", r2id, profile.receiver_count(1));
+    for (int i = 0; i < TypeProfileWidth && profile.has_receiver(i); i++) {
+      if (i == 0) {
+        log->print(" receiver='%d' receiver_count='%d'", rids[i], profile.receiver_count(i));
+      } else {
+        log->print(" receiver%d='%d' receiver%d_count='%d'", i + 1, rids[i], i + 1, profile.receiver_count(i));
       }
     }
     if (callee->is_method_handle_intrinsic()) {
@@ -205,90 +203,96 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
     if (call_does_dispatch && site_count > 0 && UseTypeProfile) {
       // The major receiver's count >= TypeProfileMajorReceiverPercent of site_count.
       bool have_major_receiver = profile.has_receiver(0) && (100.*profile.receiver_prob(0) >= (float)TypeProfileMajorReceiverPercent);
-      ciMethod* receiver_method = NULL;

       int morphism = profile.morphism();
+
+      ciMethod** receiver_methods = NEW_RESOURCE_ARRAY(ciMethod*, MAX(1, morphism));
+      memset(receiver_methods, 0, sizeof(ciMethod*) * MAX(1, morphism));
+
       if (speculative_receiver_type != NULL) {
         if (!too_many_traps_or_recompiles(caller, bci, Deoptimization::Reason_speculate_class_check)) {
           // We have a speculative type, we should be able to resolve
           // the call. We do that before looking at the profiling at
-          // this invoke because it may lead to bimorphic inlining which
+          // this invoke because it may lead to polymorphic inlining which
           // a speculative type should help us avoid.
-          receiver_method = callee->resolve_invoke(jvms->method()->holder(),
-                                                   speculative_receiver_type);
-          if (receiver_method == NULL) {
+          receiver_methods[0] = callee->resolve_invoke(jvms->method()->holder(),
+                                                       speculative_receiver_type);
+          if (receiver_methods[0] == NULL) {
             speculative_receiver_type = NULL;
           } else {
             morphism = 1;
           }
         } else {
           // speculation failed before. Use profiling at the call
-          // (could allow bimorphic inlining for instance).
+          // (could allow polymorphic inlining for instance).
           speculative_receiver_type = NULL;
         }
       }
-      if (receiver_method == NULL &&
+      if (receiver_methods[0] == NULL &&
           (have_major_receiver || morphism == 1 ||
-           (morphism == 2 && UseBimorphicInlining))) {
-        // receiver_method = profile.method();
+           (morphism == 2 && UseBimorphicInlining) ||
+           (morphism >= 2 && UsePolymorphicInlining))) {
+        assert(profile.has_receiver(0), "no receiver at 0");
+        // receiver_methods[0] = profile.method();
         // Profiles do not suggest methods now.  Look it up in the major receiver.
-        receiver_method = callee->resolve_invoke(jvms->method()->holder(),
-                                                      profile.receiver(0));
+        receiver_methods[0] = callee->resolve_invoke(jvms->method()->holder(),
+                                                          profile.receiver(0));
       }
-      if (receiver_method != NULL) {
-        // The single majority receiver sufficiently outweighs the minority.
-        CallGenerator* hit_cg = this->call_generator(receiver_method,
-              vtable_index, !call_does_dispatch, jvms, allow_inline, prof_factor);
-        if (hit_cg != NULL) {
-          // Look up second receiver.
-          CallGenerator* next_hit_cg = NULL;
-          ciMethod* next_receiver_method = NULL;
-          if (morphism == 2 && UseBimorphicInlining) {
-            next_receiver_method = callee->resolve_invoke(jvms->method()->holder(),
-                                                               profile.receiver(1));
-            if (next_receiver_method != NULL) {
-              next_hit_cg = this->call_generator(next_receiver_method,
-                                  vtable_index, !call_does_dispatch, jvms,
-                                  allow_inline, prof_factor);
-              if (next_hit_cg != NULL && !next_hit_cg->is_inline() &&
-                  have_major_receiver && UseOnlyInlinedBimorphic) {
-                  // Skip if we can't inline second receiver's method
-                  next_hit_cg = NULL;
+      if (receiver_methods[0] != NULL) {
+        CallGenerator** hit_cgs = NEW_RESOURCE_ARRAY(CallGenerator*, MAX(1, morphism));
+        memset(hit_cgs, 0, sizeof(CallGenerator*) * MAX(1, morphism));
+
+        hit_cgs[0] = this->call_generator(receiver_methods[0],
+                            vtable_index, !call_does_dispatch, jvms,
+                            allow_inline, prof_factor);
+        if (hit_cgs[0] != NULL) {
+          if ((morphism == 2 && UseBimorphicInlining) || (morphism >= 2 && UsePolymorphicInlining)) {
+            for (int i = 1; i < morphism; i++) {
+              assert(profile.has_receiver(i), "no receiver at %d", i);
+              receiver_methods[i] = callee->resolve_invoke(jvms->method()->holder(),
+                                                            profile.receiver(i));
+              if (receiver_methods[i] != NULL) {
+                hit_cgs[i] = this->call_generator(receiver_methods[i],
+                                      vtable_index, !call_does_dispatch, jvms,
+                                      allow_inline, prof_factor);
+                if (hit_cgs[i] != NULL && !hit_cgs[i]->is_inline() && have_major_receiver &&
+                    ((morphism == 2 && UseOnlyInlinedBimorphic) || (morphism >= 2 && UseOnlyInlinedPolymorphic))) {
+                  // Skip if we can't inline non-major receiver's method
+                  hit_cgs[i] = NULL;
+                }
               }
             }
           }
           CallGenerator* miss_cg;
-          Deoptimization::DeoptReason reason = (morphism == 2
-                                               ? Deoptimization::Reason_bimorphic
+          Deoptimization::DeoptReason reason = (morphism >= 2
+                                               ? Deoptimization::Reason_polymorphic
                                                : Deoptimization::reason_class_check(speculative_receiver_type != NULL));
-          if ((morphism == 1 || (morphism == 2 && next_hit_cg != NULL)) &&
-              !too_many_traps_or_recompiles(caller, bci, reason)
-             ) {
+          if (!too_many_traps_or_recompiles(caller, bci, reason)) {
             // Generate uncommon trap for class check failure path
-            // in case of monomorphic or bimorphic virtual call site.
+            // in case of polymorphic virtual call site.
             miss_cg = CallGenerator::for_uncommon_trap(callee, reason,
                         Deoptimization::Action_maybe_recompile);
           } else {
             // Generate virtual call for class check failure path
-            // in case of polymorphic virtual call site.
+            // in case of megamorphic virtual call site.
             miss_cg = CallGenerator::for_virtual_call(callee, vtable_index);
           }
-          if (miss_cg != NULL) {
-            if (next_hit_cg != NULL) {
+          for (int i = morphism - 1; i >= 1 && miss_cg != NULL; i--) {
+            if (hit_cgs[i] != NULL) {
               assert(speculative_receiver_type == NULL, "shouldn't end up here if we used speculation");
-              trace_type_profile(C, jvms->method(), jvms->depth() - 1, jvms->bci(), next_receiver_method, profile.receiver(1), site_count, profile.receiver_count(1));
+              trace_type_profile(C, jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_methods[i], profile.receiver(i), site_count, profile.receiver_count(i));
               // We don't need to record dependency on a receiver here and below.
               // Whenever we inline, the dependency is added by Parse::Parse().
-              miss_cg = CallGenerator::for_predicted_call(profile.receiver(1), miss_cg, next_hit_cg, PROB_MAX);
-            }
-            if (miss_cg != NULL) {
-              ciKlass* k = speculative_receiver_type != NULL ? speculative_receiver_type : profile.receiver(0);
-              trace_type_profile(C, jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_method, k, site_count, receiver_count);
-              float hit_prob = speculative_receiver_type != NULL ? 1.0 : profile.receiver_prob(0);
-              CallGenerator* cg = CallGenerator::for_predicted_call(k, miss_cg, hit_cg, hit_prob);
-              if (cg != NULL)  return cg;
+              miss_cg = CallGenerator::for_predicted_call(profile.receiver(i), miss_cg, hit_cgs[i], PROB_MAX);
             }
           }
+          if (miss_cg != NULL) {
+            ciKlass* k = speculative_receiver_type != NULL ? speculative_receiver_type : profile.receiver(0);
+            trace_type_profile(C, jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_methods[0], k, site_count, profile.receiver_count(0));
+            float hit_prob = speculative_receiver_type != NULL ? 1.0 : profile.receiver_prob(0);
+            CallGenerator* cg = CallGenerator::for_predicted_call(k, miss_cg, hit_cgs[0], hit_prob);
+            if (cg != NULL)  return cg;
+          }
         }
      }
     }
diff --git a/src/hotspot/share/runtime/deoptimization.cpp b/src/hotspot/share/runtime/deoptimization.cpp
index 11df15e004..2d14b52854 100644
--- a/src/hotspot/share/runtime/deoptimization.cpp
+++ b/src/hotspot/share/runtime/deoptimization.cpp
@@ -2382,7 +2382,7 @@ const char* Deoptimization::_trap_reason_name[] = {
   "class_check",
   "array_check",
   "intrinsic" JVMCI_ONLY("_or_type_checked_inlining"),
-  "bimorphic" JVMCI_ONLY("_or_optimized_type_check"),
+  "polymorphic" JVMCI_ONLY("_or_optimized_type_check"),
   "profile_predicate",
   "unloaded",
   "uninitialized",
diff --git a/src/hotspot/share/runtime/deoptimization.hpp b/src/hotspot/share/runtime/deoptimization.hpp
index 1cfff5394e..c1eb998aba 100644
--- a/src/hotspot/share/runtime/deoptimization.hpp
+++ b/src/hotspot/share/runtime/deoptimization.hpp
@@ -60,12 +60,12 @@ class Deoptimization : AllStatic {
     Reason_class_check,           // saw unexpected object class (@bci)
     Reason_array_check,           // saw unexpected array class (aastore @bci)
     Reason_intrinsic,             // saw unexpected operand to intrinsic (@bci)
-    Reason_bimorphic,             // saw unexpected object class in bimorphic inlining (@bci)
+    Reason_polymorphic,           // saw unexpected object class in polymorphic inlining (@bci)

#if INCLUDE_JVMCI
     Reason_unreached0             = Reason_null_assert,
     Reason_type_checked_inlining  = Reason_intrinsic,
-    Reason_optimized_type_check   = Reason_bimorphic,
+    Reason_optimized_type_check   = Reason_polymorphic,
#endif

     Reason_profile_predicate,     // compiler generated predicate moved from frequent branch in a loop failed
diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp
index 94b544824e..ee761626c4 100644
--- a/src/hotspot/share/runtime/vmStructs.cpp
+++ b/src/hotspot/share/runtime/vmStructs.cpp
@@ -2388,7 +2388,7 @@ typedef HashtableEntry<InstanceKlass*, mtClass>  KlassHashtableEntry;
   declare_constant(Deoptimization::Reason_class_check)                    \
   declare_constant(Deoptimization::Reason_array_check)                    \
   declare_constant(Deoptimization::Reason_intrinsic)                      \
-  declare_constant(Deoptimization::Reason_bimorphic)                      \
+  declare_constant(Deoptimization::Reason_polymorphic)                    \
   declare_constant(Deoptimization::Reason_profile_predicate)              \
   declare_constant(Deoptimization::Reason_unloaded)                       \
   declare_constant(Deoptimization::Reason_uninitialized)                  \


More information about the hotspot-compiler-dev mailing list