scalar replacement of arrays affected by minor changes to surrounding code

Wed Nov 27 19:59:11 UTC 2019

Hi,

Recently, I was analyzing the performance of a workload and noticed a significant
amount of allocation from a single array allocation site. This allocation does not survive
the simple method which allocates it. A simplified version of the code is very similar to
the code being described by Govind. 

From this conversation I noticed JDK-8231291 and the patch provided by Roland.
I applied the patch to tip and after a few minor additions it indeed provides measurable
improvements to the workload I was measuring and passes all of my testing.

Roland is there anything I could do to help get this type of change into tip and any
appropriate back ports? I have tested some workloads with this patch and I think
I covered the required checks but I could have easily missed something.

Here is a modified version of Roland's original patch:

diff --git a/src/hotspot/share/opto/compile.cpp b/src/hotspot/share/opto/compile.cpp
index 08552e0756..98bf37f6e0 100644
--- a/src/hotspot/share/opto/compile.cpp
+++ b/src/hotspot/share/opto/compile.cpp
@@ -2292,7 +2292,7 @@ void Compile::Optimize() {
     if (has_loops()) {
       // Cleanup graph (remove dead nodes).
       TracePhase tp("idealLoop", &timers[_t_idealLoop]);
-      PhaseIdealLoop::optimize(igvn, LoopOptsNone);
+      PhaseIdealLoop::optimize(igvn, LoopOptsMaxUnroll);
       if (major_progress()) print_method(PHASE_PHASEIDEAL_BEFORE_EA, 2);
       if (failing())  return;
     }
diff --git a/src/hotspot/share/opto/compile.hpp b/src/hotspot/share/opto/compile.hpp
index 50906800ba..710aec8e3a 100644
--- a/src/hotspot/share/opto/compile.hpp
+++ b/src/hotspot/share/opto/compile.hpp
@@ -93,6 +93,7 @@ struct Final_Reshape_Counts;
 enum LoopOptsMode {
   LoopOptsDefault,
   LoopOptsNone,
+  LoopOptsMaxUnroll,
   LoopOptsShenandoahExpand,
   LoopOptsShenandoahPostExpand,
   LoopOptsSkipSplitIf,
diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp
index 2038a4c8f9..3c13a9aa10 100644
--- a/src/hotspot/share/opto/loopnode.cpp
+++ b/src/hotspot/share/opto/loopnode.cpp
@@ -2796,6 +2796,7 @@ bool PhaseIdealLoop::process_expensive_nodes() {
 void PhaseIdealLoop::build_and_optimize(LoopOptsMode mode) {
   bool do_split_ifs = (mode == LoopOptsDefault);
   bool skip_loop_opts = (mode == LoopOptsNone);
+  bool do_max_unroll = (mode == LoopOptsMaxUnroll);
 
   int old_progress = C->major_progress();
   uint orig_worklist_size = _igvn._worklist.size();
@@ -2859,7 +2860,7 @@ void PhaseIdealLoop::build_and_optimize(LoopOptsMode mode) {
 
   BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2();
   // Nothing to do, so get out
-  bool stop_early = !C->has_loops() && !skip_loop_opts && !do_split_ifs && !_verify_me && !_verify_only &&
+  bool stop_early = !C->has_loops() && !skip_loop_opts && !do_max_unroll && !do_split_ifs && !_verify_me && !_verify_only &&
     !bs->is_gc_specific_loop_opts_pass(mode);
   bool do_expensive_nodes = C->should_optimize_expensive_nodes(_igvn);
   bool strip_mined_loops_expanded = bs->strip_mined_loops_expanded(mode);
@@ -3009,6 +3010,44 @@ void PhaseIdealLoop::build_and_optimize(LoopOptsMode mode) {
     return;
   }
 
+  if (do_max_unroll) {
+    for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
+      IdealLoopTree* lpt = iter.current();
+      if (lpt->is_innermost() && lpt->_allow_optimizations && !lpt->_has_call && lpt->is_counted()) {
+        lpt->compute_trip_count(this);
+
+        if (lpt->do_one_iteration_loop(this)) {
+          continue;
+        }
+
+        if (lpt->do_remove_empty_loop(this)) {
+          continue;
+        }
+        AutoNodeBudget node_budget(this);
+        CountedLoopNode *cl = lpt->_head->as_CountedLoop();
+        // Do not do anything for invalid, pre or post loops
+        if (cl->is_valid_counted_loop() && !cl->is_pre_loop() && !cl->is_post_loop()) {
+          // Compute loop trip count from profile data
+          lpt->compute_profile_trip_cnt(this);
+          if (cl->is_normal_loop()) {
+            if (lpt->policy_maximally_unroll(this)) {
+              memset(worklist.adr(), 0, worklist.Size()*sizeof(Node*));
+              do_maximally_unroll(lpt, worklist);
+            }
+          }
+        }
+      }
+    }
+
+    C->restore_major_progress(old_progress);
+    _igvn.optimize();
+
+    if (C->log() != NULL) {
+      log_loop_tree(_ltree_root, _ltree_root, C->log());
+    }
+    return;
+  }
+
   if (bs->optimize_loops(this, mode, visited, nstack, worklist)) {
     _igvn.optimize();
     if (C->log() != NULL) {

Thanks,
Charlie Gracie

On 2019-09-20, 4:38 AM, "hotspot-compiler-dev on behalf of Roland Westrelin" <hotspot-compiler-dev-bounces at openjdk.java.net on behalf of rwestrel at redhat.com> wrote:

    
    > The problem with that one is that EA would need the loop to be fully
    > unrolled to eliminate the allocation but that only happens after EA. So
    > it's a pass ordering problem. We already run a pass of loop
    > optimizations before EA so it seems we could have it take care of fully
    > unrolling the loop.
    
    I created JDK-8231291 for that one.
    
    Roland.