SFX and address cloning problem

Aleksey Shipilev shade at redhat.com
Fri Jul 5 11:33:01 UTC 2019


Hi,

Chasing the performance regressions with SFX, I was looking at the generated code for the simplest
test case from gc-bench:

http://icedtea.classpath.org/hg/gc-bench/file/369cf7ad42af/src/main/java/org/openjdk/gcbench/runtime/reads/Plain.java#l103

I ran it with the current sh/jdk like this:

$ java -jar target/benchmarks.jar reads.Plain.test_Object -f 1 --jvmArgs
"-XX:+UnlockExperimentalVMOptions -XX:+UnlockDiagnosticVMOptions -XX:+UseShenandoahGC
-XX:+ShenandoahSelfFixing -Xmx1g -XX:-UseCompressedOops" -prof perfasm:printMargin=20

Generated code is:

               [Verified Entry Point]
......................... prolog ..............................
  3.11%      0x00007fa8dbb28630:   mov    %eax,-0x14000(%rsp)
  4.40%      0x00007fa8dbb28637:   push   %rbp
  0.92%      0x00007fa8dbb28638:   sub    $0x10,%rsp
.................... first read ..............................
  4.08%      0x00007fa8dbb2863c:   add    $0x10,%rsi         ; %rsi = addr
  1.92%      0x00007fa8dbb28640:   mov    (%rsi),%r11        ; %r11 = oop
  1.03%      0x00007fa8dbb28643:   movabs $0x7fa8740ae000,%r12
  1.97%      0x00007fa8dbb2864d:   testb  $0x1,0x20(%r15)
  2.56%  ╭   0x00007fa8dbb28652:   jne    0x00007fa8dbb2867c
.................. second read ..............................
  1.74%  │↗  0x00007fa8dbb28654:   test   %r11,%r11           ; null-check
         ││  0x00007fa8dbb28657:   je     0x00007fa8dbb286cf
  1.26%  ││  0x00007fa8dbb28659:   add    $0x30,%r11          ; %r11 = addr
  2.29%  ││  0x00007fa8dbb2865d:   mov    (%r11),%rsi         ; %rsi = oop
  3.74%  ││  0x00007fa8dbb28660:   testb  $0x1,0x20(%r15)
  1.10%  ││  0x00007fa8dbb28665:   jne    0x00007fa8dbb286a4
....................... sink ..............................
  1.05%  ││  0x00007fa8dbb28667:   callq  0x00007fa8dbb28300
..................... epilog ..............................
  2.59%  ││  0x00007fa8dbb2866c:   add    $0x10,%rsp
  2.15%  ││  0x00007fa8dbb28670:   pop    %rbp
  1.86%  ││  0x00007fa8dbb28671:   mov    0x108(%r15),%r10
  2.95%  ││  0x00007fa8dbb28678:   test   %eax,(%r10)
  1.12%  ││  0x00007fa8dbb2867b:   retq
...................... LRB ..................................
         ↘│  0x00007fa8dbb2867c:   mov    %r11,%r10
          │  0x00007fa8dbb2867f:   shr    $0x13,%r10
          │  0x00007fa8dbb28683:   cmpb   $0x0,(%r12,%r10,1)
          ╰  0x00007fa8dbb28688:   je     0x00007fa8dbb28654
             0x00007fa8dbb2868a:   test   %r11,%r11
             0x00007fa8dbb2868d:   je     0x00007fa8dbb286cf
             0x00007fa8dbb2868f:   mov    %r11,%rdi
             0x00007fa8dbb28692:   movabs $0x7fa8f1d92d70,%r10
             0x00007fa8dbb2869c:   callq  *%r10

The absence of implicit null checks in both reads, and the separate address calculation, are
intriguing. Without SFX, the address calculation would be folded into a complex memory operand, and
the implicit null check would be there. I suspected that attaching the LRB input to the AddP in the
address calculation broke the matching rules.

So, I tried to clone it:

diff -r 6fa068615a94 src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp  Wed Jul 03 17:19:16 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp  Fri Jul 05 11:51:13 2019 +0200
@@ -1530,6 +1530,12 @@
     } else {
       addr = phase->igvn().zerocon(T_OBJECT);
     }
+    if (addr->Opcode() == Op_AddP) {
+      addr = addr->clone();
+      addr->set_req(0, ctrl); // Critical? Bad generated code without it...
+      phase->register_new_node(addr, ctrl);
+    }
+
     call_lrb_stub(ctrl, fwd, addr, result_mem, raw_mem, phase);
     region->init_req(_evac_path, ctrl);
     val_phi->init_req(_evac_path, fwd);


And both the bad generated code in the simplest example *and* the performance drop with SFX on
SPECjvm are gone! Yay! This is the generated code with the AddP clone:

            [Verified Entry Point]
......................... prolog ..............................
  0.71%      0x00007f3f57b263b0:   mov    %eax,-0x14000(%rsp)
  4.94%      0x00007f3f57b263b7:   push   %rbp
  1.02%      0x00007f3f57b263b8:   sub    $0x10,%rsp
.................... first read ..............................
  1.75%      0x00007f3f57b263bc:   mov    0x10(%rsi),%r11      ; !!!!
  2.18%      0x00007f3f57b263c0:   movabs $0x7f3ef0181000,%r12
  0.77%      0x00007f3f57b263ca:   testb  $0x1,0x20(%r15)
  0.27%  ╭   0x00007f3f57b263cf:   jne    0x00007f3f57b263f4
.................. second read ..............................
  1.86%  │↗  0x00007f3f57b263d1:   mov    0x30(%r11),%r8       ; !!!!
  2.91%  ││  0x00007f3f57b263d5:   testb  $0x1,0x20(%r15)
  0.71%  ││  0x00007f3f57b263da:   jne    0x00007f3f57b26420
......................... sink ..............................
  0.53%  ││  0x00007f3f57b263dc:   mov    %r8,%rsi
  0.95%  ││  0x00007f3f57b263df:   callq  0x00007f3f4ff9e880
....................... epilog ..............................
  0.64%  ││  0x00007f3f57b263e4:   add    $0x10,%rsp
  1.65%  ││  0x00007f3f57b263e8:   pop    %rbp
  3.33%  ││  0x00007f3f57b263e9:   mov    0x108(%r15),%r10
  0.40%  ││  0x00007f3f57b263f0:   test   %eax,(%r10)
  7.24%  ││  0x00007f3f57b263f3:   retq
......................... LRB ..............................
         ↘│  0x00007f3f57b263f4:   mov    %r11,%r10
          │  0x00007f3f57b263f7:   shr    $0x13,%r10
          │  0x00007f3f57b263fb:   cmpb   $0x0,(%r12,%r10,1)
          ╰  0x00007f3f57b26400:   je     0x00007f3f57b263d1
             0x00007f3f57b26402:   test   %r11,%r11
             0x00007f3f57b26405:   je     0x00007f3f57b2644f
             0x00007f3f57b26407:   add    $0x10,%rsi           ; compute addr here
             0x00007f3f57b2640b:   mov    %r11,%rdi
             0x00007f3f57b2640e:   movabs $0x7f3f6ddced70,%r10
             0x00007f3f57b26418:   callq  *%r10


So this *almost* works, except it crashes some tests:

$ CONF=linux-x86_64-server-fastdebug make images test
TEST=gc/shenandoah/options/TestSelectiveBarrierFlags.java
...
#  Internal Error (/home/shade/trunks/shenandoah-jdk/src/hotspot/share/opto/matcher.cpp:1870),
pid=28418, tid=28430
#  assert(kid == __null || s->_leaf->in(0) == __null) failed: internal operands have no control

So, what would be the proper way to clone the AddP here?

-- 
Thanks,
-Aleksey



More information about the shenandoah-dev mailing list