RFR: 8341697: C2: Register allocation inefficiency in tight loop

Quan Anh Mai qamai at openjdk.org
Fri Oct 11 16:01:09 UTC 2024


On Fri, 11 Oct 2024 15:50:20 GMT, Quan Anh Mai <qamai at openjdk.org> wrote:

> Hi,
> 
> This patch improves spill placement in the presence of loops. Currently, when trying to spill a live range, we create a `Phi` at the loop head. This `Phi` is then spilt inside the loop body, and because the `Phi` is `UP` (lives in a register) at the loop head, we need to emit an additional reload at the loop back-edge block. This introduces loop-carried dependencies, which greatly reduces loop throughput. My proposal is that if a node is not reassigned inside a loop and will be spilt there, we spill it eagerly at the loop entry instead. This can lead to more reloads inside the loop, but since the loop-carried dependencies are eliminated, the cost of an extra load is negligible.
> 
> Please take a look and leave your reviews, thanks a lot.

The benchmark result:

    Benchmark                          Mode  Cnt    Score     Error  Units
    LoopCounterBench.field_ret         avgt    3  417.865 ±   2.914  ns/op
    LoopCounterBench.localVar_ret      avgt    3  332.657 ± 109.310  ns/op

The inner loop is free of spills because the spill has been hoisted to the loop entry:

                  │  0x00007fdf9821b546:   mov    r9d,DWORD PTR [r11+0xc]      ;*getfield increment {reexecute=0 rethrow=0 return_oop=0}
                  │                                                            ; - org.openjdk.bench.vm.compiler.LoopCounterBench::localVar_ret at 1 (line 56)
                  │                                                            ; - org.openjdk.bench.vm.compiler.jmh_generated.LoopCounterBench_localVar_ret_jmhTest::localVar_ret_avgt_jmhStub at 17 (line 190)
       0.03%      │  0x00007fdf9821b54a:   mov    esi,DWORD PTR [r12+r8*8+0xc] ; implicit exception: dispatches to 0x00007fdf9821b6f4
                  │                                                            ;*lastore {reexecute=0 rethrow=0 return_oop=0}
                  │                                                            ; - org.openjdk.bench.vm.compiler.LoopCounterBench::localVar_ret at 27 (line 58)
                  │                                                            ; - org.openjdk.bench.vm.compiler.jmh_generated.LoopCounterBench_localVar_ret_jmhTest::localVar_ret_avgt_jmhStub at 17 (line 190)
                  │  0x00007fdf9821b54f:   lea    rax,[r12+r14*8]
                  │  0x00007fdf9821b553:   lea    r13,[r12+r8*8]
       0.03%      │  0x00007fdf9821b557:   xor    edi,edi
    THE SPILL     │  0x00007fdf9821b559:   vmovq  xmm0,rbp
                  │  0x00007fdf9821b55e:   xchg   ax,ax                        ;*aload_0 {reexecute=0 rethrow=0 return_oop=0}
                  │                                                            ; - org.openjdk.bench.vm.compiler.LoopCounterBench::localVar_ret at 16 (line 58)
                  │                                                            ; - org.openjdk.bench.vm.compiler.jmh_generated.LoopCounterBench_localVar_ret_jmhTest::localVar_ret_avgt_jmhStub at 17 (line 190)
                 ↗│  0x00007fdf9821b560:   cmp    edi,r10d
       1.66%    ╭││  0x00007fdf9821b563:   jae    0x00007fdf9821b587
                │││  0x00007fdf9821b565:   mov    rbp,QWORD PTR [rax+rdi*8+0x10];*laload {reexecute=0 rethrow=0 return_oop=0}
                │││                                                            ; - org.openjdk.bench.vm.compiler.LoopCounterBench::localVar_ret at 26 (line 58)
                │││                                                            ; - org.openjdk.bench.vm.compiler.jmh_generated.LoopCounterBench_localVar_ret_jmhTest::localVar_ret_avgt_jmhStub at 17 (line 190)
       5.43%    │││  0x00007fdf9821b56a:   cmp    edi,esi
       0.17%    │││  0x00007fdf9821b56c:   jae    0x00007fdf9821b5c8
                │││  0x00007fdf9821b56e:   mov    QWORD PTR [r13+rdi*8+0x10],rbp;*goto {reexecute=0 rethrow=0 return_oop=0}
                │││                                                            ; - org.openjdk.bench.vm.compiler.LoopCounterBench::localVar_ret at 32 (line 57)
                │││                                                            ; - org.openjdk.bench.vm.compiler.jmh_generated.LoopCounterBench_localVar_ret_jmhTest::localVar_ret_avgt_jmhStub at 17 (line 190)
       1.40%    │││  0x00007fdf9821b573:   add    edi,r9d                      ;*iadd {reexecute=0 rethrow=0 return_oop=0}
                │││                                                            ; - org.openjdk.bench.vm.compiler.LoopCounterBench::localVar_ret at 30 (line 57)
                │││                                                            ; - org.openjdk.bench.vm.compiler.jmh_generated.LoopCounterBench_localVar_ret_jmhTest::localVar_ret_avgt_jmhStub at 17 (line 190)
       3.40%    │││  0x00007fdf9821b576:   mov    rbp,QWORD PTR [r15+0x450]    ; ImmutableOopMap {r11=Oop r8=NarrowOop rcx=Oop rbx=Oop rdx=Oop rax=Oop r13=Oop r14=NarrowOop }
                │││                                                            ;*goto {reexecute=1 rethrow=0 return_oop=0}
                │││                                                            ; - (reexecute) org.openjdk.bench.vm.compiler.LoopCounterBench::localVar_ret at 32 (line 57)
                │││                                                            ; - org.openjdk.bench.vm.compiler.jmh_generated.LoopCounterBench_localVar_ret_jmhTest::localVar_ret_avgt_jmhStub at 17 (line 190)
       1.80%    │││  0x00007fdf9821b57d:   test   DWORD PTR [rbp+0x0],eax      ;*goto {reexecute=0 rethrow=0 return_oop=0}
                │││                                                            ; - org.openjdk.bench.vm.compiler.LoopCounterBench::localVar_ret at 32 (line 57)
                │││                                                            ; - org.openjdk.bench.vm.compiler.jmh_generated.LoopCounterBench_localVar_ret_jmhTest::localVar_ret_avgt_jmhStub at 17 (line 190)
                │││                                                            ;   {poll}
      84.42%    │││  0x00007fdf9821b580:   cmp    edi,r10d
       0.30%    │╰│  0x00007fdf9821b583:   jl     0x00007fdf9821b560           ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                │ │                                                            ; - org.openjdk.bench.vm.compiler.LoopCounterBench::localVar_ret at 13 (line 57)
                │ │                                                            ; - org.openjdk.bench.vm.compiler.jmh_generated.LoopCounterBench_localVar_ret_jmhTest::localVar_ret_avgt_jmhStub at 17 (line 190)

-------------

PR Comment: https://git.openjdk.org/jdk/pull/21472#issuecomment-2407697430


More information about the hotspot-compiler-dev mailing list