RFR: 8267687: ModXNode::Ideal optimization is better than Parse::do_irem [v2]

Fri Jun 4 20:35:02 UTC 2021

On Fri, 4 Jun 2021 12:25:20 GMT, Yi Yang <yyang at openjdk.org> wrote:

>> Hi all,
>> 
>> Can I have a review of this change? I noticed there are two almost the same optimizations for % operation. For x%y, both Parse::do_irem and ModXNode::ideal are optimized for a special case that divisor y is `2^n` constant value. It turns out that ModXNode::Ideal optimization is better than Parse::do_irem in a simple microbenchmark(Please check out JBS attachment for more detailed benchmark result), 
>> 
>> ### Implementation
>> - ModXNode::Ideal opt:
>> https://github.com/openjdk/jdk/blob/cc687fd43ade6be8760c559f3ffa909c5937727c/src/hotspot/share/opto/divnode.cpp#L112-L160
>> 
>> - Parse::do_irem opt:
>> https://github.com/openjdk/jdk/blob/cc687fd43ade6be8760c559f3ffa909c5937727c/src/hotspot/share/opto/parse2.cpp#L1171-L1196
>> 
>> ### JMH Microbenchmark
>> - x64
>> 
>> ModXNode::Ideal opt:
>> Benchmark                         Mode  Cnt     Score     Error  Units
>> ModPowerOf2.testNegativePowerOf2  avgt   25  8746.608 ± 139.777  ns/op
>> ModPowerOf2.testPositivePowerOf2  avgt   25  8735.545 ±  86.145  ns/op
>> 
>> Parse::do_irem opt:
>> Benchmark                         Mode  Cnt     Score   Error  Units
>> ModPowerOf2.testNegativePowerOf2  avgt   25  8693.797 ± 7.844  ns/op
>> ModPowerOf2.testPositivePowerOf2  avgt   25  6618.652 ± 1.739  ns/op
>> 
>> 
>> - AArch64
>> 
>> ModXNode::Ideal opt:
>> Benchmark                         Mode  Cnt     Score   Error  Units
>> ModPowerOf2.testNegativePowerOf2  avgt   25  6758.299 ± 4.452  ns/op
>> ModPowerOf2.testPositivePowerOf2  avgt   25  6759.533 ± 2.791  ns/op
>> 
>> Parse::do_irem opt:
>> Benchmark                         Mode  Cnt     Score   Error  Units
>> ModPowerOf2.testNegativePowerOf2  avgt   25  6757.868 ± 4.044  ns/op
>> ModPowerOf2.testPositivePowerOf2  avgt   25  6546.034 ± 0.794  ns/op
>> 
>> 
>> In summary, ModXNode::Ideal is better than Parse::do_irem on x64 platform, benchmark results are also reproducible. I guess the reason is that Parse::do_irem generates cmp and jmp while ModXNode::Ideal does not. On AArch64 platform, the two optimizations are almost the same according to benchmark results. I am not familiar with AArch64 so I can not give a constructive answer.
>> 
>> 
>> ### Assembly
>> - source code
>> 
>> static int foo15(int k){
>>     return k%8;
>> }
>> 
>> 
>> - x64
>> 
>> ModXNode::Ideal opt:
>> 
>> [Verified Entry Point]
>>   # {method} {0x00007f0ded401ec8} 'foo15' '(I)I' in 'CheckIndex'
>>   # parm0:    rsi       = int
>>   #           [sp+0x20]  (sp of caller)
>>   0x00007f0e4952e780:   sub    $0x18,%rsp
>>   0x00007f0e4952e787:   mov    %rbp,0x10(%rsp)              ;*synchronization entry
>>                                                             ; - CheckIndex::foo15 at -1 (line 125)
>>   0x00007f0e4952e78c:   mov    %esi,%r10d
>>   0x00007f0e4952e78f:   sar    $0x1f,%r10d
>>   0x00007f0e4952e793:   shr    $0x1d,%r10d
>>   0x00007f0e4952e797:   add    %esi,%r10d
>>   0x00007f0e4952e79a:   and    $0xfffffffffffffff8,%r10d
>>   0x00007f0e4952e79e:   mov    %esi,%eax
>>   0x00007f0e4952e7a0:   sub    %r10d,%eax                   ;*irem {reexecute=0 rethrow=0 return_oop=0}
>>                                                             ; - CheckIndex::foo15 at 3 (line 125)
>>   0x00007f0e4952e7a3:   add    $0x10,%rsp
>>   0x00007f0e4952e7a7:   pop    %rbp
>>   0x00007f0e4952e7a8:   cmp    0x340(%r15),%rsp             ;   {poll_return}
>>   0x00007f0e4952e7af:   ja     0x00007f0e4952e7b6
>>   0x00007f0e4952e7b5:   retq   
>>   0x00007f0e4952e7b6:   mov    $0x7f0e4952e7a8,%r10         ;   {internal_word}
>>   0x00007f0e4952e7c0:   mov    %r10,0x358(%r15)
>>   0x00007f0e4952e7c7:   jmpq   0x00007f0e49506100           ;   {runtime_call SafepointBlob}
>>   0x00007f0e4952e7cc:   hlt  
>>   						...   ; omit remaining hlt(s)
>>   
>> 
>> 
>> Parse::do_irem opt:
>> 
>> [Verified Entry Point]
>>   # {method} {0x00007fbab9c01ec8} 'foo15' '(I)I' in 'CheckIndex'
>>   # parm0:    rsi       = int
>>   #           [sp+0x20]  (sp of caller)
>>   0x00007fbb1552db00:   sub    $0x18,%rsp
>>   0x00007fbb1552db07:   mov    %rbp,0x10(%rsp)              ;*synchronization entry
>>                                                             ; - CheckIndex::foo15 at -1 (line 125)
>>   0x00007fbb1552db0c:   test   %esi,%esi
>>   0x00007fbb1552db0e:   jge    0x00007fbb1552db2c
>>   0x00007fbb1552db10:   neg    %esi
>>   0x00007fbb1552db12:   and    $0x7,%esi
>>   0x00007fbb1552db15:   mov    %esi,%eax
>>   0x00007fbb1552db17:   neg    %eax
>>   0x00007fbb1552db19:   add    $0x10,%rsp
>>   0x00007fbb1552db1d:   pop    %rbp
>>   0x00007fbb1552db1e:   cmp    0x340(%r15),%rsp             ;   {poll_return}
>>   0x00007fbb1552db25:   ja     0x00007fbb1552db33
>>   0x00007fbb1552db2b:   retq   
>>   0x00007fbb1552db2c:   mov    %esi,%eax
>>   0x00007fbb1552db2e:   and    $0x7,%eax                    ;*irem {reexecute=0 rethrow=0 return_oop=0}
>>                                                             ; - CheckIndex::foo15 at 3 (line 125)
>>   0x00007fbb1552db31:   jmp    0x00007fbb1552db19
>>   0x00007fbb1552db33:   mov    $0x7fbb1552db1e,%r10         ;   {internal_word}
>>   0x00007fbb1552db3d:   mov    %r10,0x358(%r15)
>>   0x00007fbb1552db44:   jmpq   0x00007fbb15505100           ;   {runtime_call SafepointBlob}
>>   0x00007fbb1552db49:   hlt    
>> 						...   ; omit remaining hlt(s)   
>> 
>> 
>> - AArch64
>> 
>> ModXNode::Ideal opt:
>> 
>> [Verified Entry Point]
>>   # {method} {0x0000ffff3d000218} 'foo15' '(I)I' in 'CheckIndex'
>>   # parm0:    c_rarg1   = int
>>   #           [sp+0x20]  (sp of caller)
>>   0x0000ffff9d4f7580:   nop
>>   0x0000ffff9d4f7584:   sub	sp, sp, #0x20
>>   0x0000ffff9d4f7588:   stp	x29, x30, [sp, #16]         ;*synchronization entry
>>                                                             ; - CheckIndex::foo15 at -1 (line 3)
>>   0x0000ffff9d4f758c:   asr	w10, w1, #31
>>   0x0000ffff9d4f7590:   add	w11, w1, w10, lsr #29
>>   0x0000ffff9d4f7594:   and	w10, w11, #0xfffffff8
>>   0x0000ffff9d4f7598:   sub	w0, w1, w10                 ;*irem {reexecute=0 rethrow=0 return_oop=0}
>>                                                             ; - CheckIndex::foo15 at 3 (line 3)
>>   0x0000ffff9d4f759c:   ldp	x29, x30, [sp, #16]
>>   0x0000ffff9d4f75a0:   add	sp, sp, #0x20
>>   0x0000ffff9d4f75a4:   ldr	x8, [x28, #832]             ;   {poll_return}
>>   0x0000ffff9d4f75a8:   cmp	sp, x8
>>   0x0000ffff9d4f75ac:   b.hi	0x0000ffff9d4f75b4  // b.pmore
>>   0x0000ffff9d4f75b0:   ret
>>   0x0000ffff9d4f75b4:   adr	x8, 0x0000ffff9d4f75a4      ;   {internal_word}
>>   0x0000ffff9d4f75b8:   str	x8, [x28, #856]
>>   0x0000ffff9d4f75bc:   b	0x0000ffff9d4cd600          ;   {runtime_call SafepointBlob}
>> 
>> 
>> Parse::do_irem opt:
>> 
>> [Verified Entry Point]
>>   # {method} {0x0000ffff1c800218} 'foo15' '(I)I' in 'CheckIndex'
>>   # parm0:    c_rarg1   = int
>>   #           [sp+0x20]  (sp of caller)
>>   0x0000ffff7d4f7580:   nop
>>   0x0000ffff7d4f7584:   sub	sp, sp, #0x20
>>   0x0000ffff7d4f7588:   stp	x29, x30, [sp, #16]         ;*synchronization entry
>>                                                             ; - CheckIndex::foo15 at -1 (line 3)
>>   0x0000ffff7d4f758c:   tbz	w1, #31, 0x0000ffff7d4f75b4
>>   0x0000ffff7d4f7590:   neg	w10, w1
>>   0x0000ffff7d4f7594:   and	w11, w10, #0x7
>>   0x0000ffff7d4f7598:   neg	w0, w11
>>   0x0000ffff7d4f759c:   ldp	x29, x30, [sp, #16]
>>   0x0000ffff7d4f75a0:   add	sp, sp, #0x20
>>   0x0000ffff7d4f75a4:   ldr	x8, [x28, #832]             ;   {poll_return}
>>   0x0000ffff7d4f75a8:   cmp	sp, x8
>>   0x0000ffff7d4f75ac:   b.hi	0x0000ffff7d4f75bc  // b.pmore
>>   0x0000ffff7d4f75b0:   ret
>>   0x0000ffff7d4f75b4:   and	w0, w1, #0x7                ;*irem {reexecute=0 rethrow=0 return_oop=0}
>>                                                             ; - CheckIndex::foo15 at 3 (line 3)
>>   0x0000ffff7d4f75b8:   b	0x0000ffff7d4f759c
>>   0x0000ffff7d4f75bc:   adr	x8, 0x0000ffff7d4f75a4      ;   {internal_word}
>>   0x0000ffff7d4f75c0:   str	x8, [x28, #856]
>>   0x0000ffff7d4f75c4:   b	0x0000ffff7d4cd600          ;   {runtime_call SafepointBlob}
>>   0x0000ffff7d4f75c8:   .inst	0x00000000 ; undefined
>>                         ...   ; omit remaining .inst(s)
>> 
>> 
>> ### Ideal graph:
>> ----------------
>> ![ideal_graph](https://user-images.githubusercontent.com/5010047/119525589-34585f80-bdb1-11eb-9d7e-e3962cd7f789.jpg)
>> (Parse opt(left) vs Ideal opt(right))
>> 
>> Thanks!
>> Yang
>
> Yi Yang has updated the pull request incrementally with one additional commit since the last revision:
> 
>   mixed computations loop

Thank you for updated test and new numbers.

-------------

Marked as reviewed by kvn (Reviewer).

PR: https://git.openjdk.java.net/jdk/pull/4188