JDK-8214239 (?): Missing x86_64.ad patterns for clearing and setting long vector bits

Vladimir Kozlov vladimir.kozlov at oracle.com
Wed Nov 6 18:44:47 UTC 2019


Hi Bernard,

It is an interesting suggestion. I don't see that we use BTR and BTS currently. Sandhya, do these instructions have any 
limitations/restrictions?

Regarding the changes: for new code we prefer that .ad files use the newer encoding style based on macro-assembler 
instructions instead of raw opcodes [1]. This way we make sure the correct encoding is used on different CPUs.

Thanks,
Vladimir

[1] http://hg.openjdk.java.net/jdk/jdk/file/38d4202154f2/src/hotspot/cpu/x86/x86_64.ad#l10051

On 11/2/19 10:18 AM, B. Blaser wrote:
> Hi,
> 
> I experimented, some time ago, with an optimization of several common
> flag patterns (see also JBS) using BTR/BTS instead of AND/OR
> instructions on x86_64 xeon:
> 
> @BenchmarkMode(Mode.AverageTime)
> @OutputTimeUnit(TimeUnit.NANOSECONDS)
> @State(Scope.Thread)
> public class BitSetAndReset {
>      private static final int COUNT = 10_000;
> 
>      private static final long MASK63 = 0x8000_0000_0000_0000L;
>      private static final long MASK31 = 0x0000_0000_8000_0000L;
>      private static final long MASK15 = 0x0000_0000_0000_8000L;
>      private static final long MASK00 = 0x0000_0000_0000_0001L;
> 
>      private long andq, orq;
>      private boolean success = true;
> 
>      @TearDown(Level.Iteration)
>      public void finish() {
>          if (!success)
>              throw new AssertionError("Failure while setting or
> clearing long vector bits!");
>      }
> 
>      @Benchmark
>      public void bitSet(Blackhole bh) {
>          for (int i=0; i<COUNT; i++) {
>              andq = MASK63 | MASK31 | MASK15 | MASK00;
>              orq = 0;
>              bh.consume(test63());
>              bh.consume(test31());
>              bh.consume(test15());
>              bh.consume(test00());
>              success &= andq == 0 && orq == (MASK63 | MASK31 | MASK15 | MASK00);
>          }
>      }
> 
>      private long test63() {
>          andq &= ~MASK63;
>          orq |= MASK63;
>          return 0L;
>      }
>      private long test31() {
>          andq &= ~MASK31;
>          orq |= MASK31;
>          return 0L;
>      }
>      private long test15() {
>          andq &= ~MASK15;
>          orq |= MASK15;
>          return 0L;
>      }
>      private long test00() {
>          andq &= ~MASK00;
>          orq |= MASK00;
>          return 0L;
>      }
> }
> 
> Running the benchmark this way:
> 
> $ make test TEST="micro:vm.compiler.BitSetAndReset"
> MICRO="VM_OPTIONS='-XX:CompileCommand=print,org/openjdk/bench/vm/compiler/BitSetAndReset.*test*';FORK=3;WARMUP_ITER=1;ITER=3"
> 
> We had before:
> 
> 03e       movq    R10, #9223372036854775807    # long
> 048       andq    [RSI + #16 (8-bit)], R10    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 04c       movq    R10, #-9223372036854775808    # long
> 056       orq     [RSI + #24 (8-bit)], R10    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 05a       ...
> 
> => 28 bytes
> 
> 03c       xorl    RAX, RAX    # long
> 03e       movq    R10, #-2147483649    # long
> 048       andq    [RSI + #16 (8-bit)], R10    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 04c       movl    R10, #2147483648    # long (unsigned 32-bit)
> 052       orq     [RSI + #24 (8-bit)], R10    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 056       ...
> 
> => 26 bytes
> 
> 03c       andq    [RSI + #16 (8-bit)], #-32769    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 044       orq     [RSI + #24 (8-bit)], #32768    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 04c       ...
> 
> 03c       andq    [RSI + #16 (8-bit)], #-2    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 041       orq     [RSI + #24 (8-bit)], #1    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 046       ...
> 
> Benchmark              Mode  Cnt      Score      Error  Units
> BitSetAndReset.bitSet  avgt    9  78083.773 ± 2182.692  ns/op
> 
> And we would have after:
> 
> 03c       btrq    [RSI + #16 (8-bit)], log2(not(#9223372036854775807))
>     # long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 042       btsq    [RSI + #24 (8-bit)], log2(#-9223372036854775808)
> # long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 048       ...
> 
> => 12 bytes
> 
> 03c       btrq    [RSI + #16 (8-bit)], log2(not(#-2147483649))    #
> long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 042       xorl    RAX, RAX    # long
> 044       movl    R10, #2147483648    # long (unsigned 32-bit)
> 04a       orq     [RSI + #24 (8-bit)], R10    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 04e       ...
> 
> => 18 bytes
> 
> 03c       andq    [RSI + #16 (8-bit)], #-32769    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 044       orq     [RSI + #24 (8-bit)], #32768    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 04c       ...
> 
> 03c       andq    [RSI + #16 (8-bit)], #-2    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 041       orq     [RSI + #24 (8-bit)], #1    # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 046       ...
> 
> Benchmark              Mode  Cnt      Score     Error  Units
> BitSetAndReset.bitSet  avgt    9  77355.154 ± 252.503  ns/op
> 
> We see a tiny performance gain with BTR/BTS, but the main benefit
> is the much more compact encoding, saving up to 16 bytes for pure
> 64-bit immediates, along with lower register pressure.
> 
> Does the patch below look reasonable enough to rebase and push to
> jdk/submit, and then, if all goes well, to post an RFR soon?
> 
> Thanks,
> Bernard
> 
> diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
> --- a/src/hotspot/cpu/x86/x86_64.ad
> +++ b/src/hotspot/cpu/x86/x86_64.ad
> @@ -2069,6 +2069,16 @@
>       }
>     %}
> 
> +  enc_class Log2L(immPow2L imm)
> +  %{
> +    emit_d8(cbuf, log2_long($imm$$constant));
> +  %}
> +
> +  enc_class Log2NotL(immPow2NotL imm)
> +  %{
> +    emit_d8(cbuf, log2_long(~$imm$$constant));
> +  %}
> +
>     enc_class opc2_reg(rRegI dst)
>     %{
>       // BSWAP
> @@ -3131,6 +3141,28 @@
>     interface(CONST_INTER);
>   %}
> 
> +operand immPow2L()
> +%{
> +  // n should be a pure 64-bit power of 2 immediate.
> +  predicate(is_power_of_2_long(n->get_long()) &&
> log2_long(n->get_long()) > 31);
> +  match(ConL);
> +
> +  op_cost(15);
> +  format %{ %}
> +  interface(CONST_INTER);
> +%}
> +
> +operand immPow2NotL()
> +%{
> +  // n should be a pure 64-bit immediate given that not(n) is a power of 2.
> +  predicate(is_power_of_2_long(~n->get_long()) &&
> log2_long(~n->get_long()) > 30);
> +  match(ConL);
> +
> +  op_cost(15);
> +  format %{ %}
> +  interface(CONST_INTER);
> +%}
> +
>   // Long Immediate zero
>   operand immL0()
>   %{
> @@ -9740,6 +9772,19 @@
>     ins_pipe(ialu_mem_imm);
>   %}
> 
> +instruct btrL_mem_imm(memory dst, immPow2NotL src, rFlagsReg cr)
> +%{
> +  match(Set dst (StoreL dst (AndL (LoadL dst) src)));
> +  effect(KILL cr);
> +
> +  ins_cost(125);
> +  format %{ "btrq    $dst, log2(not($src))\t# long" %}
> +  opcode(0x0F, 0xBA, 0x06);
> +  ins_encode(REX_mem_wide(dst), OpcP, OpcS,
> +             RM_opc_mem(tertiary, dst), Log2NotL(src));
> +  ins_pipe(ialu_mem_imm);
> +%}
> +
>   // BMI1 instructions
>   instruct andnL_rReg_rReg_mem(rRegL dst, rRegL src1, memory src2,
> immL_M1 minus_1, rFlagsReg cr) %{
>     match(Set dst (AndL (XorL src1 minus_1) (LoadL src2)));
> @@ -9933,6 +9978,19 @@
>     ins_pipe(ialu_mem_imm);
>   %}
> 
> +instruct btsL_mem_imm(memory dst, immPow2L src, rFlagsReg cr)
> +%{
> +  match(Set dst (StoreL dst (OrL (LoadL dst) src)));
> +  effect(KILL cr);
> +
> +  ins_cost(125);
> +  format %{ "btsq    $dst, log2($src)\t# long" %}
> +  opcode(0x0F, 0xBA, 0x05);
> +  ins_encode(REX_mem_wide(dst), OpcP, OpcS,
> +             RM_opc_mem(tertiary, dst), Log2L(src));
> +  ins_pipe(ialu_mem_imm);
> +%}
> +
>   // Xor Instructions
>   // Xor Register with Register
>   instruct xorL_rReg(rRegL dst, rRegL src, rFlagsReg cr)
> 


More information about the hotspot-compiler-dev mailing list