JDK-8214239 (?): Missing x86_64.ad patterns for clearing and setting long vector bits

Viswanathan, Sandhya sandhya.viswanathan at intel.com
Thu Nov 7 00:34:32 UTC 2019


Hi Vladimir/Bernard,



I don’t see any restrictions or limitations on these instructions, other than the fact that the “long” (64-bit operand) form is, as usual, only supported in 64-bit mode, so it should be restricted to the 64-bit JVM only.

The code size improvement that Bernard demonstrates is significant for operations on longs.

It looks like the throughput for AND/OR is better than for BTR/BTS (0.25 vs. 0.5), though. Please refer to Table C-17 in the document below:

https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf

Best Regards,

Sandhya



-----Original Message-----
From: Vladimir Kozlov <vladimir.kozlov at oracle.com>
Sent: Wednesday, November 06, 2019 10:45 AM
To: B. Blaser <bsrbnd at gmail.com>; hotspot-compiler-dev at openjdk.java.net
Cc: Viswanathan, Sandhya <sandhya.viswanathan at intel.com>
Subject: Re: JDK-8214239 (?): Missing x86_64.ad patterns for clearing and setting long vector bits



Hi Bernard,



It is an interesting suggestion. I don't see that we currently use BTR and BTS. Sandhya, do these instructions have any limitations or restrictions?



Regarding the changes: for new code we prefer the newer style of encoding, which uses macro-assembler instructions in .ad files instead of raw opcodes [1]. This way we make sure the correct encoding is used on different CPUs.



Thanks,

Vladimir



[1] http://hg.openjdk.java.net/jdk/jdk/file/38d4202154f2/src/hotspot/cpu/x86/x86_64.ad#l10051



On 11/2/19 10:18 AM, B. Blaser wrote:

> Hi,

>

> I experimented, some time ago, with an optimization of several common

> flag patterns (see also JBS) using BTR/BTS instead of AND/OR

> instructions on x86_64 xeon:

>

> @BenchmarkMode(Mode.AverageTime)

> @OutputTimeUnit(TimeUnit.NANOSECONDS)

> @State(Scope.Thread)

> public class BitSetAndReset {

>      private static final int COUNT = 10_000;

>

>      private static final long MASK63 = 0x8000_0000_0000_0000L;

>      private static final long MASK31 = 0x0000_0000_8000_0000L;

>      private static final long MASK15 = 0x0000_0000_0000_8000L;

>      private static final long MASK00 = 0x0000_0000_0000_0001L;

>

>      private long andq, orq;

>      private boolean success = true;

>

>      @TearDown(Level.Iteration)

>      public void finish() {

>          if (!success)

>              throw new AssertionError("Failure while setting or

> clearing long vector bits!");

>      }

>

>      @Benchmark

>      public void bitSet(Blackhole bh) {

>          for (int i=0; i<COUNT; i++) {

>              andq = MASK63 | MASK31 | MASK15 | MASK00;

>              orq = 0;

>              bh.consume(test63());

>              bh.consume(test31());

>              bh.consume(test15());

>              bh.consume(test00());

>              success &= andq == 0 && orq == (MASK63 | MASK31 | MASK15 | MASK00);

>          }

>      }

>

>      private long test63() {

>          andq &= ~MASK63;

>          orq |= MASK63;

>          return 0L;

>      }

>      private long test31() {

>          andq &= ~MASK31;

>          orq |= MASK31;

>          return 0L;

>      }

>      private long test15() {

>          andq &= ~MASK15;

>          orq |= MASK15;

>          return 0L;

>      }

>      private long test00() {

>          andq &= ~MASK00;

>          orq |= MASK00;

>          return 0L;

>      }

> }

>

> Running the benchmark this way:

>

> $ make test TEST="micro:vm.compiler.BitSetAndReset"

> MICRO="VM_OPTIONS='-XX:CompileCommand=print,org/openjdk/bench/vm/compiler/BitSetAndReset.*test*';FORK=3;WARMUP_ITER=1;ITER=3"

>

> We had before:

>

> 03e       movq    R10, #9223372036854775807    # long

> 048       andq    [RSI + #16 (8-bit)], R10    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.andq

> 04c       movq    R10, #-9223372036854775808    # long

> 056       orq     [RSI + #24 (8-bit)], R10    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.orq

> 05a       ...

>

> => 28 bytes

>

> 03c       xorl    RAX, RAX    # long

> 03e       movq    R10, #-2147483649    # long

> 048       andq    [RSI + #16 (8-bit)], R10    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.andq

> 04c       movl    R10, #2147483648    # long (unsigned 32-bit)

> 052       orq     [RSI + #24 (8-bit)], R10    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.orq

> 056       ...

>

> => 26 bytes

>

> 03c       andq    [RSI + #16 (8-bit)], #-32769    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.andq

> 044       orq     [RSI + #24 (8-bit)], #32768    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.orq

> 04c       ...

>

> 03c       andq    [RSI + #16 (8-bit)], #-2    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.andq

> 041       orq     [RSI + #24 (8-bit)], #1    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.orq

> 046       ...

>

> Benchmark              Mode  Cnt      Score      Error  Units

> BitSetAndReset.bitSet  avgt    9  78083.773 ± 2182.692  ns/op

>

> And we would have after:

>

> 03c       btrq    [RSI + #16 (8-bit)], log2(not(#9223372036854775807))

>     # long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq

> 042       btsq    [RSI + #24 (8-bit)], log2(#-9223372036854775808)

> # long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.orq

> 048       ...

>

> => 12 bytes

>

> 03c       btrq    [RSI + #16 (8-bit)], log2(not(#-2147483649))    #

> long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq

> 042       xorl    RAX, RAX    # long

> 044       movl    R10, #2147483648    # long (unsigned 32-bit)

> 04a       orq     [RSI + #24 (8-bit)], R10    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.orq

> 04e       ...

>

> => 18 bytes

>

> 03c       andq    [RSI + #16 (8-bit)], #-32769    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.andq

> 044       orq     [RSI + #24 (8-bit)], #32768    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.orq

> 04c       ...

>

> 03c       andq    [RSI + #16 (8-bit)], #-2    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.andq

> 041       orq     [RSI + #24 (8-bit)], #1    # long ! Field:

> org/openjdk/bench/vm/compiler/BitSetAndReset.orq

> 046       ...

>

> Benchmark              Mode  Cnt      Score     Error  Units

> BitSetAndReset.bitSet  avgt    9  77355.154 ± 252.503  ns/op

>

> We see a tiny performance gain with BTR/BTS but the major interest

> remains the much better encoding with up to 16 bytes saving for pure

> 64-bit immediates along with a lower register consumption.

>

> Does the patch below look reasonable enough to eventually rebase and

> push it to jdk/submit and to post a RFR maybe soon if all goes well?

>

> Thanks,

> Bernard

>

> diff --git a/src/hotspot/cpu/x86/x86_64.ad

> b/src/hotspot/cpu/x86/x86_64.ad

> --- a/src/hotspot/cpu/x86/x86_64.ad

> +++ b/src/hotspot/cpu/x86/x86_64.ad

> @@ -2069,6 +2069,16 @@

>       }

>     %}

>

> +  enc_class Log2L(immPow2L imm)

> +  %{

> +    emit_d8(cbuf, log2_long($imm$$constant));  %}

> +

> +  enc_class Log2NotL(immPow2NotL imm)  %{

> +    emit_d8(cbuf, log2_long(~$imm$$constant));  %}

> +

>     enc_class opc2_reg(rRegI dst)

>     %{

>       // BSWAP

> @@ -3131,6 +3141,28 @@

>     interface(CONST_INTER);

>   %}

>

> +operand immPow2L()

> +%{

> +  // n should be a pure 64-bit power of 2 immediate.

> +  predicate(is_power_of_2_long(n->get_long()) &&

> log2_long(n->get_long()) > 31);

> +  match(ConL);

> +

> +  op_cost(15);

> +  format %{ %}

> +  interface(CONST_INTER);

> +%}

> +

> +operand immPow2NotL()

> +%{

> +  // n should be a pure 64-bit immediate given that not(n) is a power of 2.

> +  predicate(is_power_of_2_long(~n->get_long()) &&

> log2_long(~n->get_long()) > 30);

> +  match(ConL);

> +

> +  op_cost(15);

> +  format %{ %}

> +  interface(CONST_INTER);

> +%}

> +

>   // Long Immediate zero

>   operand immL0()

>   %{

> @@ -9740,6 +9772,19 @@

>     ins_pipe(ialu_mem_imm);

>   %}

>

> +instruct btrL_mem_imm(memory dst, immPow2NotL src, rFlagsReg cr) %{

> +  match(Set dst (StoreL dst (AndL (LoadL dst) src)));

> +  effect(KILL cr);

> +

> +  ins_cost(125);

> +  format %{ "btrq    $dst, log2(not($src))\t# long" %}

> +  opcode(0x0F, 0xBA, 0x06);

> +  ins_encode(REX_mem_wide(dst), OpcP, OpcS,

> +             RM_opc_mem(tertiary, dst), Log2NotL(src));

> +  ins_pipe(ialu_mem_imm);

> +%}

> +

>   // BMI1 instructions

>   instruct andnL_rReg_rReg_mem(rRegL dst, rRegL src1, memory src2,

> immL_M1 minus_1, rFlagsReg cr) %{

>     match(Set dst (AndL (XorL src1 minus_1) (LoadL src2))); @@ -9933,6

> +9978,19 @@

>     ins_pipe(ialu_mem_imm);

>   %}

>

> +instruct btsL_mem_imm(memory dst, immPow2L src, rFlagsReg cr) %{

> +  match(Set dst (StoreL dst (OrL (LoadL dst) src)));

> +  effect(KILL cr);

> +

> +  ins_cost(125);

> +  format %{ "btsq    $dst, log2($src)\t# long" %}

> +  opcode(0x0F, 0xBA, 0x05);

> +  ins_encode(REX_mem_wide(dst), OpcP, OpcS,

> +             RM_opc_mem(tertiary, dst), Log2L(src));

> +  ins_pipe(ialu_mem_imm);

> +%}

> +

>   // Xor Instructions

>   // Xor Register with Register

>   instruct xorL_rReg(rRegL dst, rRegL src, rFlagsReg cr)

>


More information about the hotspot-compiler-dev mailing list