JDK-8214239 (?): Missing x86_64.ad patterns for clearing and setting long vector bits

B. Blaser bsrbnd at gmail.com
Sat Nov 2 17:18:29 UTC 2019


Hi,

Some time ago I experimented with optimizing several common flag
patterns (see also JBS) by using BTR/BTS instead of AND/OR
instructions on an x86_64 Xeon:

@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
public class BitSetAndReset {
    private static final int COUNT = 10_000;

    private static final long MASK63 = 0x8000_0000_0000_0000L;
    private static final long MASK31 = 0x0000_0000_8000_0000L;
    private static final long MASK15 = 0x0000_0000_0000_8000L;
    private static final long MASK00 = 0x0000_0000_0000_0001L;

    private long andq, orq;
    private boolean success = true;

    @TearDown(Level.Iteration)
    public void finish() {
        if (!success)
            throw new AssertionError("Failure while setting or
clearing long vector bits!");
    }

    @Benchmark
    public void bitSet(Blackhole bh) {
        for (int i=0; i<COUNT; i++) {
            andq = MASK63 | MASK31 | MASK15 | MASK00;
            orq = 0;
            bh.consume(test63());
            bh.consume(test31());
            bh.consume(test15());
            bh.consume(test00());
            success &= andq == 0 && orq == (MASK63 | MASK31 | MASK15 | MASK00);
        }
    }

    private long test63() {
        andq &= ~MASK63;
        orq |= MASK63;
        return 0L;
    }
    private long test31() {
        andq &= ~MASK31;
        orq |= MASK31;
        return 0L;
    }
    private long test15() {
        andq &= ~MASK15;
        orq |= MASK15;
        return 0L;
    }
    private long test00() {
        andq &= ~MASK00;
        orq |= MASK00;
        return 0L;
    }
}

Running the benchmark this way:

$ make test TEST="micro:vm.compiler.BitSetAndReset" \
    MICRO="VM_OPTIONS='-XX:CompileCommand=print,org/openjdk/bench/vm/compiler/BitSetAndReset.*test*';FORK=3;WARMUP_ITER=1;ITER=3"

We had before:

03e       movq    R10, #9223372036854775807    # long
048       andq    [RSI + #16 (8-bit)], R10    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
04c       movq    R10, #-9223372036854775808    # long
056       orq     [RSI + #24 (8-bit)], R10    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
05a       ...

=> 28 bytes

03c       xorl    RAX, RAX    # long
03e       movq    R10, #-2147483649    # long
048       andq    [RSI + #16 (8-bit)], R10    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
04c       movl    R10, #2147483648    # long (unsigned 32-bit)
052       orq     [RSI + #24 (8-bit)], R10    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
056       ...

=> 26 bytes

03c       andq    [RSI + #16 (8-bit)], #-32769    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
044       orq     [RSI + #24 (8-bit)], #32768    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
04c       ...

03c       andq    [RSI + #16 (8-bit)], #-2    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
041       orq     [RSI + #24 (8-bit)], #1    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
046       ...

Benchmark              Mode  Cnt      Score      Error  Units
BitSetAndReset.bitSet  avgt    9  78083.773 ± 2182.692  ns/op

And we would have after:

03c       btrq    [RSI + #16 (8-bit)], log2(not(#9223372036854775807))    # long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq
042       btsq    [RSI + #24 (8-bit)], log2(#-9223372036854775808)    # long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.orq
048       ...

=> 12 bytes

03c       btrq    [RSI + #16 (8-bit)], log2(not(#-2147483649))    # long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq
042       xorl    RAX, RAX    # long
044       movl    R10, #2147483648    # long (unsigned 32-bit)
04a       orq     [RSI + #24 (8-bit)], R10    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
04e       ...

=> 18 bytes

03c       andq    [RSI + #16 (8-bit)], #-32769    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
044       orq     [RSI + #24 (8-bit)], #32768    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
04c       ...

03c       andq    [RSI + #16 (8-bit)], #-2    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
041       orq     [RSI + #24 (8-bit)], #1    # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
046       ...

Benchmark              Mode  Cnt      Score     Error  Units
BitSetAndReset.bitSet  avgt    9  77355.154 ± 252.503  ns/op

We see a tiny performance gain with BTR/BTS, but the main benefit
remains the much more compact encoding, saving up to 16 bytes for pure
64-bit immediates, along with lower register consumption.

Does the patch below look reasonable enough to rebase and push to
jdk/submit, and then to post an RFR soon if all goes well?

Thanks,
Bernard

diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -2069,6 +2069,16 @@
     }
   %}

+  enc_class Log2L(immPow2L imm)
+  %{
+    emit_d8(cbuf, log2_long($imm$$constant));
+  %}
+
+  enc_class Log2NotL(immPow2NotL imm)
+  %{
+    emit_d8(cbuf, log2_long(~$imm$$constant));
+  %}
+
   enc_class opc2_reg(rRegI dst)
   %{
     // BSWAP
@@ -3131,6 +3141,28 @@
   interface(CONST_INTER);
 %}

+operand immPow2L()
+%{
+  // n should be a pure 64-bit power of 2 immediate.
+  predicate(is_power_of_2_long(n->get_long()) && log2_long(n->get_long()) > 31);
+  match(ConL);
+
+  op_cost(15);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immPow2NotL()
+%{
+  // n should be a pure 64-bit immediate given that not(n) is a power of 2.
+  predicate(is_power_of_2_long(~n->get_long()) && log2_long(~n->get_long()) > 30);
+  match(ConL);
+
+  op_cost(15);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // Long Immediate zero
 operand immL0()
 %{
@@ -9740,6 +9772,19 @@
   ins_pipe(ialu_mem_imm);
 %}

+instruct btrL_mem_imm(memory dst, immPow2NotL src, rFlagsReg cr)
+%{
+  match(Set dst (StoreL dst (AndL (LoadL dst) src)));
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "btrq    $dst, log2(not($src))\t# long" %}
+  opcode(0x0F, 0xBA, 0x06);
+  ins_encode(REX_mem_wide(dst), OpcP, OpcS,
+             RM_opc_mem(tertiary, dst), Log2NotL(src));
+  ins_pipe(ialu_mem_imm);
+%}
+
 // BMI1 instructions
 instruct andnL_rReg_rReg_mem(rRegL dst, rRegL src1, memory src2, immL_M1 minus_1, rFlagsReg cr) %{
   match(Set dst (AndL (XorL src1 minus_1) (LoadL src2)));
@@ -9933,6 +9978,19 @@
   ins_pipe(ialu_mem_imm);
 %}

+instruct btsL_mem_imm(memory dst, immPow2L src, rFlagsReg cr)
+%{
+  match(Set dst (StoreL dst (OrL (LoadL dst) src)));
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "btsq    $dst, log2($src)\t# long" %}
+  opcode(0x0F, 0xBA, 0x05);
+  ins_encode(REX_mem_wide(dst), OpcP, OpcS,
+             RM_opc_mem(tertiary, dst), Log2L(src));
+  ins_pipe(ialu_mem_imm);
+%}
+
 // Xor Instructions
 // Xor Register with Register
 instruct xorL_rReg(rRegL dst, rRegL src, rFlagsReg cr)


More information about the hotspot-compiler-dev mailing list