JDK-8214239 (?): Missing x86_64.ad patterns for clearing and setting long vector bits
Vladimir Kozlov
vladimir.kozlov at oracle.com
Wed Nov 6 18:44:47 UTC 2019
Hi Bernard,
It is an interesting suggestion. I don't see that we use BTR and BTS currently. Sandhya, do these instructions have any
limitations/restrictions?
Regarding the changes: for new code we prefer to use the macro-assembler instruction encodings in .ad files instead of
raw opcodes [1]. This way we make sure the correct encoding is used on different CPUs.
Thanks,
Vladimir
[1] http://hg.openjdk.java.net/jdk/jdk/file/38d4202154f2/src/hotspot/cpu/x86/x86_64.ad#l10051
On 11/2/19 10:18 AM, B. Blaser wrote:
> Hi,
>
> Some time ago I experimented with an optimization of several common
> flag patterns (see also JBS), using BTR/BTS instead of AND/OR
> instructions on an x86_64 Xeon:
>
> @BenchmarkMode(Mode.AverageTime)
> @OutputTimeUnit(TimeUnit.NANOSECONDS)
> @State(Scope.Thread)
> public class BitSetAndReset {
> private static final int COUNT = 10_000;
>
> private static final long MASK63 = 0x8000_0000_0000_0000L;
> private static final long MASK31 = 0x0000_0000_8000_0000L;
> private static final long MASK15 = 0x0000_0000_0000_8000L;
> private static final long MASK00 = 0x0000_0000_0000_0001L;
>
> private long andq, orq;
> private boolean success = true;
>
> @TearDown(Level.Iteration)
> public void finish() {
> if (!success)
> throw new AssertionError("Failure while setting or
> clearing long vector bits!");
> }
>
> @Benchmark
> public void bitSet(Blackhole bh) {
> for (int i=0; i<COUNT; i++) {
> andq = MASK63 | MASK31 | MASK15 | MASK00;
> orq = 0;
> bh.consume(test63());
> bh.consume(test31());
> bh.consume(test15());
> bh.consume(test00());
> success &= andq == 0 && orq == (MASK63 | MASK31 | MASK15 | MASK00);
> }
> }
>
> private long test63() {
> andq &= ~MASK63;
> orq |= MASK63;
> return 0L;
> }
> private long test31() {
> andq &= ~MASK31;
> orq |= MASK31;
> return 0L;
> }
> private long test15() {
> andq &= ~MASK15;
> orq |= MASK15;
> return 0L;
> }
> private long test00() {
> andq &= ~MASK00;
> orq |= MASK00;
> return 0L;
> }
> }
>
> Running the benchmark this way:
>
> $ make test TEST="micro:vm.compiler.BitSetAndReset"
> MICRO="VM_OPTIONS='-XX:CompileCommand=print,org/openjdk/bench/vm/compiler/BitSetAndReset.*test*';FORK=3;WARMUP_ITER=1;ITER=3"
>
> We had before:
>
> 03e movq R10, #9223372036854775807 # long
> 048 andq [RSI + #16 (8-bit)], R10 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 04c movq R10, #-9223372036854775808 # long
> 056 orq [RSI + #24 (8-bit)], R10 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 05a ...
>
> => 28 bytes
>
> 03c xorl RAX, RAX # long
> 03e movq R10, #-2147483649 # long
> 048 andq [RSI + #16 (8-bit)], R10 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 04c movl R10, #2147483648 # long (unsigned 32-bit)
> 052 orq [RSI + #24 (8-bit)], R10 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 056 ...
>
> => 26 bytes
>
> 03c andq [RSI + #16 (8-bit)], #-32769 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 044 orq [RSI + #24 (8-bit)], #32768 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 04c ...
>
> 03c andq [RSI + #16 (8-bit)], #-2 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 041 orq [RSI + #24 (8-bit)], #1 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 046 ...
>
> Benchmark Mode Cnt Score Error Units
> BitSetAndReset.bitSet avgt 9 78083.773 ± 2182.692 ns/op
>
> And we would have after:
>
> 03c btrq [RSI + #16 (8-bit)], log2(not(#9223372036854775807))
> # long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 042 btsq [RSI + #24 (8-bit)], log2(#-9223372036854775808)
> # long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 048 ...
>
> => 12 bytes
>
> 03c btrq [RSI + #16 (8-bit)], log2(not(#-2147483649)) #
> long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 042 xorl RAX, RAX # long
> 044 movl R10, #2147483648 # long (unsigned 32-bit)
> 04a orq [RSI + #24 (8-bit)], R10 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 04e ...
>
> => 18 bytes
>
> 03c andq [RSI + #16 (8-bit)], #-32769 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 044 orq [RSI + #24 (8-bit)], #32768 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 04c ...
>
> 03c andq [RSI + #16 (8-bit)], #-2 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.andq
> 041 orq [RSI + #24 (8-bit)], #1 # long ! Field:
> org/openjdk/bench/vm/compiler/BitSetAndReset.orq
> 046 ...
>
> Benchmark Mode Cnt Score Error Units
> BitSetAndReset.bitSet avgt 9 77355.154 ± 252.503 ns/op
>
> We see a tiny performance gain with BTR/BTS, but the main benefit
> remains the much more compact encoding, which saves up to 16 bytes for pure
> 64-bit immediates, along with lower register consumption.
>
> Does the patch below look reasonable enough to rebase and push to
> jdk/submit, and then, if all goes well, to post an RFR soon?
>
> Thanks,
> Bernard
>
> diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
> --- a/src/hotspot/cpu/x86/x86_64.ad
> +++ b/src/hotspot/cpu/x86/x86_64.ad
> @@ -2069,6 +2069,16 @@
> }
> %}
>
> + enc_class Log2L(immPow2L imm)
> + %{
> + emit_d8(cbuf, log2_long($imm$$constant));
> + %}
> +
> + enc_class Log2NotL(immPow2NotL imm)
> + %{
> + emit_d8(cbuf, log2_long(~$imm$$constant));
> + %}
> +
> enc_class opc2_reg(rRegI dst)
> %{
> // BSWAP
> @@ -3131,6 +3141,28 @@
> interface(CONST_INTER);
> %}
>
> +operand immPow2L()
> +%{
> + // n should be a pure 64-bit power of 2 immediate.
> + predicate(is_power_of_2_long(n->get_long()) &&
> log2_long(n->get_long()) > 31);
> + match(ConL);
> +
> + op_cost(15);
> + format %{ %}
> + interface(CONST_INTER);
> +%}
> +
> +operand immPow2NotL()
> +%{
> + // n should be a pure 64-bit immediate given that not(n) is a power of 2.
> + predicate(is_power_of_2_long(~n->get_long()) &&
> log2_long(~n->get_long()) > 30);
> + match(ConL);
> +
> + op_cost(15);
> + format %{ %}
> + interface(CONST_INTER);
> +%}
> +
> // Long Immediate zero
> operand immL0()
> %{
> @@ -9740,6 +9772,19 @@
> ins_pipe(ialu_mem_imm);
> %}
>
> +instruct btrL_mem_imm(memory dst, immPow2NotL src, rFlagsReg cr)
> +%{
> + match(Set dst (StoreL dst (AndL (LoadL dst) src)));
> + effect(KILL cr);
> +
> + ins_cost(125);
> + format %{ "btrq $dst, log2(not($src))\t# long" %}
> + opcode(0x0F, 0xBA, 0x06);
> + ins_encode(REX_mem_wide(dst), OpcP, OpcS,
> + RM_opc_mem(tertiary, dst), Log2NotL(src));
> + ins_pipe(ialu_mem_imm);
> +%}
> +
> // BMI1 instructions
> instruct andnL_rReg_rReg_mem(rRegL dst, rRegL src1, memory src2,
> immL_M1 minus_1, rFlagsReg cr) %{
> match(Set dst (AndL (XorL src1 minus_1) (LoadL src2)));
> @@ -9933,6 +9978,19 @@
> ins_pipe(ialu_mem_imm);
> %}
>
> +instruct btsL_mem_imm(memory dst, immPow2L src, rFlagsReg cr)
> +%{
> + match(Set dst (StoreL dst (OrL (LoadL dst) src)));
> + effect(KILL cr);
> +
> + ins_cost(125);
> + format %{ "btsq $dst, log2($src)\t# long" %}
> + opcode(0x0F, 0xBA, 0x05);
> + ins_encode(REX_mem_wide(dst), OpcP, OpcS,
> + RM_opc_mem(tertiary, dst), Log2L(src));
> + ins_pipe(ialu_mem_imm);
> +%}
> +
> // Xor Instructions
> // Xor Register with Register
> instruct xorL_rReg(rRegL dst, rRegL src, rFlagsReg cr)
>
More information about the hotspot-compiler-dev
mailing list