JDK-8214239 (?): Missing x86_64.ad patterns for clearing and setting long vector bits
B. Blaser
bsrbnd at gmail.com
Sat Nov 2 17:18:29 UTC 2019
Hi,
I experimented, some time ago, with an optimization of several common
flag patterns (see also JBS) using BTR/BTS instead of AND/OR
instructions on an x86_64 Xeon:
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
public class BitSetAndReset {
private static final int COUNT = 10_000;
private static final long MASK63 = 0x8000_0000_0000_0000L;
private static final long MASK31 = 0x0000_0000_8000_0000L;
private static final long MASK15 = 0x0000_0000_0000_8000L;
private static final long MASK00 = 0x0000_0000_0000_0001L;
private long andq, orq;
private boolean success = true;
@TearDown(Level.Iteration)
public void finish() {
if (!success)
throw new AssertionError("Failure while setting or
clearing long vector bits!");
}
@Benchmark
public void bitSet(Blackhole bh) {
for (int i=0; i<COUNT; i++) {
andq = MASK63 | MASK31 | MASK15 | MASK00;
orq = 0;
bh.consume(test63());
bh.consume(test31());
bh.consume(test15());
bh.consume(test00());
success &= andq == 0 && orq == (MASK63 | MASK31 | MASK15 | MASK00);
}
}
private long test63() {
andq &= ~MASK63;
orq |= MASK63;
return 0L;
}
private long test31() {
andq &= ~MASK31;
orq |= MASK31;
return 0L;
}
private long test15() {
andq &= ~MASK15;
orq |= MASK15;
return 0L;
}
private long test00() {
andq &= ~MASK00;
orq |= MASK00;
return 0L;
}
}
Running the benchmark this way:
$ make test TEST="micro:vm.compiler.BitSetAndReset" MICRO="VM_OPTIONS='-XX:CompileCommand=print,org/openjdk/bench/vm/compiler/BitSetAndReset.*test*';FORK=3;WARMUP_ITER=1;ITER=3"
We had before:
03e movq R10, #9223372036854775807 # long
048 andq [RSI + #16 (8-bit)], R10 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
04c movq R10, #-9223372036854775808 # long
056 orq [RSI + #24 (8-bit)], R10 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
05a ...
=> 28 bytes
03c xorl RAX, RAX # long
03e movq R10, #-2147483649 # long
048 andq [RSI + #16 (8-bit)], R10 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
04c movl R10, #2147483648 # long (unsigned 32-bit)
052 orq [RSI + #24 (8-bit)], R10 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
056 ...
=> 26 bytes
03c andq [RSI + #16 (8-bit)], #-32769 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
044 orq [RSI + #24 (8-bit)], #32768 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
04c ...
03c andq [RSI + #16 (8-bit)], #-2 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
041 orq [RSI + #24 (8-bit)], #1 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
046 ...
Benchmark Mode Cnt Score Error Units
BitSetAndReset.bitSet avgt 9 78083.773 ± 2182.692 ns/op
And we would have after:
03c btrq [RSI + #16 (8-bit)], log2(not(#9223372036854775807))
# long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq
042 btsq [RSI + #24 (8-bit)], log2(#-9223372036854775808)
# long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.orq
048 ...
=> 12 bytes
03c btrq [RSI + #16 (8-bit)], log2(not(#-2147483649)) #
long ! Field: org/openjdk/bench/vm/compiler/BitSetAndReset.andq
042 xorl RAX, RAX # long
044 movl R10, #2147483648 # long (unsigned 32-bit)
04a orq [RSI + #24 (8-bit)], R10 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
04e ...
=> 18 bytes
03c andq [RSI + #16 (8-bit)], #-32769 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
044 orq [RSI + #24 (8-bit)], #32768 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
04c ...
03c andq [RSI + #16 (8-bit)], #-2 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.andq
041 orq [RSI + #24 (8-bit)], #1 # long ! Field:
org/openjdk/bench/vm/compiler/BitSetAndReset.orq
046 ...
Benchmark Mode Cnt Score Error Units
BitSetAndReset.bitSet avgt 9 77355.154 ± 252.503 ns/op
We see a tiny performance gain with BTR/BTS but the major interest
remains the much better encoding with up to 16 bytes saving for pure
64-bit immediates along with a lower register consumption.
Does the patch below look reasonable enough to eventually rebase and
push it to jdk/submit and to post a RFR maybe soon if all goes well?
Thanks,
Bernard
diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -2069,6 +2069,16 @@
}
%}
+ enc_class Log2L(immPow2L imm)
+ %{
+ emit_d8(cbuf, log2_long($imm$$constant));
+ %}
+
+ enc_class Log2NotL(immPow2NotL imm)
+ %{
+ emit_d8(cbuf, log2_long(~$imm$$constant));
+ %}
+
enc_class opc2_reg(rRegI dst)
%{
// BSWAP
@@ -3131,6 +3141,28 @@
interface(CONST_INTER);
%}
+operand immPow2L()
+%{
+ // n should be a pure 64-bit power of 2 immediate.
+ predicate(is_power_of_2_long(n->get_long()) && log2_long(n->get_long()) > 31);
+ match(ConL);
+
+ op_cost(15);
+ format %{ %}
+ interface(CONST_INTER);
+%}
+
+operand immPow2NotL()
+%{
+ // n should be a pure 64-bit immediate given that not(n) is a power of 2.
+ predicate(is_power_of_2_long(~n->get_long()) && log2_long(~n->get_long()) > 30);
+ match(ConL);
+
+ op_cost(15);
+ format %{ %}
+ interface(CONST_INTER);
+%}
+
// Long Immediate zero
operand immL0()
%{
@@ -9740,6 +9772,19 @@
ins_pipe(ialu_mem_imm);
%}
+instruct btrL_mem_imm(memory dst, immPow2NotL src, rFlagsReg cr)
+%{
+ match(Set dst (StoreL dst (AndL (LoadL dst) src)));
+ effect(KILL cr);
+
+ ins_cost(125);
+ format %{ "btrq $dst, log2(not($src))\t# long" %}
+ opcode(0x0F, 0xBA, 0x06);
+ ins_encode(REX_mem_wide(dst), OpcP, OpcS,
+ RM_opc_mem(tertiary, dst), Log2NotL(src));
+ ins_pipe(ialu_mem_imm);
+%}
+
// BMI1 instructions
instruct andnL_rReg_rReg_mem(rRegL dst, rRegL src1, memory src2, immL_M1 minus_1, rFlagsReg cr) %{
match(Set dst (AndL (XorL src1 minus_1) (LoadL src2)));
@@ -9933,6 +9978,19 @@
ins_pipe(ialu_mem_imm);
%}
+instruct btsL_mem_imm(memory dst, immPow2L src, rFlagsReg cr)
+%{
+ match(Set dst (StoreL dst (OrL (LoadL dst) src)));
+ effect(KILL cr);
+
+ ins_cost(125);
+ format %{ "btsq $dst, log2($src)\t# long" %}
+ opcode(0x0F, 0xBA, 0x05);
+ ins_encode(REX_mem_wide(dst), OpcP, OpcS,
+ RM_opc_mem(tertiary, dst), Log2L(src));
+ ins_pipe(ialu_mem_imm);
+%}
+
// Xor Instructions
// Xor Register with Register
instruct xorL_rReg(rRegL dst, rRegL src, rFlagsReg cr)
More information about the hotspot-compiler-dev
mailing list