RFR: 8285868: x86_64 intrinsics for floating point methods isNaN, isFinite and isInfinite [v8]
Quan Anh Mai
duke at openjdk.java.net
Sat May 21 09:47:55 UTC 2022
On Sat, 21 May 2022 07:40:20 GMT, Srinivas Vamsi Parasa <duke at openjdk.java.net> wrote:
>> Impressive. Few comments.
>>
>> You are testing the performance of storing `boolean` results into an array, but usually these Java methods are used in conditions. Measuring that would be a more real-world case. Do it for both cases: with `avx512dq` on and off.
>>
>> And you need to post your perf results, at least in the RFE. Please also show what instructions are currently generated vs. with your changes. I don't get how you made `isNaN()` faster - you generate more instructions, it seems.
>>
>> Instead of 3 new Ideal nodes per type you can use one and store the intrinsic id (or another enum) in its field, which you can read in the `.ad` file instructions. Instead I suggest splitting those mach instructions based on `avx512dq` support to avoid killing unused registers.
>>
>> Why Double type support is limited to LP64? Why there is no `x86_32.ad` changes?
>>
>> You can reuse `tmp1` in `double_class_check()`.
>
> Hi Vladimir (@vnkozlov)
>
> For 32-bit, in the case of double, we see a performance improvement using the `vfpclasssd` instruction, but **without** `vfpclasssd` we see a **40% decrease** in performance for `isFinite()` compared to the original Java code. Below is the code which implements the intrinsic using SSE.
>
> Is it OK to skip support for the **non**-`vfpclasssd` case on 32-bit?
>
>
> void C2_MacroAssembler::double_class_check_sse(int opcode, XMMRegister src, Register dst, Register temp, Register temp1) {
> int32_t POS_INF_HI = 0x7ff00000; // hi 32bits
> int32_t KILL_SIGN_MASK_HI = 0x7fffffff; // hi 32 bits
>
> pshuflw(src, src, 0x4e); //switch hi to lo
> movdl(temp, src);
> movl(temp1, KILL_SIGN_MASK_HI);
> andl(temp, temp1);
> movl(temp1, POS_INF_HI);
> cmpl(temp, temp1);
> switch (opcode) {
> case Op_IsFiniteD:
> setb(Assembler::below, dst);
> break;
> case Op_IsInfiniteD:
> setb(Assembler::equal, dst);
> break;
> case Op_IsNaND:
> setb(Assembler::above, dst);
> break;
> default:
> assert(false, "%s", NodeClassNames[opcode]);
> }
> andl(dst, 0xff);
> }
@vamsi-parasa I modified your benchmark to emulate more use cases of these functions and ran it on the baseline, on #8525 with a modified `isInfinite` (using `Math.abs(v) > MAX_VALUE` instead), and on this patch (#8459). The results are as follows; the source code and the assembly for the interesting parts are shown further below.
Baseline #8459 #8525
Benchmark Mode Cnt Score Error Score Error Score Error Units
FloatClassCheck.testIsFiniteBranch avgt 5 2.522 ± 0.094 2.564 ± 0.187 2.512 ± 0.137 ns/op
FloatClassCheck.testIsFiniteCMov avgt 5 0.479 ± 0.014 0.786 ± 0.009 0.475 ± 0.005 ns/op
FloatClassCheck.testIsFiniteStore avgt 5 0.482 ± 0.010 0.603 ± 0.026 0.480 ± 0.006 ns/op
FloatClassCheck.testIsInfiniteBranch avgt 5 1.921 ± 0.043 1.778 ± 0.023 1.767 ± 0.039 ns/op
FloatClassCheck.testIsInfiniteCMov avgt 5 1.124 ± 0.045 0.787 ± 0.013 0.622 ± 0.019 ns/op
FloatClassCheck.testIsInfiniteStore avgt 5 1.195 ± 0.033 0.602 ± 0.015 0.625 ± 0.033 ns/op
FloatClassCheck.testIsNaNBranch avgt 5 1.896 ± 0.182 2.097 ± 0.216 1.725 ± 0.222 ns/op
FloatClassCheck.testIsNaNCMov avgt 5 2.956 ± 0.021 0.856 ± 0.003 0.390 ± 0.006 ns/op
FloatClassCheck.testIsNaNStore avgt 5 3.024 ± 0.071 0.741 ± 0.139 0.410 ± 0.008 ns/op
Baseline #8459 #8525
Benchmark Mode Cnt Score Error Score Error Score Error Units
DoubleClassCheck.testIsFiniteBranch avgt 5 2.566 ± 0.105 3.023 ± 0.117 2.603 ± 0.137 ns/op
DoubleClassCheck.testIsFiniteCMov avgt 5 0.481 ± 0.010 0.978 ± 0.011 0.485 ± 0.018 ns/op
DoubleClassCheck.testIsFiniteStore avgt 5 0.480 ± 0.012 0.943 ± 0.012 0.486 ± 0.011 ns/op
DoubleClassCheck.testIsInfiniteBranch avgt 5 1.907 ± 0.081 1.917 ± 0.065 1.808 ± 0.039 ns/op
DoubleClassCheck.testIsInfiniteCMov avgt 5 1.111 ± 0.028 0.982 ± 0.019 0.630 ± 0.017 ns/op
DoubleClassCheck.testIsInfiniteStore avgt 5 1.134 ± 0.011 0.944 ± 0.017 0.630 ± 0.009 ns/op
DoubleClassCheck.testIsNaNBranch avgt 5 1.926 ± 0.218 2.193 ± 0.045 1.767 ± 0.142 ns/op
DoubleClassCheck.testIsNaNCMov avgt 5 2.944 ± 0.020 1.047 ± 0.012 0.392 ± 0.009 ns/op
DoubleClassCheck.testIsNaNStore avgt 5 3.011 ± 0.065 0.946 ± 0.029 0.411 ± 0.004 ns/op
The source code for `FloatClassCheck` is shown below; that of `DoubleClassCheck` is similar.
// Random generator handle kept on the benchmark state; setup() fills inputs from a generator seeded with 0.
RandomGenerator rng;
// Length of every array below and the trip count of each benchmark loop.
static final int BUFFER_SIZE = 1024;
// Values classified by the benchmarks: a mix of infinities, NaNs and random floats (see setup()).
float[] inputs;
// Destination of the test*Store benchmarks (one boolean per input).
boolean[] storeOutputs;
// Destination of the test*CMov benchmarks (one int per input).
int[] cmovOutputs;
// Allocated in setup(); presumably meant as the destination of the test*Branch benchmarks - confirm against those methods.
int[] branchOutputs;
/**
 * Helper invoked from the taken side of the ternary in the test*Branch benchmarks.
 * Always returns 1. The DONT_INLINE compiler control asks the JIT not to inline it,
 * so the ternary in the callers contains a method call.
 *
 * @return the constant 1
 */
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
static int call() {
return 1;
}
/**
 * Allocates the three output buffers and fills {@code inputs} with a deterministic
 * mix of values, all of length {@code BUFFER_SIZE}:
 * <ul>
 *   <li>indices divisible by 5 hold an infinity (negative when the index is even,
 *       positive when it is odd);</li>
 *   <li>remaining indices divisible by 3 hold {@code Float.NaN};</li>
 *   <li>every other index holds {@code rng.nextFloat()}.</li>
 * </ul>
 * The generator is created with seed 0 so the input pattern is the same on every run.
 */
@Setup
public void setup() {
    storeOutputs = new boolean[BUFFER_SIZE];
    cmovOutputs = new int[BUFFER_SIZE];
    branchOutputs = new int[BUFFER_SIZE];
    inputs = new float[BUFFER_SIZE];
    // Assign the rng field instead of declaring a local of the same name, which
    // would shadow the field and leave it null.
    rng = RandomGeneratorFactory.getDefault().create(0);
    for (int i = 0; i < BUFFER_SIZE; i++) {
        float input;
        if (i % 5 == 0) {
            input = (i % 2 == 0) ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
        } else if (i % 3 == 0) {
            input = Float.NaN;
        } else {
            input = rng.nextFloat();
        }
        inputs[i] = input;
    }
}
/**
 * Benchmark: for every index in [0, BUFFER_SIZE), stores the boolean result of
 * {@code Float.isFinite(inputs[i])} into {@code storeOutputs[i]}.
 * Declared as BUFFER_SIZE operations per invocation, one per array element.
 */
@Benchmark
@OperationsPerInvocation(BUFFER_SIZE)
public void testIsFiniteStore() {
for (int i = 0; i < BUFFER_SIZE; i++) {
storeOutputs[i] = Float.isFinite(inputs[i]);
}
}
/**
 * Benchmark: for every index in [0, BUFFER_SIZE), stores the boolean result of
 * {@code Float.isInfinite(inputs[i])} into {@code storeOutputs[i]}.
 * Declared as BUFFER_SIZE operations per invocation, one per array element.
 */
@Benchmark
@OperationsPerInvocation(BUFFER_SIZE)
public void testIsInfiniteStore() {
for (int i = 0; i < BUFFER_SIZE; i++) {
storeOutputs[i] = Float.isInfinite(inputs[i]);
}
}
/**
 * Benchmark: for every index in [0, BUFFER_SIZE), stores the boolean result of
 * {@code Float.isNaN(inputs[i])} into {@code storeOutputs[i]}.
 * Declared as BUFFER_SIZE operations per invocation, one per array element.
 */
@Benchmark
@OperationsPerInvocation(BUFFER_SIZE)
public void testIsNaNStore() {
for (int i = 0; i < BUFFER_SIZE; i++) {
storeOutputs[i] = Float.isNaN(inputs[i]);
}
}
/**
 * Benchmark: for every index in [0, BUFFER_SIZE), writes 9 to {@code cmovOutputs[i]}
 * when {@code Float.isFinite(inputs[i])} is true and 7 otherwise.
 * Both arms are constants; going by the method name, the ternary is expected to be
 * compiled as a conditional move rather than a branch.
 */
@Benchmark
@OperationsPerInvocation(BUFFER_SIZE)
public void testIsFiniteCMov() {
for (int i = 0; i < BUFFER_SIZE; i++) {
cmovOutputs[i] = Float.isFinite(inputs[i]) ? 9 : 7;
}
}
/**
 * Benchmark: for every index in [0, BUFFER_SIZE), writes 9 to {@code cmovOutputs[i]}
 * when {@code Float.isInfinite(inputs[i])} is true and 7 otherwise.
 * Both arms are constants; going by the method name, the ternary is expected to be
 * compiled as a conditional move rather than a branch.
 */
@Benchmark
@OperationsPerInvocation(BUFFER_SIZE)
public void testIsInfiniteCMov() {
for (int i = 0; i < BUFFER_SIZE; i++) {
cmovOutputs[i] = Float.isInfinite(inputs[i]) ? 9 : 7;
}
}
/**
 * Benchmark: for every index in [0, BUFFER_SIZE), writes 9 to {@code cmovOutputs[i]}
 * when {@code Float.isNaN(inputs[i])} is true and 7 otherwise.
 * Both arms are constants; going by the method name, the ternary is expected to be
 * compiled as a conditional move rather than a branch.
 */
@Benchmark
@OperationsPerInvocation(BUFFER_SIZE)
public void testIsNaNCMov() {
for (int i = 0; i < BUFFER_SIZE; i++) {
cmovOutputs[i] = Float.isNaN(inputs[i]) ? 9 : 7;
}
}
/**
 * Benchmark: for every index in [0, BUFFER_SIZE), writes the result of {@code call()}
 * to {@code branchOutputs[i]} when {@code Float.isFinite(inputs[i])} is true and 7
 * otherwise. One arm of the ternary is a method call, so the check is consumed by
 * control flow rather than by a constant select.
 */
@Benchmark
@OperationsPerInvocation(BUFFER_SIZE)
public void testIsFiniteBranch() {
    for (int i = 0; i < BUFFER_SIZE; i++) {
        // Store into the buffer dedicated to the branch benchmarks; writing to
        // cmovOutputs here left branchOutputs allocated but unused.
        branchOutputs[i] = Float.isFinite(inputs[i]) ? call() : 7;
    }
}
/**
 * Benchmark: for every index in [0, BUFFER_SIZE), writes the result of {@code call()}
 * to {@code branchOutputs[i]} when {@code Float.isInfinite(inputs[i])} is true and 7
 * otherwise. One arm of the ternary is a method call, so the check is consumed by
 * control flow rather than by a constant select.
 */
@Benchmark
@OperationsPerInvocation(BUFFER_SIZE)
public void testIsInfiniteBranch() {
    for (int i = 0; i < BUFFER_SIZE; i++) {
        // Store into the buffer dedicated to the branch benchmarks; writing to
        // cmovOutputs here left branchOutputs allocated but unused.
        branchOutputs[i] = Float.isInfinite(inputs[i]) ? call() : 7;
    }
}
/**
 * Benchmark: for every index in [0, BUFFER_SIZE), writes the result of {@code call()}
 * to {@code branchOutputs[i]} when {@code Float.isNaN(inputs[i])} is true and 7
 * otherwise. One arm of the ternary is a method call, so the check is consumed by
 * control flow rather than by a constant select.
 */
@Benchmark
@OperationsPerInvocation(BUFFER_SIZE)
public void testIsNaNBranch() {
    for (int i = 0; i < BUFFER_SIZE; i++) {
        // Store into the buffer dedicated to the branch benchmarks; writing to
        // cmovOutputs here left branchOutputs allocated but unused.
        branchOutputs[i] = Float.isNaN(inputs[i]) ? call() : 7;
    }
}
The assembly of the interesting parts of the executions:
FloatClassCheck::testIsFiniteBranch:
Baseline, #8525:
vandps -0xd4791(%rip), %xmm0, %xmm1
vucomiss %xmm1, %xmm2
jae -0x77
#8459
vmovd %xmm0, %r11d
andl $0x7fffffff, %r11d
cmpl $0x7f800000, %r11d
setb %r10b
andl $0xff, %r10d
testl %r10d, %r10d
jne -0x90
FloatClassCheck::testIsFiniteCMov:
Baseline, #8525:
vandps -0xcdcf6(%rip), %xmm6, %xmm7
vucomiss %xmm7, %xmm10
movl $0x9, %r10d
cmovbl %r14d, %r10d
#8459:
vmovd %xmm4, %r9d
andl $0x7fffffff, %r9d
cmpl $0x7f800000, %r9d
setb %r8b
andl $0xff, %r8d
testl %r8d, %r8d
movl $0x7, %ebx
cmovnel %r14d, %ebx
FloatClassCheck::testIsFiniteStore:
Baseline, #8525:
vandps -0xcfd74(%rip), %xmm3, %xmm3
movl $0x1, %r10d
vucomiss %xmm3, %xmm0
cmovbl %r9d, %r10d
#8459:
vmovd %xmm6, %edi
andl $0x7fffffff, %edi
cmpl $0x7f800000, %edi
setb %dil
andl $0xff, %edi
FloatClassCheck::testIsInfiniteBranch:
Baseline:
vucomiss -0xc8(%rip), %xmm1
jp 0x2
je 0x20
vucomiss -0xd0(%rip), %xmm1
nopl (%rax,%rax)
nop
jp -0x86
jne -0x8c
#8459:
vmovd %xmm1, %r10d
andl $0x7fffffff, %r10d
cmpl $0x7f800000, %r10d
sete %r11b
andl $0xff, %r11d
testl %r11d, %r11d
je -0x87
#8525:
vandps -0xce478(%rip), %xmm1, %xmm0
nopl (%rax,%rax)
vucomiss -0xc8(%rip), %xmm0
jbe -0x76
FloatClassCheck::testIsInfiniteCMov:
Baseline:
vucomiss -0x128(%rip), %xmm1
jp 0x2
je 0x16
vucomiss -0x130(%rip), %xmm1
jp 0x2
je 0xa
#8459:
vmovd %xmm5, %eax
andl $0x7fffffff, %eax
cmpl $0x7f800000, %eax
sete %bpl
andl $0xff, %ebp
testl %ebp, %ebp
movl $0x7, %eax
cmovnel %ebx, %eax
#8525:
vandps -0xcefc3(%rip), %xmm0, %xmm0
vucomiss -0x12b(%rip), %xmm0
movl $0x9, %esi
cmovbel %r8d, %esi
FloatClassCheck::testIsInfiniteStore:
Baseline:
vucomiss -0x128(%rip), %xmm0
jp 0x2
je 0x11
vucomiss -0x130(%rip), %xmm0
jp 0x2
je 0x5
#8459:
vmovd %xmm2, %r8d
andl $0x7fffffff, %r8d
cmpl $0x7f800000, %r8d
sete %r8b
andl $0xff, %r8d
#8525:
vandps -0xcf2b9(%rip), %xmm0, %xmm0
vucomiss -0x121(%rip), %xmm0
movl $0x1, %r11d
cmovbel %esi, %r11d
FloatClassCheck::testIsNaNBranch:
Baseline:
vucomiss %xmm0, %xmm0
jp 0x2
je -0x64
#8459:
vmovd %xmm1, %r10d
andl $0x7fffffff, %r10d
cmpl $0x7f800000, %r10d
seta %r11b
andl $0xff, %r11d
testl %r11d, %r11d
je -0x87
#8525:
vucomiss %xmm1, %xmm1
jnp -0x62
FloatClassCheck::testIsNaNCMov:
Baseline:
vucomiss %xmm5, %xmm5
jnp 0xa
pushfq
andq $-0xd5, (%rsp)
popfq
movl $0x7, %r9d
cmovnel %r8d, %r9d
#8459:
vmovd %xmm4, %ebp
andl $0x7fffffff, %ebp
cmpl $0x7f800000, %ebp
seta %al
andl $0xff, %eax
testl %eax, %eax
movl $0x7, %ebp
cmovnel %ebx, %ebp
#8525:
vucomiss %xmm4, %xmm4
movl $0x7, %r9d
cmovpl %r8d, %r9d
FloatClassCheck::testIsNaNStore:
Baseline:
vucomiss %xmm3, %xmm3
jnp 0xa
pushfq
andq $-0xd5, (%rsp)
popfq
movl $0x1, %ebx
cmovel %eax, %ebx
#8459:
vmovd %xmm6, %edi
andl $0x7fffffff, %edi
cmpl $0x7f800000, %edi
seta %dil
andl $0xff, %edi
#8525:
movl $0x1, %r9d
vucomiss %xmm0, %xmm0
cmovnpl %r10d, %r9d
The assembly output for `DoubleClassCheck` is similar. Thanks.
-------------
PR: https://git.openjdk.java.net/jdk/pull/8459
More information about the hotspot-compiler-dev
mailing list