RFR: 8285868: x86_64 intrinsics for floating point methods isNaN, isFinite and isInfinite [v8]
Srinivas Vamsi Parasa
duke at openjdk.java.net
Mon May 23 05:15:52 UTC 2022
On Mon, 23 May 2022 04:52:41 GMT, Srinivas Vamsi Parasa <duke at openjdk.java.net> wrote:
>>> For 32-bit, in the case of double, we see a performance improvement using the `vfpclasssd` instruction, but **without** `vfpclasssd` we see a **40% decrease** in performance for `isFinite()` compared to the original Java code. Below is the code which implements the intrinsic using SSE.
>>>
>>> Is it OK to skip support for the **non**-`vfpclasssd` case for 32-bit?
>>
>> Yes, but add a comment about that. Also, for 32-bit you need to check SSE2 support, which is required by `pshuflw`.
>
> Thanks Vladimir! Will add a comment that the intrinsic doesn't give a speedup without `vfpclasssd`.
> Yes, the `predicate(UseSSE>=2)` check was added in the instruct rule shown below.
>
> ```
> instruct DoubleClassCheck_reg_reg_sse(rRegI dst, regD src, rRegI tmp, rRegI tmp1, rFlagsReg cr)
> %{
>   predicate(UseSSE>=2);
>   match(Set dst (IsInfiniteD src));
>   match(Set dst (IsNaND src));
>   match(Set dst (IsFiniteD src));
>   effect(TEMP tmp, TEMP tmp1, KILL cr);
>   format %{ "double_class_check $dst, $src" %}
>   ins_encode %{
>     int opcode = this->ideal_Opcode();
>     __ double_class_check_sse(opcode, $src$$XMMRegister, $dst$$Register, $tmp$$Register,
>                               $tmp1$$Register);
>   %}
>   ins_pipe(pipe_slow);
> %}
> ```
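>
> For context, a minimal Java sketch of the bit-pattern test such a class check performs (illustrative only, not the actual macroassembler code; the float version of the same idea appears in the #8459 assembly further below as `and 0x7fffffff` / `cmp 0x7f800000` / `setb`/`sete`/`seta`):
>
> ```
> // Illustration only: classify a double from its raw IEEE 754 bits.
> // With the sign bit cleared, an all-ones exponent (0x7ff0000000000000L) means
> // infinity (zero mantissa) or NaN (nonzero mantissa); anything smaller is finite.
> static boolean isNaN(double v)      { return (Double.doubleToRawLongBits(v) & 0x7fffffffffffffffL) >  0x7ff0000000000000L; }
> static boolean isInfinite(double v) { return (Double.doubleToRawLongBits(v) & 0x7fffffffffffffffL) == 0x7ff0000000000000L; }
> static boolean isFinite(double v)   { return (Double.doubleToRawLongBits(v) & 0x7fffffffffffffffL) <  0x7ff0000000000000L; }
> ```
>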
> @vamsi-parasa I modified your benchmark to emulate more use cases of these functions and ran it on the baseline, on #8525 with a modified `isInfinite` (using `Math.abs(v) > MAX_VALUE` instead; sketched below), and on this patch (#8459). The results follow; the source code and the assembly for the interesting parts are shown after that.
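>
> A rough sketch of that modified `isInfinite` (my paraphrase of the described change, not the exact #8525 patch text):
>
> ```
> // Any value whose magnitude exceeds MAX_VALUE must be an infinity;
> // for NaN the comparison is false, as required.
> static boolean isInfinite(float v)  { return Math.abs(v) > Float.MAX_VALUE; }
> static boolean isInfinite(double v) { return Math.abs(v) > Double.MAX_VALUE; }
> ```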
>
> ```
> Benchmark                               Mode  Cnt  Baseline         #8459            #8525           Units
> FloatClassCheck.testIsFiniteBranch      avgt    5  2.522 ± 0.094    2.564 ± 0.187    2.512 ± 0.137   ns/op
> FloatClassCheck.testIsFiniteCMov        avgt    5  0.479 ± 0.014    0.786 ± 0.009    0.475 ± 0.005   ns/op
> FloatClassCheck.testIsFiniteStore       avgt    5  0.482 ± 0.010    0.603 ± 0.026    0.480 ± 0.006   ns/op
> FloatClassCheck.testIsInfiniteBranch    avgt    5  1.921 ± 0.043    1.778 ± 0.023    1.767 ± 0.039   ns/op
> FloatClassCheck.testIsInfiniteCMov      avgt    5  1.124 ± 0.045    0.787 ± 0.013    0.622 ± 0.019   ns/op
> FloatClassCheck.testIsInfiniteStore     avgt    5  1.195 ± 0.033    0.602 ± 0.015    0.625 ± 0.033   ns/op
> FloatClassCheck.testIsNaNBranch         avgt    5  1.896 ± 0.182    2.097 ± 0.216    1.725 ± 0.222   ns/op
> FloatClassCheck.testIsNaNCMov           avgt    5  2.956 ± 0.021    0.856 ± 0.003    0.390 ± 0.006   ns/op
> FloatClassCheck.testIsNaNStore          avgt    5  3.024 ± 0.071    0.741 ± 0.139    0.410 ± 0.008   ns/op
>
> Benchmark                               Mode  Cnt  Baseline         #8459            #8525           Units
> DoubleClassCheck.testIsFiniteBranch     avgt    5  2.566 ± 0.105    3.023 ± 0.117    2.603 ± 0.137   ns/op
> DoubleClassCheck.testIsFiniteCMov       avgt    5  0.481 ± 0.010    0.978 ± 0.011    0.485 ± 0.018   ns/op
> DoubleClassCheck.testIsFiniteStore      avgt    5  0.480 ± 0.012    0.943 ± 0.012    0.486 ± 0.011   ns/op
> DoubleClassCheck.testIsInfiniteBranch   avgt    5  1.907 ± 0.081    1.917 ± 0.065    1.808 ± 0.039   ns/op
> DoubleClassCheck.testIsInfiniteCMov     avgt    5  1.111 ± 0.028    0.982 ± 0.019    0.630 ± 0.017   ns/op
> DoubleClassCheck.testIsInfiniteStore    avgt    5  1.134 ± 0.011    0.944 ± 0.017    0.630 ± 0.009   ns/op
> DoubleClassCheck.testIsNaNBranch        avgt    5  1.926 ± 0.218    2.193 ± 0.045    1.767 ± 0.142   ns/op
> DoubleClassCheck.testIsNaNCMov          avgt    5  2.944 ± 0.020    1.047 ± 0.012    0.392 ± 0.009   ns/op
> DoubleClassCheck.testIsNaNStore         avgt    5  3.011 ± 0.065    0.946 ± 0.029    0.411 ± 0.004   ns/op
> ```
>
> The source code for `FloatClassCheck` (that of `DoubleClassCheck` is similar):
>
> ```
> RandomGenerator rng;
> static final int BUFFER_SIZE = 1024;
> float[] inputs;
> boolean[] storeOutputs;
> int[] cmovOutputs;
> int[] branchOutputs;
>
> @CompilerControl(CompilerControl.Mode.DONT_INLINE)
> static int call() {
>     return 1;
> }
>
> @Setup
> public void setup() {
>     storeOutputs = new boolean[BUFFER_SIZE];
>     cmovOutputs = new int[BUFFER_SIZE];
>     branchOutputs = new int[BUFFER_SIZE];
>     inputs = new float[BUFFER_SIZE];
>     RandomGenerator rng = RandomGeneratorFactory.getDefault().create(0);
>     float input;
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         if (i % 5 == 0) {
>             input = (i%2 == 0) ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
>         }
>         else if (i % 3 == 0) input = Float.NaN;
>         else input = rng.nextFloat();
>         inputs[i] = input;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsFiniteStore() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         storeOutputs[i] = Float.isFinite(inputs[i]);
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsInfiniteStore() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         storeOutputs[i] = Float.isInfinite(inputs[i]);
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsNaNStore() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         storeOutputs[i] = Float.isNaN(inputs[i]);
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsFiniteCMov() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isFinite(inputs[i]) ? 9 : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsInfiniteCMov() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isInfinite(inputs[i]) ? 9 : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsNaNCMov() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isNaN(inputs[i]) ? 9 : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsFiniteBranch() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isFinite(inputs[i]) ? call() : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsInfiniteBranch() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isInfinite(inputs[i]) ? call() : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsNaNBranch() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isNaN(inputs[i]) ? call() : 7;
>     }
> }
> ```
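>
> `DoubleClassCheck` is not reproduced here; per the note above it only swaps the element type, roughly along these lines (a sketch, not the exact source):
>
> ```
> double[] inputs;   // filled with NaN, infinities and rng.nextDouble() in setup()
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsNaNCMov() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Double.isNaN(inputs[i]) ? 9 : 7;
>     }
> }
> ```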
>
> The assembly of the interesting parts of the executions:
>
> ```
> FloatClassCheck::testIsFiniteBranch:
> Baseline, #8525:
> vandps -0xd4791(%rip), %xmm0, %xmm1
> vucomiss %xmm1, %xmm2
> jae -0x77
>
> #8459
> vmovd %xmm0, %r11d
> andl $0x7fffffff, %r11d
> cmpl $0x7f800000, %r11d
> setb %r10b
> andl $0xff, %r10d
> testl %r10d, %r10d
> jne -0x90
>
> FloatClassCheck::testIsFiniteCMov:
> Baseline, #8525:
> vandps -0xcdcf6(%rip), %xmm6, %xmm7
> vucomiss %xmm7, %xmm10
> movl $0x9, %r10d
> cmovbl %r14d, %r10d
>
> #8459:
> vmovd %xmm4, %r9d
> andl $0x7fffffff, %r9d
> cmpl $0x7f800000, %r9d
> setb %r8b
> andl $0xff, %r8d
> testl %r8d, %r8d
> movl $0x7, %ebx
> cmovnel %r14d, %ebx
>
> FloatClassCheck::isFiniteStore:
> Baseline, #8525:
> vandps -0xcfd74(%rip), %xmm3, %xmm3
> movl $0x1, %r10d
> vucomiss %xmm3, %xmm0
> cmovbl %r9d, %r10d
>
> #8459:
> vmovd %xmm6, %edi
> andl $0x7fffffff, %edi
> cmpl $0x7f800000, %edi
> setb %dil
> andl $0xff, %edi
>
> FloatClassCheck::isInfiniteBranch:
> Baseline:
> vucomiss -0xc8(%rip), %xmm1
> jp 0x2
> je 0x20
> vucomiss -0xd0(%rip), %xmm1
> nopl (%rax,%rax)
> nop
> jp -0x86
> jne -0x8c
>
> #8459:
> vmovd %xmm1, %r10d
> andl $0x7fffffff, %r10d
> cmpl $0x7f800000, %r10d
> sete %r11b
> andl $0xff, %r11d
> testl %r11d, %r11d
> je -0x87
>
> #8525:
> vandps -0xce478(%rip), %xmm1, %xmm0
> nopl (%rax,%rax)
> vucomiss -0xc8(%rip), %xmm0
> jbe -0x76
>
> FloatClassCheck::isInfiniteCMov:
> Baseline:
> vucomiss -0x128(%rip), %xmm1
> jp 0x2
> je 0x16
> vucomiss -0x130(%rip), %xmm1
> jp 0x2
> je 0xa
>
> #8459:
> vmovd %xmm5, %eax
> andl $0x7fffffff, %eax
> cmpl $0x7f800000, %eax
> sete %bpl
> andl $0xff, %ebp
> testl %ebp, %ebp
> movl $0x7, %eax
> cmovnel %ebx, %eax
>
> #8525:
> vandps -0xcefc3(%rip), %xmm0, %xmm0
> vucomiss -0x12b(%rip), %xmm0
> movl $0x9, %esi
> cmovbel %r8d, %esi
>
> FloatClassCheck::isInfiniteStore:
> Baseline:
> vucomiss -0x128(%rip), %xmm0
> jp 0x2
> je 0x11
> vucomiss -0x130(%rip), %xmm0
> jp 0x2
> je 0x5
>
> #8459:
> vmovd %xmm2, %r8d
> andl $0x7fffffff, %r8d
> cmpl $0x7f800000, %r8d
> sete %r8b
> andl $0xff, %r8d
>
> #8525:
> vandps -0xcf2b9(%rip), %xmm0, %xmm0
> vucomiss -0x121(%rip), %xmm0
> movl $0x1, %r11d
> cmovbel %esi, %r11d
>
> FloatClassCheck::isNaNBranch:
> Baseline:
> vucomiss %xmm0, %xmm0
> jp 0x2
> je -0x64
>
> #8459:
> vmovd %xmm1, %r10d
> andl $0x7fffffff, %r10d
> cmpl $0x7f800000, %r10d
> seta %r11b
> andl $0xff, %r11d
> testl %r11d, %r11d
> je -0x87
>
> #8525:
> vucomiss %xmm1, %xmm1
> jnp -0x62
>
> FloatClassCheck::isNaNCMov:
> Baseline:
> vucomiss %xmm5, %xmm5
> jnp 0xa
> pushfq
> andq $-0xd5, (%rsp)
> popfq
> movl $0x7, %r9d
> cmovnel %r8d, %r9d
>
> #8459:
> vmovd %xmm4, %ebp
> andl $0x7fffffff, %ebp
> cmpl $0x7f800000, %ebp
> seta %al
> andl $0xff, %eax
> testl %eax, %eax
> movl $0x7, %ebp
> cmovnel %ebx, %ebp
>
> #8525:
> vucomiss %xmm4, %xmm4
> movl $0x7, %r9d
> cmovpl %r8d, %r9d
>
> FloatClassCheck::isNaNStore:
> Baseline:
> vucomiss %xmm3, %xmm3
> jnp 0xa
> pushfq
> andq $-0xd5, (%rsp)
> popfq
> movl $0x1, %ebx
> cmovel %eax, %ebx
>
> #8459:
> vmovd %xmm6, %edi
> andl $0x7fffffff, %edi
> cmpl $0x7f800000, %edi
> seta %dil
> andl $0xff, %edi
>
> #8525:
> movl $0x1, %r9d
> vucomiss %xmm0, %xmm0
> cmovnpl %r10d, %r9d
> ```
>
> The assembly output for `DoubleClassCheck` is similar. Thanks.
Thanks for sharing the performance data. Your patch (#8525) is showing a `~2.5x` improvement over the intrinsic for the case of `{Float/Double}ClassCheck.testIsNaNCMov`.
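(From the tables above, for `testIsNaNCMov`: `DoubleClassCheck` goes from 1.047 ns/op with this patch (#8459) to 0.392 ns/op with #8525, roughly 2.7x, and `FloatClassCheck` from 0.856 to 0.390 ns/op, roughly 2.2x.)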
-------------
PR: https://git.openjdk.java.net/jdk/pull/8459