RFR: 8285868: x86_64 intrinsics for floating point methods isNaN, isFinite and isInfinite [v8]
Srinivas Vamsi Parasa
duke at openjdk.java.net
Mon May 23 05:15:52 UTC 2022
On Mon, 23 May 2022 04:52:41 GMT, Srinivas Vamsi Parasa <duke at openjdk.java.net> wrote:
>>> For 32-bit, in the case of double, we see a performance improvement using the `vfpclasssd` instruction, but **without** `vfpclasssd` we see a **40% decrease** in performance for `isFinite()` compared to the original Java code. Below is the code which implements the intrinsic using SSE.
>>>
>>> Is it OK to skip support for the **non**-`vfpclasssd` case for 32-bit?
>>
>> Yes, but add a comment about that. Also, for 32-bit you need to check SSE2 support, which is required by `pshuflw`.
>
> Thanks Vladimir! Will add a comment that the intrinsic doesn't give a speedup without `vfpclasssd`.
> Yes, the `predicate(UseSSE>=2)` check was added in the instruct rule shown below.
>
> ```
> instruct DoubleClassCheck_reg_reg_sse(rRegI dst, regD src, rRegI tmp, rRegI tmp1, rFlagsReg cr)
> %{
>   predicate(UseSSE>=2);
>   match(Set dst (IsInfiniteD src));
>   match(Set dst (IsNaND src));
>   match(Set dst (IsFiniteD src));
>   effect(TEMP tmp, TEMP tmp1, KILL cr);
>   format %{ "double_class_check $dst, $src" %}
>   ins_encode %{
>     int opcode = this->ideal_Opcode();
>     __ double_class_check_sse(opcode, $src$$XMMRegister, $dst$$Register, $tmp$$Register,
>                               $tmp1$$Register);
>   %}
>   ins_pipe(pipe_slow);
> %}
> ```
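>
> For context, a minimal Java sketch of the bit-pattern test such a class check performs (illustrative only, not the actual macroassembler code; the float version of the same idea appears in the #8459 assembly further below as `and 0x7fffffff` / `cmp 0x7f800000` / `setb`/`sete`/`seta`):
>
> ```
> // Illustration only: classify a double from its raw IEEE 754 bits.
> // With the sign bit cleared, an all-ones exponent (0x7ff0000000000000L) means
> // infinity (zero mantissa) or NaN (nonzero mantissa); anything smaller is finite.
> static boolean isNaN(double v)      { return (Double.doubleToRawLongBits(v) & 0x7fffffffffffffffL) >  0x7ff0000000000000L; }
> static boolean isInfinite(double v) { return (Double.doubleToRawLongBits(v) & 0x7fffffffffffffffL) == 0x7ff0000000000000L; }
> static boolean isFinite(double v)   { return (Double.doubleToRawLongBits(v) & 0x7fffffffffffffffL) <  0x7ff0000000000000L; }
> ```
>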
> @vamsi-parasa I modified your benchmark to emulate more use cases of these functions and ran it on the baseline, on #8525 with a modified `isInfinite` (using `Math.abs(v) > MAX_VALUE` instead; sketched below), and on this patch (#8459). The results follow; the source code and the assembly for the interesting parts are shown after that.
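>
> A rough sketch of that modified `isInfinite` (my paraphrase of the described change, not the exact #8525 patch text):
>
> ```
> // Any value whose magnitude exceeds MAX_VALUE must be an infinity;
> // for NaN the comparison is false, as required.
> static boolean isInfinite(float v)  { return Math.abs(v) > Float.MAX_VALUE; }
> static boolean isInfinite(double v) { return Math.abs(v) > Double.MAX_VALUE; }
> ```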
>
> ```
> Benchmark                               Mode  Cnt  Baseline         #8459            #8525           Units
> FloatClassCheck.testIsFiniteBranch      avgt    5  2.522 ± 0.094    2.564 ± 0.187    2.512 ± 0.137   ns/op
> FloatClassCheck.testIsFiniteCMov        avgt    5  0.479 ± 0.014    0.786 ± 0.009    0.475 ± 0.005   ns/op
> FloatClassCheck.testIsFiniteStore       avgt    5  0.482 ± 0.010    0.603 ± 0.026    0.480 ± 0.006   ns/op
> FloatClassCheck.testIsInfiniteBranch    avgt    5  1.921 ± 0.043    1.778 ± 0.023    1.767 ± 0.039   ns/op
> FloatClassCheck.testIsInfiniteCMov      avgt    5  1.124 ± 0.045    0.787 ± 0.013    0.622 ± 0.019   ns/op
> FloatClassCheck.testIsInfiniteStore     avgt    5  1.195 ± 0.033    0.602 ± 0.015    0.625 ± 0.033   ns/op
> FloatClassCheck.testIsNaNBranch         avgt    5  1.896 ± 0.182    2.097 ± 0.216    1.725 ± 0.222   ns/op
> FloatClassCheck.testIsNaNCMov           avgt    5  2.956 ± 0.021    0.856 ± 0.003    0.390 ± 0.006   ns/op
> FloatClassCheck.testIsNaNStore          avgt    5  3.024 ± 0.071    0.741 ± 0.139    0.410 ± 0.008   ns/op
>
> Benchmark                               Mode  Cnt  Baseline         #8459            #8525           Units
> DoubleClassCheck.testIsFiniteBranch     avgt    5  2.566 ± 0.105    3.023 ± 0.117    2.603 ± 0.137   ns/op
> DoubleClassCheck.testIsFiniteCMov       avgt    5  0.481 ± 0.010    0.978 ± 0.011    0.485 ± 0.018   ns/op
> DoubleClassCheck.testIsFiniteStore      avgt    5  0.480 ± 0.012    0.943 ± 0.012    0.486 ± 0.011   ns/op
> DoubleClassCheck.testIsInfiniteBranch   avgt    5  1.907 ± 0.081    1.917 ± 0.065    1.808 ± 0.039   ns/op
> DoubleClassCheck.testIsInfiniteCMov     avgt    5  1.111 ± 0.028    0.982 ± 0.019    0.630 ± 0.017   ns/op
> DoubleClassCheck.testIsInfiniteStore    avgt    5  1.134 ± 0.011    0.944 ± 0.017    0.630 ± 0.009   ns/op
> DoubleClassCheck.testIsNaNBranch        avgt    5  1.926 ± 0.218    2.193 ± 0.045    1.767 ± 0.142   ns/op
> DoubleClassCheck.testIsNaNCMov          avgt    5  2.944 ± 0.020    1.047 ± 0.012    0.392 ± 0.009   ns/op
> DoubleClassCheck.testIsNaNStore         avgt    5  3.011 ± 0.065    0.946 ± 0.029    0.411 ± 0.004   ns/op
> ```
>
> The source code for `FloatClassCheck` (that of `DoubleClassCheck` is similar):
>
> ```
> RandomGenerator rng;
> static final int BUFFER_SIZE = 1024;
> float[] inputs;
> boolean[] storeOutputs;
> int[] cmovOutputs;
> int[] branchOutputs;
>
> @CompilerControl(CompilerControl.Mode.DONT_INLINE)
> static int call() {
>     return 1;
> }
>
> @Setup
> public void setup() {
>     storeOutputs = new boolean[BUFFER_SIZE];
>     cmovOutputs = new int[BUFFER_SIZE];
>     branchOutputs = new int[BUFFER_SIZE];
>     inputs = new float[BUFFER_SIZE];
>     RandomGenerator rng = RandomGeneratorFactory.getDefault().create(0);
>     float input;
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         if (i % 5 == 0) {
>             input = (i%2 == 0) ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
>         }
>         else if (i % 3 == 0) input = Float.NaN;
>         else input = rng.nextFloat();
>         inputs[i] = input;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsFiniteStore() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         storeOutputs[i] = Float.isFinite(inputs[i]);
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsInfiniteStore() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         storeOutputs[i] = Float.isInfinite(inputs[i]);
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsNaNStore() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         storeOutputs[i] = Float.isNaN(inputs[i]);
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsFiniteCMov() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isFinite(inputs[i]) ? 9 : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsInfiniteCMov() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isInfinite(inputs[i]) ? 9 : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsNaNCMov() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isNaN(inputs[i]) ? 9 : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsFiniteBranch() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isFinite(inputs[i]) ? call() : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsInfiniteBranch() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isInfinite(inputs[i]) ? call() : 7;
>     }
> }
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsNaNBranch() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Float.isNaN(inputs[i]) ? call() : 7;
>     }
> }
> ```
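>
> `DoubleClassCheck` is not reproduced here; per the note above it only swaps the element type, roughly along these lines (a sketch, not the exact source):
>
> ```
> double[] inputs;   // filled with NaN, infinities and rng.nextDouble() in setup()
>
> @Benchmark
> @OperationsPerInvocation(BUFFER_SIZE)
> public void testIsNaNCMov() {
>     for (int i = 0; i < BUFFER_SIZE; i++) {
>         cmovOutputs[i] = Double.isNaN(inputs[i]) ? 9 : 7;
>     }
> }
> ```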
>
> The assembly of the interesting parts of the executions:
>
> ```
> FloatClassCheck::testIsFiniteBranch:
> Baseline, #8525:
> vandps -0xd4791(%rip), %xmm0, %xmm1
> vucomiss %xmm1, %xmm2
> jae -0x77
>
> #8459
> vmovd %xmm0, %r11d
> andl $0x7fffffff, %r11d
> cmpl $0x7f800000, %r11d
> setb %r10b
> andl $0xff, %r10d
> testl %r10d, %r10d
> jne -0x90
>
> FloatClassCheck::testIsFiniteCMov:
> Baseline, #8525:
> vandps -0xcdcf6(%rip), %xmm6, %xmm7
> vucomiss %xmm7, %xmm10
> movl $0x9, %r10d
> cmovbl %r14d, %r10d
>
> #8459:
> vmovd %xmm4, %r9d
> andl $0x7fffffff, %r9d
> cmpl $0x7f800000, %r9d
> setb %r8b
> andl $0xff, %r8d
> testl %r8d, %r8d
> movl $0x7, %ebx
> cmovnel %r14d, %ebx
>
> FloatClassCheck::isFiniteStore:
> Baseline, #8525:
> vandps -0xcfd74(%rip), %xmm3, %xmm3
> movl $0x1, %r10d
> vucomiss %xmm3, %xmm0
> cmovbl %r9d, %r10d
>
> #8459:
> vmovd %xmm6, %edi
> andl $0x7fffffff, %edi
> cmpl $0x7f800000, %edi
> setb %dil
> andl $0xff, %edi
>
> FloatClassCheck::isInfiniteBranch:
> Baseline:
> vucomiss -0xc8(%rip), %xmm1
> jp 0x2
> je 0x20
> vucomiss -0xd0(%rip), %xmm1
> nopl (%rax,%rax)
> nop
> jp -0x86
> jne -0x8c
>
> #8459:
> vmovd %xmm1, %r10d
> andl $0x7fffffff, %r10d
> cmpl $0x7f800000, %r10d
> sete %r11b
> andl $0xff, %r11d
> testl %r11d, %r11d
> je -0x87
>
> #8525:
> vandps -0xce478(%rip), %xmm1, %xmm0
> nopl (%rax,%rax)
> vucomiss -0xc8(%rip), %xmm0
> jbe -0x76
>
> FloatClassCheck::isInfiniteCMov:
> Baseline:
> vucomiss -0x128(%rip), %xmm1
> jp 0x2
> je 0x16
> vucomiss -0x130(%rip), %xmm1
> jp 0x2
> je 0xa
>
> #8459:
> vmovd %xmm5, %eax
> andl $0x7fffffff, %eax
> cmpl $0x7f800000, %eax
> sete %bpl
> andl $0xff, %ebp
> testl %ebp, %ebp
> movl $0x7, %eax
> cmovnel %ebx, %eax
>
> #8525:
> vandps -0xcefc3(%rip), %xmm0, %xmm0
> vucomiss -0x12b(%rip), %xmm0
> movl $0x9, %esi
> cmovbel %r8d, %esi
>
> FloatClassCheck::isInfiniteStore:
> Baseline:
> vucomiss -0x128(%rip), %xmm0
> jp 0x2
> je 0x11
> vucomiss -0x130(%rip), %xmm0
> jp 0x2
> je 0x5
>
> #8459:
> vmovd %xmm2, %r8d
> andl $0x7fffffff, %r8d
> cmpl $0x7f800000, %r8d
> sete %r8b
> andl $0xff, %r8d
>
> #8525:
> vandps -0xcf2b9(%rip), %xmm0, %xmm0
> vucomiss -0x121(%rip), %xmm0
> movl $0x1, %r11d
> cmovbel %esi, %r11d
>
> FloatClassCheck::isNaNBranch:
> Baseline:
> vucomiss %xmm0, %xmm0
> jp 0x2
> je -0x64
>
> #8459:
> vmovd %xmm1, %r10d
> andl $0x7fffffff, %r10d
> cmpl $0x7f800000, %r10d
> seta %r11b
> andl $0xff, %r11d
> testl %r11d, %r11d
> je -0x87
>
> #8525:
> vucomiss %xmm1, %xmm1
> jnp -0x62
>
> FloatClassCheck::isNaNCMov:
> Baseline:
> vucomiss %xmm5, %xmm5
> jnp 0xa
> pushfq
> andq $-0xd5, (%rsp)
> popfq
> movl $0x7, %r9d
> cmovnel %r8d, %r9d
>
> #8459:
> vmovd %xmm4, %ebp
> andl $0x7fffffff, %ebp
> cmpl $0x7f800000, %ebp
> seta %al
> andl $0xff, %eax
> testl %eax, %eax
> movl $0x7, %ebp
> cmovnel %ebx, %ebp
>
> #8525:
> vucomiss %xmm4, %xmm4
> movl $0x7, %r9d
> cmovpl %r8d, %r9d
>
> FloatClassCheck::isNaNStore:
> Baseline:
> vucomiss %xmm3, %xmm3
> jnp 0xa
> pushfq
> andq $-0xd5, (%rsp)
> popfq
> movl $0x1, %ebx
> cmovel %eax, %ebx
>
> #8459:
> vmovd %xmm6, %edi
> andl $0x7fffffff, %edi
> cmpl $0x7f800000, %edi
> seta %dil
> andl $0xff, %edi
>
> #8525:
> movl $0x1, %r9d
> vucomiss %xmm0, %xmm0
> cmovnpl %r10d, %r9d
> ```
>
> The assembly output for `DoubleClassCheck` is similar. Thanks.
Thanks for sharing the performance data. Your patch (#8525) is showing a `~2.5x` improvement over the intrinsic for the case of `{Float/Double}ClassCheck.testIsNaNCMov`.
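(From the tables above, for `testIsNaNCMov`: `DoubleClassCheck` goes from 1.047 ns/op with this patch (#8459) to 0.392 ns/op with #8525, roughly 2.7x, and `FloatClassCheck` from 0.856 to 0.390 ns/op, roughly 2.2x.)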
-------------
PR: https://git.openjdk.java.net/jdk/pull/8459