RFR: 8285868: x86_64 intrinsics for floating point methods isNaN, isFinite and isInfinite [v8]

Sat May 21 09:47:55 UTC 2022

On Sat, 21 May 2022 07:40:20 GMT, Srinivas Vamsi Parasa <duke at openjdk.java.net> wrote:

>> Impressive. Few comments.
>> 
>> You are testing the performance of storing `boolean` results into an array, but usually these Java methods are used in conditions. Measuring that would be a more real-world case. Do it for both cases: with `avx512dq` on and off.
>> 
>> And you need to post your perf results, at least in the RFE. Please also show what instructions are currently generated vs. your changes. I don't get how you made `isNaN()` faster - you generate more instructions, it seems.
>> 
>> Instead of 3 new Ideal nodes per type you can use one and store the intrinsic id (or other enum) in its field, which you can read in `.ad` file instructions. Instead, I suggest splitting those mach instructions based on `avx512dq` support to avoid killing unused registers.
>> 
>> Why Double type support is limited to LP64? Why there is no `x86_32.ad` changes?
>> 
>> You can reuse `tmp1` in `double_class_check()`.
>
> Hi Vladimir (@vnkozlov)
> 
> For 32-bit, in the case of double, we see a performance improvement using the `vfpclasssd` instruction, but **without** `vfpclasssd` we see a **40% decrease** in performance for `isFinite()` compared to the original Java code. Below is the code which implements the intrinsic using SSE.
> 
> Is it OK to skip support for **non** `vfpclasssd` for 32-bit?
> 
> 
> void C2_MacroAssembler::double_class_check_sse(int opcode, XMMRegister src, Register dst, Register temp, Register temp1) {
>   int32_t POS_INF_HI = 0x7ff00000; // hi 32bits
>   int32_t KILL_SIGN_MASK_HI = 0x7fffffff; // hi 32 bits
> 
>   pshuflw(src, src, 0x4e); //switch hi to lo
>   movdl(temp, src);
>   movl(temp1, KILL_SIGN_MASK_HI);
>   andl(temp, temp1);
>   movl(temp1, POS_INF_HI);
>   cmpl(temp, temp1);
>   switch (opcode) {
>     case Op_IsFiniteD:
>       setb(Assembler::below, dst);
>       break;
>     case Op_IsInfiniteD:
>       setb(Assembler::equal, dst);
>       break;
>     case Op_IsNaND:
>       setb(Assembler::above, dst);
>       break;
>     default:
>       assert(false, "%s", NodeClassNames[opcode]);
>   }
>   andl(dst, 0xff);
> }

@vamsi-parasa I modified your benchmark to emulate more use cases of these functions and ran it on the baseline, on #8525 with a modified `isInfinite` (using `Math.abs(v) > MAX_VALUE` instead), and on this patch. The results are as follows; the source code and the assembly for the interesting parts are shown further below.

                                                       Baseline         #8459           #8525
    Benchmark                             Mode  Cnt  Score   Error  Score   Error   Score   Error  Units
    FloatClassCheck.testIsFiniteBranch    avgt    5  2.522 ± 0.094  2.564 ± 0.187   2.512 ± 0.137  ns/op
    FloatClassCheck.testIsFiniteCMov      avgt    5  0.479 ± 0.014  0.786 ± 0.009   0.475 ± 0.005  ns/op
    FloatClassCheck.testIsFiniteStore     avgt    5  0.482 ± 0.010  0.603 ± 0.026   0.480 ± 0.006  ns/op
    FloatClassCheck.testIsInfiniteBranch  avgt    5  1.921 ± 0.043  1.778 ± 0.023   1.767 ± 0.039  ns/op
    FloatClassCheck.testIsInfiniteCMov    avgt    5  1.124 ± 0.045  0.787 ± 0.013   0.622 ± 0.019  ns/op
    FloatClassCheck.testIsInfiniteStore   avgt    5  1.195 ± 0.033  0.602 ± 0.015   0.625 ± 0.033  ns/op
    FloatClassCheck.testIsNaNBranch       avgt    5  1.896 ± 0.182  2.097 ± 0.216   1.725 ± 0.222  ns/op
    FloatClassCheck.testIsNaNCMov         avgt    5  2.956 ± 0.021  0.856 ± 0.003   0.390 ± 0.006  ns/op
    FloatClassCheck.testIsNaNStore        avgt    5  3.024 ± 0.071  0.741 ± 0.139   0.410 ± 0.008  ns/op

                                                        Baseline         #8459           #8525
    Benchmark                              Mode  Cnt  Score   Error  Score   Error  Score   Error  Units
    DoubleClassCheck.testIsFiniteBranch    avgt    5  2.566 ± 0.105  3.023 ± 0.117  2.603 ± 0.137  ns/op
    DoubleClassCheck.testIsFiniteCMov      avgt    5  0.481 ± 0.010  0.978 ± 0.011  0.485 ± 0.018  ns/op
    DoubleClassCheck.testIsFiniteStore     avgt    5  0.480 ± 0.012  0.943 ± 0.012  0.486 ± 0.011  ns/op
    DoubleClassCheck.testIsInfiniteBranch  avgt    5  1.907 ± 0.081  1.917 ± 0.065  1.808 ± 0.039  ns/op
    DoubleClassCheck.testIsInfiniteCMov    avgt    5  1.111 ± 0.028  0.982 ± 0.019  0.630 ± 0.017  ns/op
    DoubleClassCheck.testIsInfiniteStore   avgt    5  1.134 ± 0.011  0.944 ± 0.017  0.630 ± 0.009  ns/op
    DoubleClassCheck.testIsNaNBranch       avgt    5  1.926 ± 0.218  2.193 ± 0.045  1.767 ± 0.142  ns/op
    DoubleClassCheck.testIsNaNCMov         avgt    5  2.944 ± 0.020  1.047 ± 0.012  0.392 ± 0.009  ns/op
    DoubleClassCheck.testIsNaNStore        avgt    5  3.011 ± 0.065  0.946 ± 0.029  0.411 ± 0.004  ns/op

The source code for `FloatClassCheck` is shown below; that of `DoubleClassCheck` is similar.

    RandomGenerator rng;
    static final int BUFFER_SIZE = 1024;
    float[] inputs;
    boolean[] storeOutputs;
    int[] cmovOutputs;
    int[] branchOutputs;

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    static int call() {
        return 1;
    }

    @Setup
    public void setup() {
        storeOutputs = new boolean[BUFFER_SIZE];
        cmovOutputs = new int[BUFFER_SIZE];
        branchOutputs = new int[BUFFER_SIZE];
        inputs = new float[BUFFER_SIZE];
        RandomGenerator rng = RandomGeneratorFactory.getDefault().create(0);
        float input;
        for (int i = 0; i < BUFFER_SIZE; i++) {
            if (i % 5 == 0) {
                input = (i%2 == 0) ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
            }
            else if (i % 3 == 0) input = Float.NaN;
            else input = rng.nextFloat();
            inputs[i] = input;
        }
    }

    @Benchmark
    @OperationsPerInvocation(BUFFER_SIZE)
    public void testIsFiniteStore() {
        for (int i = 0; i < BUFFER_SIZE; i++) {
            storeOutputs[i] = Float.isFinite(inputs[i]);
        }
    }

    @Benchmark
    @OperationsPerInvocation(BUFFER_SIZE)
    public void testIsInfiniteStore() {
        for (int i = 0; i < BUFFER_SIZE; i++) {
            storeOutputs[i] = Float.isInfinite(inputs[i]);
        }
    }

    @Benchmark
    @OperationsPerInvocation(BUFFER_SIZE)
    public void testIsNaNStore() {
        for (int i = 0; i < BUFFER_SIZE; i++) {
            storeOutputs[i] = Float.isNaN(inputs[i]);
        }
    }

    @Benchmark
    @OperationsPerInvocation(BUFFER_SIZE)
    public void testIsFiniteCMov() {
        for (int i = 0; i < BUFFER_SIZE; i++) {
            cmovOutputs[i] = Float.isFinite(inputs[i]) ? 9 : 7;
        }
    }

    @Benchmark
    @OperationsPerInvocation(BUFFER_SIZE)
    public void testIsInfiniteCMov() {
        for (int i = 0; i < BUFFER_SIZE; i++) {
            cmovOutputs[i] = Float.isInfinite(inputs[i]) ? 9 : 7;
        }
    }

    @Benchmark
    @OperationsPerInvocation(BUFFER_SIZE)
    public void testIsNaNCMov() {
        for (int i = 0; i < BUFFER_SIZE; i++) {
            cmovOutputs[i] = Float.isNaN(inputs[i]) ? 9 : 7;
        }
    }

    @Benchmark
    @OperationsPerInvocation(BUFFER_SIZE)
    public void testIsFiniteBranch() {
        for (int i = 0; i < BUFFER_SIZE; i++) {
            cmovOutputs[i] = Float.isFinite(inputs[i]) ? call() : 7;
        }
    }

    @Benchmark
    @OperationsPerInvocation(BUFFER_SIZE)
    public void testIsInfiniteBranch() {
        for (int i = 0; i < BUFFER_SIZE; i++) {
            cmovOutputs[i] = Float.isInfinite(inputs[i]) ? call() : 7;
        }
    }

    @Benchmark
    @OperationsPerInvocation(BUFFER_SIZE)
    public void testIsNaNBranch() {
        for (int i = 0; i < BUFFER_SIZE; i++) {
            cmovOutputs[i] = Float.isNaN(inputs[i]) ? call() : 7;
        }
    }

The assembly of the interesting parts of the executions:

        FloatClassCheck::testIsFiniteBranch:
    Baseline, #8525:
    vandps  -0xd4791(%rip), %xmm0, %xmm1
    vucomiss        %xmm1, %xmm2
    jae     -0x77

    #8459
    vmovd   %xmm0, %r11d
    andl    $0x7fffffff, %r11d
    cmpl    $0x7f800000, %r11d
    setb    %r10b
    andl    $0xff, %r10d
    testl   %r10d, %r10d
    jne     -0x90

        FloatClassCheck::testIsFiniteCMov:
    Baseline, #8525:
    vandps  -0xcdcf6(%rip), %xmm6, %xmm7
    vucomiss        %xmm7, %xmm10
    movl    $0x9, %r10d
    cmovbl  %r14d, %r10d

    #8459:
    vmovd   %xmm4, %r9d
    andl    $0x7fffffff, %r9d
    cmpl    $0x7f800000, %r9d
    setb    %r8b
    andl    $0xff, %r8d
    testl   %r8d, %r8d
    movl    $0x7, %ebx
    cmovnel %r14d, %ebx

        FloatClassCheck::isFiniteStore:
    Baseline, #8525:
    vandps  -0xcfd74(%rip), %xmm3, %xmm3
    movl    $0x1, %r10d
    vucomiss        %xmm3, %xmm0
    cmovbl  %r9d, %r10d

    #8459:
    vmovd   %xmm6, %edi
    andl    $0x7fffffff, %edi
    cmpl    $0x7f800000, %edi
    setb    %dil
    andl    $0xff, %edi

        FloatClassCheck::isInfiniteBranch:
    Baseline:
    vucomiss        -0xc8(%rip), %xmm1
    jp      0x2
    je      0x20
    vucomiss        -0xd0(%rip), %xmm1
    nopl    (%rax,%rax)
    nop
    jp      -0x86
    jne     -0x8c

    #8459:
    vmovd   %xmm1, %r10d
    andl    $0x7fffffff, %r10d
    cmpl    $0x7f800000, %r10d
    sete    %r11b
    andl    $0xff, %r11d
    testl   %r11d, %r11d
    je      -0x87

    #8525:
    vandps  -0xce478(%rip), %xmm1, %xmm0
    nopl    (%rax,%rax)
    vucomiss        -0xc8(%rip), %xmm0
    jbe     -0x76

        FloatClassCheck::isInfiniteCMov:
    Baseline:
    vucomiss        -0x128(%rip), %xmm1
    jp      0x2
    je      0x16
    vucomiss        -0x130(%rip), %xmm1
    jp      0x2
    je      0xa

    #8459:
    vmovd   %xmm5, %eax
    andl    $0x7fffffff, %eax
    cmpl    $0x7f800000, %eax
    sete    %bpl
    andl    $0xff, %ebp
    testl   %ebp, %ebp
    movl    $0x7, %eax
    cmovnel %ebx, %eax

    #8525:
    vandps  -0xcefc3(%rip), %xmm0, %xmm0
    vucomiss        -0x12b(%rip), %xmm0
    movl    $0x9, %esi
    cmovbel %r8d, %esi

        FloatClassCheck::isInfiniteStore:
    Baseline:
    vucomiss        -0x128(%rip), %xmm0
    jp      0x2
    je      0x11
    vucomiss        -0x130(%rip), %xmm0
    jp      0x2
    je      0x5

    #8459:
    vmovd   %xmm2, %r8d
    andl    $0x7fffffff, %r8d
    cmpl    $0x7f800000, %r8d
    sete    %r8b
    andl    $0xff, %r8d

    #8525:
    vandps  -0xcf2b9(%rip), %xmm0, %xmm0
    vucomiss        -0x121(%rip), %xmm0
    movl    $0x1, %r11d
    cmovbel %esi, %r11d

        FloatClassCheck::isNaNBranch:
    Baseline:
    vucomiss        %xmm0, %xmm0
    jp      0x2
    je      -0x64

    #8459:
    vmovd   %xmm1, %r10d
    andl    $0x7fffffff, %r10d
    cmpl    $0x7f800000, %r10d
    seta    %r11b
    andl    $0xff, %r11d
    testl   %r11d, %r11d
    je      -0x87

    #8525:
    vucomiss        %xmm1, %xmm1
    jnp     -0x62

        FloatClassCheck::isNaNCMov:
    Baseline:
    vucomiss        %xmm5, %xmm5
    jnp     0xa
    pushfq
    andq    $-0xd5, (%rsp)
    popfq
    movl    $0x7, %r9d
    cmovnel %r8d, %r9d

    #8459:
    vmovd   %xmm4, %ebp
    andl    $0x7fffffff, %ebp
    cmpl    $0x7f800000, %ebp
    seta    %al
    andl    $0xff, %eax
    testl   %eax, %eax
    movl    $0x7, %ebp
    cmovnel %ebx, %ebp

    #8525:
    vucomiss        %xmm4, %xmm4
    movl    $0x7, %r9d
    cmovpl  %r8d, %r9d

        FloatClassCheck::isNaNStore:
    Baseline:
    vucomiss        %xmm3, %xmm3
    jnp     0xa
    pushfq
    andq    $-0xd5, (%rsp)
    popfq
    movl    $0x1, %ebx
    cmovel  %eax, %ebx

    #8459:
    vmovd   %xmm6, %edi
    andl    $0x7fffffff, %edi
    cmpl    $0x7f800000, %edi
    seta    %dil
    andl    $0xff, %edi

    #8525:
    movl    $0x1, %r9d
    vucomiss        %xmm0, %xmm0
    cmovnpl %r10d, %r9d

The assembly output for `DoubleClassCheck` is similar. Thanks.

-------------

PR: https://git.openjdk.java.net/jdk/pull/8459