RFR: 8282966: AArch64: Optimize VectorMask.toLong with SVE2

Thu Apr 21 12:24:46 UTC 2022

This patch optimizes the backend implementation of VectorMaskToLong for
AArch64 by using a more efficient approach to move the mask bits from a
predicate register to a general-purpose register, as x86 PMOVMSK[1]
does. It relies on BEXT[2], which is available in SVE2.

With this patch, the final code (the input mask is of byte type with
SPECIES_512, generated on a QEMU emulator with a 512-bit SVE vector
register size) changes as below:

Before:

        mov     z16.b, p0/z, #1
        fmov    x0, d16
        orr     x0, x0, x0, lsr #7
        orr     x0, x0, x0, lsr #14
        orr     x0, x0, x0, lsr #28
        and     x0, x0, #0xff
        fmov    x8, v16.d[1]
        orr     x8, x8, x8, lsr #7
        orr     x8, x8, x8, lsr #14
        orr     x8, x8, x8, lsr #28
        and     x8, x8, #0xff
        orr     x0, x0, x8, lsl #8

        orr     x8, xzr, #0x2
        whilele p1.d, xzr, x8
        lastb   x8, p1, z16.d
        orr     x8, x8, x8, lsr #7
        orr     x8, x8, x8, lsr #14
        orr     x8, x8, x8, lsr #28
        and     x8, x8, #0xff
        orr     x0, x0, x8, lsl #16

        orr     x8, xzr, #0x3
        whilele p1.d, xzr, x8
        lastb   x8, p1, z16.d
        orr     x8, x8, x8, lsr #7
        orr     x8, x8, x8, lsr #14
        orr     x8, x8, x8, lsr #28
        and     x8, x8, #0xff
        orr     x0, x0, x8, lsl #24

        orr     x8, xzr, #0x4
        whilele p1.d, xzr, x8
        lastb   x8, p1, z16.d
        orr     x8, x8, x8, lsr #7
        orr     x8, x8, x8, lsr #14
        orr     x8, x8, x8, lsr #28
        and     x8, x8, #0xff
        orr     x0, x0, x8, lsl #32

        mov     x8, #0x5
        whilele p1.d, xzr, x8
        lastb   x8, p1, z16.d
        orr     x8, x8, x8, lsr #7
        orr     x8, x8, x8, lsr #14
        orr     x8, x8, x8, lsr #28
        and     x8, x8, #0xff
        orr     x0, x0, x8, lsl #40

        orr     x8, xzr, #0x6
        whilele p1.d, xzr, x8
        lastb   x8, p1, z16.d
        orr     x8, x8, x8, lsr #7
        orr     x8, x8, x8, lsr #14
        orr     x8, x8, x8, lsr #28
        and     x8, x8, #0xff
        orr     x0, x0, x8, lsl #48

        orr     x8, xzr, #0x7
        whilele p1.d, xzr, x8
        lastb   x8, p1, z16.d
        orr     x8, x8, x8, lsr #7
        orr     x8, x8, x8, lsr #14
        orr     x8, x8, x8, lsr #28
        and     x8, x8, #0xff
        orr     x0, x0, x8, lsl #56

After:

        mov     z16.b, p0/z, #1
        mov     z17.b, #1
        bext    z16.d, z16.d, z17.d
        mov     z17.d, #0
        uzp1    z16.s, z16.s, z17.s
        uzp1    z16.h, z16.h, z17.h
        uzp1    z16.b, z16.b, z17.b
        mov     x0, v16.d[0]

[1] https://www.felixcloutier.com/x86/pmovmskb
[2] https://developer.arm.com/documentation/ddi0602/2020-12/SVE-Instructions/BEXT--Gather-lower-bits-from-positions-selected-by-bitmask-

-------------

Commit messages:
 - 8282966: AArch64: Optimize VectorMask.toLong with SVE2

Changes: https://git.openjdk.java.net/jdk/pull/8337/files
 Webrev: https://webrevs.openjdk.java.net/?repo=jdk&pr=8337&range=00
  Issue: https://bugs.openjdk.java.net/browse/JDK-8282966
  Stats: 144 lines in 7 files changed: 102 ins; 3 del; 39 mod
  Patch: https://git.openjdk.java.net/jdk/pull/8337.diff
  Fetch: git fetch https://git.openjdk.java.net/jdk pull/8337/head:pull/8337

PR: https://git.openjdk.java.net/jdk/pull/8337