RFR: 8282966: AArch64: Optimize VectorMask.toLong with SVE2 [v2]
Nick Gasson
ngasson at openjdk.java.net
Wed May 11 10:03:47 UTC 2022
On Thu, 5 May 2022 01:31:52 GMT, Eric Liu <eliu at openjdk.org> wrote:
>> This patch optimizes the backend implementation of VectorMaskToLong for
>> AArch64, using a more efficient approach to move mask value bits from
>> a predicate register to a general-purpose register, as x86 PMOVMSK[1]
>> does, by using BEXT[2], which is available in SVE2.
>>
>> With this patch, the final code (input mask is byte type with
>> SPECIES_512, generated on a QEMU emulator with a 512-bit SVE vector
>> register size) changes as below:
>>
>> Before:
>>
>> mov z16.b, p0/z, #1
>> fmov x0, d16
>> orr x0, x0, x0, lsr #7
>> orr x0, x0, x0, lsr #14
>> orr x0, x0, x0, lsr #28
>> and x0, x0, #0xff
>> fmov x8, v16.d[1]
>> orr x8, x8, x8, lsr #7
>> orr x8, x8, x8, lsr #14
>> orr x8, x8, x8, lsr #28
>> and x8, x8, #0xff
>> orr x0, x0, x8, lsl #8
>>
>> orr x8, xzr, #0x2
>> whilele p1.d, xzr, x8
>> lastb x8, p1, z16.d
>> orr x8, x8, x8, lsr #7
>> orr x8, x8, x8, lsr #14
>> orr x8, x8, x8, lsr #28
>> and x8, x8, #0xff
>> orr x0, x0, x8, lsl #16
>>
>> orr x8, xzr, #0x3
>> whilele p1.d, xzr, x8
>> lastb x8, p1, z16.d
>> orr x8, x8, x8, lsr #7
>> orr x8, x8, x8, lsr #14
>> orr x8, x8, x8, lsr #28
>> and x8, x8, #0xff
>> orr x0, x0, x8, lsl #24
>>
>> orr x8, xzr, #0x4
>> whilele p1.d, xzr, x8
>> lastb x8, p1, z16.d
>> orr x8, x8, x8, lsr #7
>> orr x8, x8, x8, lsr #14
>> orr x8, x8, x8, lsr #28
>> and x8, x8, #0xff
>> orr x0, x0, x8, lsl #32
>>
>> mov x8, #0x5
>> whilele p1.d, xzr, x8
>> lastb x8, p1, z16.d
>> orr x8, x8, x8, lsr #7
>> orr x8, x8, x8, lsr #14
>> orr x8, x8, x8, lsr #28
>> and x8, x8, #0xff
>> orr x0, x0, x8, lsl #40
>>
>> orr x8, xzr, #0x6
>> whilele p1.d, xzr, x8
>> lastb x8, p1, z16.d
>> orr x8, x8, x8, lsr #7
>> orr x8, x8, x8, lsr #14
>> orr x8, x8, x8, lsr #28
>> and x8, x8, #0xff
>> orr x0, x0, x8, lsl #48
>>
>> orr x8, xzr, #0x7
>> whilele p1.d, xzr, x8
>> lastb x8, p1, z16.d
>> orr x8, x8, x8, lsr #7
>> orr x8, x8, x8, lsr #14
>> orr x8, x8, x8, lsr #28
>> and x8, x8, #0xff
>> orr x0, x0, x8, lsl #56
>>
>> After:
>>
>> mov z16.b, p0/z, #1
>> mov z17.b, #1
>> bext z16.d, z16.d, z17.d
>> mov z17.d, #0
>> uzp1 z16.s, z16.s, z17.s
>> uzp1 z16.h, z16.h, z17.h
>> uzp1 z16.b, z16.b, z17.b
>> mov x0, v16.d[0]
>>
>> [1] https://www.felixcloutier.com/x86/pmovmskb
>> [2] https://developer.arm.com/documentation/ddi0602/2020-12/SVE-Instructions/BEXT--Gather-lower-bits-from-positions-selected-by-bitmask-
>
> Eric Liu has updated the pull request with a new target base due to a merge or a rebase. The pull request now contains two commits:
>
> - Merge jdk:master
>
> Change-Id: Ifa60f3b79513c22dbf932f1da623289687bc1070
> - 8282966: AArch64: Optimize VectorMask.toLong with SVE2
>
> This patch optimizes the backend implementation of VectorMaskToLong for
> AArch64, using a more efficient approach to move mask value bits from
> a predicate register to a general-purpose register, as x86 PMOVMSK[1]
> does, by using BEXT[2], which is available in SVE2.
>
> With this patch, the final code (input mask is byte type with
> SPECIES_512, generated on a QEMU emulator with a 512-bit SVE vector
> register size) changes as below:
>
> Before:
>
> mov z16.b, p0/z, #1
> fmov x0, d16
> orr x0, x0, x0, lsr #7
> orr x0, x0, x0, lsr #14
> orr x0, x0, x0, lsr #28
> and x0, x0, #0xff
> fmov x8, v16.d[1]
> orr x8, x8, x8, lsr #7
> orr x8, x8, x8, lsr #14
> orr x8, x8, x8, lsr #28
> and x8, x8, #0xff
> orr x0, x0, x8, lsl #8
>
> orr x8, xzr, #0x2
> whilele p1.d, xzr, x8
> lastb x8, p1, z16.d
> orr x8, x8, x8, lsr #7
> orr x8, x8, x8, lsr #14
> orr x8, x8, x8, lsr #28
> and x8, x8, #0xff
> orr x0, x0, x8, lsl #16
>
> orr x8, xzr, #0x3
> whilele p1.d, xzr, x8
> lastb x8, p1, z16.d
> orr x8, x8, x8, lsr #7
> orr x8, x8, x8, lsr #14
> orr x8, x8, x8, lsr #28
> and x8, x8, #0xff
> orr x0, x0, x8, lsl #24
>
> orr x8, xzr, #0x4
> whilele p1.d, xzr, x8
> lastb x8, p1, z16.d
> orr x8, x8, x8, lsr #7
> orr x8, x8, x8, lsr #14
> orr x8, x8, x8, lsr #28
> and x8, x8, #0xff
> orr x0, x0, x8, lsl #32
>
> mov x8, #0x5
> whilele p1.d, xzr, x8
> lastb x8, p1, z16.d
> orr x8, x8, x8, lsr #7
> orr x8, x8, x8, lsr #14
> orr x8, x8, x8, lsr #28
> and x8, x8, #0xff
> orr x0, x0, x8, lsl #40
>
> orr x8, xzr, #0x6
> whilele p1.d, xzr, x8
> lastb x8, p1, z16.d
> orr x8, x8, x8, lsr #7
> orr x8, x8, x8, lsr #14
> orr x8, x8, x8, lsr #28
> and x8, x8, #0xff
> orr x0, x0, x8, lsl #48
>
> orr x8, xzr, #0x7
> whilele p1.d, xzr, x8
> lastb x8, p1, z16.d
> orr x8, x8, x8, lsr #7
> orr x8, x8, x8, lsr #14
> orr x8, x8, x8, lsr #28
> and x8, x8, #0xff
> orr x0, x0, x8, lsl #56
>
> After:
>
> mov z16.b, p0/z, #1
> mov z17.b, #1
> bext z16.d, z16.d, z17.d
> mov z17.d, #0
> uzp1 z16.s, z16.s, z17.s
> uzp1 z16.h, z16.h, z17.h
> uzp1 z16.b, z16.b, z17.b
> mov x0, v16.d[0]
>
> [1] https://www.felixcloutier.com/x86/pmovmskb
> [2] https://developer.arm.com/documentation/ddi0602/2020-12/SVE-Instructions/BEXT--Gather-lower-bits-from-positions-selected-by-bitmask-
>
> Change-Id: Ia983a20c89f76403e557ac21328f2f2e05dd08e0
Looks OK to me.
-------------
Marked as reviewed by ngasson (Reviewer).
PR: https://git.openjdk.java.net/jdk/pull/8337
More information about the hotspot-compiler-dev
mailing list