RFR: 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug" [v4]

Wed Jul 3 14:48:24 UTC 2024

On Tue, 2 Jul 2024 10:16:58 GMT, Andrew Haley <aph at openjdk.org> wrote:

>> Fei Gao has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains seven additional commits since the last revision:
>> 
>>  - Discard IndOffXX style and let legitimize_address() fix any out-of-range immediate offsets
>>  - Merge branch 'master' into fg8319690
>>  - Add the assertion back and merge matchrules with a better predicate
>>  - Merge branch 'master' into fg8319690
>>  - Remove unused immIOffset/immLOffset
>>  - Merge branch 'master' into fg8319690
>>  - 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug"
>>    
>>    On LP64 systems, if the heap can be moved into low virtual
>>    address space (below 4GB) and the heap size is smaller than the
>>    interesting threshold of 4 GB, we can use unscaled decoding
>>    pattern for narrow klass decoding. It means that a generic field
>>    reference can be decoded by:
>>    ```
>>    cast<64> (32-bit compressed reference) + field_offset
>>    ```
>>    
>>    When the `field_offset` is an immediate, on aarch64 platform, the
>>    unscaled decoding pattern can match perfectly with a direct
>>    addressing mode, i.e., `base_plus_offset`, supported by LDR/STR
>>    instructions. But for certain data width, not all immediates can
>>    be encoded in the instruction field of LDR/STR[1]. The ranges are
>>    different as data widths vary.
>>    
>>    For example, when we try to load a value of long type at offset of
>>    `1030`, the address expression is `(AddP (DecodeN base) 1030)`.
>>    Before the patch, the expression was matching with
>>    `operand indOffIN()`. But, for 64-bit LDR/STR, signed immediate
>>    byte offset must be in the range -256 to 255 or positive immediate
>>    byte offset must be a multiple of 8 in the range 0 to 32760[2].
>>    `1030` can't be encoded in the instruction field. So, after
>>    matching, when we do checking for instruction encoding, the
>>    assertion would fail.
>>    
>>    In this patch, we're going to filter out invalid immediates
>>    when deciding if current addressing mode can be matched as
>>    `base_plus_offset`. We introduce `indOffIN4/indOffLN4` and
>>    `indOffIN8/indOffLN8` for 32-bit data type and 64-bit data
>>    type separately in the patch. E.g., for `memory4`, we remove
>>    the generic `indOffIN/indOffLN`, which matches wrong unscaled
>>    immediate range, and replace them with `indOffIN4/indOffLN4`
>>    instead.
> ...
>
> I think this fixes it. `Unsafe` intrinsifies direct memory access using a `long` as the base address, and it seems C2 needs to be told that the conversion from long->ptr is a nop. Having said that, I'm not sure why this PR causes the extra register copy.
> 
> 
> diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
> index 985784c70fa..ed8cf0c698b 100644
> --- a/src/hotspot/cpu/aarch64/aarch64.ad
> +++ b/src/hotspot/cpu/aarch64/aarch64.ad
> @@ -5134,6 +5134,20 @@ operand indOffL(iRegP reg, immLOffset off)
>    %}
>  %}
>  
> +operand indOffX2P(iRegL reg, immLOffset off)
> +%{
> +  constraint(ALLOC_IN_RC(ptr_reg));
> +  match(AddP (CastX2P reg) off);
> +  op_cost(0);
> +  format %{ "[$reg, $off]" %}
> +  interface(MEMORY_INTER) %{
> +    base($reg);
> +    index(0xffffffff);
> +    scale(0x0);
> +    disp($off);
> +  %}
> +%}
> +
>  operand indirectN(iRegN reg)
>  %{
>    predicate(CompressedOops::shift() == 0);
> @@ -5469,7 +5483,7 @@ opclass vmem(indirect, indIndex, indOffI, indOffL, indOffIN, indOffLN);
>  // memory is used to define read/write location for load/store
>  // instruction defs. we can turn a memory op into an Address
>  
> -opclass memory(indirect, indIndexScaled, indIndexScaledI2L, indIndexI2L, indIndex, indOffI, indOffL,
> +opclass memory(indirect, indIndexScaled, indIndexScaledI2L, indIndexI2L, indIndex, indOffI, indOffL, indOffX2P,
>                 indirectN, indIndexScaledN, indIndexScaledI2LN, indIndexI2LN, indIndexN, indOffIN, indOffLN);

Hi @theRealAph , thanks for your work and kind explanation!

I tried to write a small case to reproduce the problem you found, with `java -XX:-TieredCompilation -XX:CompileCommand=compileonly,TestUnsafe*::"<clinit>" --add-exports java.base/jdk.internal.misc=ALL-UNNAMED  -XX:+PrintAssembly TestUnsafe`

import jdk.internal.misc.Unsafe;

public class TestUnsafe {

    public static final int LEN1 = 2040;
    public static final int LEN2 = 1020;

    static final Unsafe UNSAFE = Unsafe.getUnsafe();

    public static long lseed = 1;
    public static long lres = lseed;
    public static long off1 = 16;
    public static long off2 = 16;

    public static class TestLong {
        private static long address = UNSAFE.allocateMemory(LEN1);
        private static long heater = UNSAFE.allocateMemory(LEN2);

        static {
            for (int k = 0; k < 10_000_000; k++) {
                for (int i = 0; i < LEN2; i++) {
                    UNSAFE.putLong(heater+i, lseed);
                }
            }

            UNSAFE.putLong(address + off1 + 1030, lseed);
            UNSAFE.putLong(address + 1023, lseed);
            UNSAFE.putLong(address + off2 + 1001, lseed);
        }
    }

    static void test() {
        TestLong ta = new TestLong();
        lres = UNSAFE.getLong(ta.address + 1023);
        System.out.println(lres);
        lres = UNSAFE.getLong(ta.address + 127);
        System.out.println(lres);
    }

    public static void main(String[] strArr) {
        test();
    }
}

The existing code has the following OptoAssembly:

240 +   ldr  R10, [R15, #120]	# int ! Field: TestUnsafe$TestLong.address
244 +   ldr  R11, [R16, #136]	# int ! Field: TestUnsafe.off1
248 +   ldr  R12, [R16, #144]	# int ! Field: TestUnsafe.off2
248 +   add  R11, R11, R10
24c +   mov R11, R11	# long -> ptr
24c     add  R12, R12, R10
250 +   mov R10, R10	# long -> ptr
250     add R11, R11, #1030	# ptr
254 +   str  R17, [R11]	# int
258     add R10, R10, #1023	# ptr
25c +   str  R17, [R10]	# int
260     mov R10, R12	# long -> ptr
264 +   add R10, R10, #1001	# ptr
268     str  R17, [R10]	# int

Because we do nothing for `mov dst src` when `dst == src`, then we have assembly:

ldr	x10, [x15,#120]
ldp	x11, x12, [x16,#136]
add	x11, x11, x10
add	x12, x12, x10
add	x11, x11, #0x406
str	x17, [x11]
add	x10, x10, #0x3ff
str	x17, [x10]
mov	x10, x12
add	x10, x10, #0x3e9
str	x17, [x10]

We can still see the extra register copy. So, I guess it is not caused by this PR and it does exist before? Maybe, as you said, C2 needs to be told that the conversion from long->ptr is a nop. 

After applying the PR, we have:

240 +   ldr  R10, [R15, #120]	# int ! Field: TestUnsafe$TestLong.address
244 +   ldr  R11, [R16, #136]	# int ! Field: TestUnsafe.off1
248 +   add  R11, R11, R10
24c     ldr  R12, [R16, #144]	# int ! Field: TestUnsafe.off2
250 +   mov R11, R11	# long -> ptr
250 +   str  R17, [R11, #1030]	# int
258     add  R11, R12, R10
25c +   mov R10, R10	# long -> ptr
25c     str  R17, [R10, #1023]	# int
264 +   mov R10, R11	# long -> ptr
268     str  R17, [R10, #1001]	# int

After applying the PR with your fix, we have:

240 +   ldr  R10, [R15, #120]	# int ! Field: TestUnsafe$TestLong.address
244 +   ldr  R11, [R16, #136]	# int ! Field: TestUnsafe.off1
248 +   add  R11, R11, R10
24c     str  R17, [R11, #1030]	# int
254 +   ldr  R12, [R16, #144]	# int ! Field: TestUnsafe.off2
258 +   str  R17, [R10, #1023]	# int
260 +   add  R10, R12, R10
264     str  R17, [R10, #1001]	# int

I'll update the PR with your fix soon. Thanks!

-------------

PR Comment: https://git.openjdk.org/jdk/pull/16991#issuecomment-2206368740