RFR: 8305895: Implement JEP 450: Compact Object Headers (Experimental) [v19]

Mon Nov 18 16:20:34 UTC 2024

On Mon, 18 Nov 2024 15:20:17 GMT, Quan Anh Mai <qamai at openjdk.org> wrote:

>> @merykitty I guess we can always use [vmovdqu](https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64).
>> 
>> And in fact that is exactly what we do:
>> 
>> public class Test {
>>     static int RANGE = 1024*1024;
>> 
>>     public static void main(String[] args) {
>>         byte[] aB = new byte[RANGE];
>>         byte[] bB = new byte[RANGE];
>>         for (int i = 0; i < 100_000; i++) {
>>             test1(aB, bB);
>>         }
>>     }
>> 
>>     static void test1(byte[] a, byte[] b) {                                    
>>         for (int i = 0; i < RANGE; i++) {                                                    
>>             a[i] = b[i];
>>         }                                                                                     
>>     }
>> }
>> 
>> `../java -XX:CompileCommand=compileonly,Test::test* -XX:CompileCommand=printcompilation,Test::test* -XX:+TraceLoopOpts -XX:-TraceSuperWord -XX:+TraceNewVectors -Xbatch -XX:+AlignVector -XX:CompileCommand=compileonly,Test::test* -XX:CompileCommand=printassembly,Test::test* Test.java`
>> 
>> 
>>  ;; B20: #      out( B20 B21 ) <- in( B19 B20 ) Loop( B20-B20 inner main of N178 strip mined) Freq: 8.13586e+09
>>   0x00007fc3a4bb0780:   movslq %ebx,%rdi
>>   0x00007fc3a4bb0783:   movslq %ebx,%r14
>>   0x00007fc3a4bb0786:   vmovdqu32 0x10(%r13,%r14,1),%zmm1
>>   0x00007fc3a4bb0791:   vmovdqu32 %zmm1,0x10(%r9,%r14,1)
>>   0x00007fc3a4bb079c:   vmovdqu32 0x50(%r13,%rdi,1),%zmm1
>>   0x00007fc3a4bb07a7:   vmovdqu32 %zmm1,0x50(%r9,%rdi,1)
>>   0x00007fc3a4bb07b2:   vmovdqu32 0x90(%r13,%rdi,1),%zmm1
>>   0x00007fc3a4bb07bd:   vmovdqu32 %zmm1,0x90(%r9,%rdi,1)
>>   0x00007fc3a4bb07c8:   vmovdqu32 0xd0(%r13,%rdi,1),%zmm1
>>   0x00007fc3a4bb07d3:   vmovdqu32 %zmm1,0xd0(%r9,%rdi,1)
>>   0x00007fc3a4bb07de:   vmovdqu32 0x110(%r13,%rdi,1),%zmm1
>>   0x00007fc3a4bb07e9:   vmovdqu32 %zmm1,0x110(%r9,%rdi,1)
>>   0x00007fc3a4bb07f4:   vmovdqu32 0x150(%r13,%rdi,1),%zmm1
>>   0x00007fc3a4bb07ff:   vmovdqu32 %zmm1,0x150(%r9,%rdi,1)
>>   0x00007fc3a4bb080a:   vmovdqu32 0x190(%r13,%rdi,1),%zmm1
>>   0x00007fc3a4bb0815:   vmovdqu32 %zmm1,0x190(%r9,%rdi,1)
>>   0x00007fc3a4bb0820:   vmovdqu32 0x1d0(%r13,%rdi,1),%zmm1
>>   0x00007fc3a4bb082b:   vmovdqu32 %zmm1,0x1d0(%r9,%rdi,1)   ;*bastore {reexecute=0 rethrow=0 return_oop=0}
>>                                                             ; - Test::test1 at 14 (line 14)
>>   0x00007fc3a4bb0836:   add    $0x200,%ebx                  ;*iinc {reexecute=0 rethrow=0 return_oop=0}
>>                     ...
>
> @eme64 What I mean here is that `AlignVector` seems useless because the accesses are going to be misaligned either way.

@merykitty FYI:

`src/hotspot/share/opto/vectorization.hpp:  static bool vectors_should_be_aligned() { return !Matcher::misaligned_vectors_ok() || AlignVector; }`

The relevant code:

src/hotspot/cpu/x86/matcher_x86.hpp:  static constexpr bool misaligned_vectors_ok() {
  // x86 supports misaligned vectors store/load.
  static constexpr bool misaligned_vectors_ok() {
    return true;
  }

src/hotspot/cpu/ppc/matcher_ppc.hpp:  static constexpr bool misaligned_vectors_ok() {
  // PPC implementation uses VSX load/store instructions (if
  // SuperwordUseVSX) which support 4 byte but not arbitrary alignment
  static constexpr bool misaligned_vectors_ok() {
    return false;
  }

src/hotspot/cpu/aarch64/matcher_aarch64.hpp:  static constexpr bool misaligned_vectors_ok() {
  // aarch64 supports misaligned vectors store/load.
  static constexpr bool misaligned_vectors_ok() {
    return true;
  }

src/hotspot/cpu/s390/matcher_s390.hpp:  static constexpr bool misaligned_vectors_ok() {
  // z/Architecture does support misaligned store/load at minimal extra cost.
  static constexpr bool misaligned_vectors_ok() {
    return true;
  }

src/hotspot/cpu/arm/matcher_arm.hpp:  static constexpr bool misaligned_vectors_ok() {
  // ARM doesn't support misaligned vectors store/load.
  static constexpr bool misaligned_vectors_ok() {
    return false;
  }

src/hotspot/cpu/riscv/matcher_riscv.hpp:  static constexpr bool misaligned_vectors_ok() {
  // riscv supports misaligned vectors store/load.
  static constexpr bool misaligned_vectors_ok() {
    return true;
  }

We can see that only PPC and ARM32 have such strict alignment requirements. It turns out that PPC only needs 4-byte alignment, and ARM32 is fine with 8-byte alignment. So none of our platforms strictly requires full vector-width alignment.

-------------

PR Comment: https://git.openjdk.org/jdk/pull/20677#issuecomment-2483505834