RFR(S): 8200477: Integer dot product no longer autovectorised

Roland Westrelin rwestrel at redhat.com
Mon Apr 23 16:18:57 UTC 2018


> I've tried to get this to work, both on AArch64 and x86, but without
> any success.  No matter what I do I do not get any vector insns.
> Please tell me what you do to run this JMH test and see any vector
> instructions.  An example of what you see would be nice.  Does it
> need AVX, or some special level of SSE?

java -XX:-UseCountedLoopSafepoints -jar target/benchmarks.jar -prof perfasm -wi 5 -i 5 -f 1  org.sample.DotProduct.dotProduct

The command above should work whether or not you have applied the patch
for this RFR.  See below for sample output.

Roland.

Hottest code regions (>10.00% "cycles" events):

....[Hottest Region 1]..............................................................................
c2, level 4, org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub, version 770 (961 bytes) 

            0x00007fb6a0bfee00: cmp    %esi,%r9d
            0x00007fb6a0bfee03: mov    $0x80000000,%r10d
  0.10%     0x00007fb6a0bfee09: cmovl  %r10d,%esi
            0x00007fb6a0bfee0d: cmp    %esi,%ebx
            0x00007fb6a0bfee0f: jge    0x00007fb6a0bfece6
            0x00007fb6a0bfee15: data16 data16 nopw 0x0(%rax,%rax,1)
                                                          ;*iload_2 {reexecute=0 rethrow=0 return_oop=0}
                                                          ; - org.sample.DotProduct::dotProduct at 13 (line 31)
                                                          ; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
         ↗  0x00007fb6a0bfee20: vmovdqu 0x1f0(%r11,%rbx,4),%ymm0
  0.10%  │  0x00007fb6a0bfee2a: vpmulld 0x1f0(%rdi,%rbx,4),%ymm0,%ymm0
  0.83%  │  0x00007fb6a0bfee34: vmovdqu %ymm0,0x28(%rsp)
  0.06%  │  0x00007fb6a0bfee3a: vmovdqu 0x1d0(%r11,%rbx,4),%ymm0
  0.66%  │  0x00007fb6a0bfee44: vpmulld 0x1d0(%rdi,%rbx,4),%ymm0,%ymm0
  0.12%  │  0x00007fb6a0bfee4e: vmovdqu %ymm0,0x48(%rsp)
  0.42%  │  0x00007fb6a0bfee54: vmovdqu 0x1b0(%r11,%rbx,4),%ymm0
         │  0x00007fb6a0bfee5e: vpmulld 0x1b0(%rdi,%rbx,4),%ymm0,%ymm0
  0.83%  │  0x00007fb6a0bfee68: vmovdqu %ymm0,0x68(%rsp)
  0.08%  │  0x00007fb6a0bfee6e: vmovdqu 0x190(%r11,%rbx,4),%ymm0
         │  0x00007fb6a0bfee78: vpmulld 0x190(%rdi,%rbx,4),%ymm0,%ymm0
  1.23%  │  0x00007fb6a0bfee82: vmovdqu %ymm0,0x88(%rsp)
  0.14%  │  0x00007fb6a0bfee8b: vmovdqu 0x170(%r11,%rbx,4),%ymm0
         │  0x00007fb6a0bfee95: vpmulld 0x170(%rdi,%rbx,4),%ymm0,%ymm0
  1.64%  │  0x00007fb6a0bfee9f: vmovdqu %ymm0,0xa8(%rsp)
  0.19%  │  0x00007fb6a0bfeea8: vmovdqu 0x150(%r11,%rbx,4),%ymm0
  0.35%  │  0x00007fb6a0bfeeb2: vpmulld 0x150(%rdi,%rbx,4),%ymm0,%ymm0
  0.25%  │  0x00007fb6a0bfeebc: vmovdqu %ymm0,0xc8(%rsp)
  0.44%  │  0x00007fb6a0bfeec5: vmovdqu 0x130(%r11,%rbx,4),%ymm0
  0.02%  │  0x00007fb6a0bfeecf: vpmulld 0x130(%rdi,%rbx,4),%ymm0,%ymm0
  0.68%  │  0x00007fb6a0bfeed9: vmovdqu %ymm0,0xe8(%rsp)
  0.10%  │  0x00007fb6a0bfeee2: vmovdqu 0x110(%r11,%rbx,4),%ymm0
         │  0x00007fb6a0bfeeec: vpmulld 0x110(%rdi,%rbx,4),%ymm0,%ymm7
  0.66%  │  0x00007fb6a0bfeef6: vmovdqu 0xf0(%r11,%rbx,4),%ymm0
         │  0x00007fb6a0bfef00: vpmulld 0xf0(%rdi,%rbx,4),%ymm0,%ymm3
  0.68%  │  0x00007fb6a0bfef0a: vmovdqu 0x10(%r11,%rbx,4),%ymm0
         │  0x00007fb6a0bfef11: vpmulld 0x10(%rdi,%rbx,4),%ymm0,%ymm15
  2.06%  │  0x00007fb6a0bfef18: vmovdqu 0x30(%r11,%rbx,4),%ymm0
  0.02%  │  0x00007fb6a0bfef1f: vpmulld 0x30(%rdi,%rbx,4),%ymm0,%ymm1
  0.81%  │  0x00007fb6a0bfef26: vmovdqu 0x50(%r11,%rbx,4),%ymm0
         │  0x00007fb6a0bfef2d: vpmulld 0x50(%rdi,%rbx,4),%ymm0,%ymm12
  0.89%  │  0x00007fb6a0bfef34: vmovdqu 0x70(%r11,%rbx,4),%ymm0
         │  0x00007fb6a0bfef3b: vpmulld 0x70(%rdi,%rbx,4),%ymm0,%ymm9
  0.91%  │  0x00007fb6a0bfef42: vmovdqu 0x90(%r11,%rbx,4),%ymm0
         │  0x00007fb6a0bfef4c: vpmulld 0x90(%rdi,%rbx,4),%ymm0,%ymm8
  0.91%  │  0x00007fb6a0bfef56: vmovdqu 0xb0(%r11,%rbx,4),%ymm0
  2.74%  │  0x00007fb6a0bfef60: vpmulld 0xb0(%rdi,%rbx,4),%ymm0,%ymm6
  2.01%  │  0x00007fb6a0bfef6a: vmovdqu 0xd0(%r11,%rbx,4),%ymm0  ;*iaload {reexecute=0 rethrow=0 return_oop=0}
         │                                                ; - org.sample.DotProduct::dotProduct at 34 (line 32)
         │                                                ; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
         │  0x00007fb6a0bfef74: vpmulld 0xd0(%rdi,%rbx,4),%ymm0,%ymm4
  0.85%  │  0x00007fb6a0bfef7e: vphaddd %ymm15,%ymm15,%ymm0
  0.04%  │  0x00007fb6a0bfef83: vphaddd %ymm2,%ymm0,%ymm0
  0.50%  │  0x00007fb6a0bfef88: vextracti128 $0x1,%ymm0,%xmm2
         │  0x00007fb6a0bfef8e: vpaddd %xmm2,%xmm0,%xmm0
  0.27%  │  0x00007fb6a0bfef92: vmovd  %edx,%xmm2
         │  0x00007fb6a0bfef96: vpaddd %xmm0,%xmm2,%xmm2
  0.06%  │  0x00007fb6a0bfef9a: vmovd  %xmm2,%r10d
         │  0x00007fb6a0bfef9f: vphaddd %ymm1,%ymm1,%ymm0
  0.60%  │  0x00007fb6a0bfefa4: vphaddd %ymm2,%ymm0,%ymm0
  0.54%  │  0x00007fb6a0bfefa9: vextracti128 $0x1,%ymm0,%xmm2
  0.83%  │  0x00007fb6a0bfefaf: vpaddd %xmm2,%xmm0,%xmm0
  0.50%  │  0x00007fb6a0bfefb3: vmovd  %r10d,%xmm2
  0.14%  │  0x00007fb6a0bfefb8: vpaddd %xmm0,%xmm2,%xmm2
  0.37%  │  0x00007fb6a0bfefbc: vmovd  %xmm2,%r10d
  0.60%  │  0x00007fb6a0bfefc1: vphaddd %ymm12,%ymm12,%ymm1
  0.17%  │  0x00007fb6a0bfefc6: vphaddd %ymm0,%ymm1,%ymm1
  0.64%  │  0x00007fb6a0bfefcb: vextracti128 $0x1,%ymm1,%xmm0
  1.45%  │  0x00007fb6a0bfefd1: vpaddd %xmm0,%xmm1,%xmm1
  0.50%  │  0x00007fb6a0bfefd5: vmovd  %r10d,%xmm0
         │  0x00007fb6a0bfefda: vpaddd %xmm1,%xmm0,%xmm0
  0.42%  │  0x00007fb6a0bfefde: vmovd  %xmm0,%r10d
  0.50%  │  0x00007fb6a0bfefe3: vphaddd %ymm9,%ymm9,%ymm0
  0.21%  │  0x00007fb6a0bfefe8: vphaddd %ymm2,%ymm0,%ymm0
  0.42%  │  0x00007fb6a0bfefed: vextracti128 $0x1,%ymm0,%xmm2
  0.91%  │  0x00007fb6a0bfeff3: vpaddd %xmm2,%xmm0,%xmm0
  0.25%  │  0x00007fb6a0bfeff7: vmovd  %r10d,%xmm2
  0.08%  │  0x00007fb6a0bfeffc: vpaddd %xmm0,%xmm2,%xmm2
  0.54%  │  0x00007fb6a0bff000: vmovd  %xmm2,%r10d
  0.56%  │  0x00007fb6a0bff005: vphaddd %ymm8,%ymm8,%ymm1
  0.12%  │  0x00007fb6a0bff00a: vphaddd %ymm0,%ymm1,%ymm1
  0.73%  │  0x00007fb6a0bff00f: vextracti128 $0x1,%ymm1,%xmm0
  1.02%  │  0x00007fb6a0bff015: vpaddd %xmm0,%xmm1,%xmm1
  0.56%  │  0x00007fb6a0bff019: vmovd  %r10d,%xmm0
         │  0x00007fb6a0bff01e: vpaddd %xmm1,%xmm0,%xmm0
  0.69%  │  0x00007fb6a0bff022: vmovd  %xmm0,%r10d
  0.35%  │  0x00007fb6a0bff027: vphaddd %ymm6,%ymm6,%ymm0
  0.15%  │  0x00007fb6a0bff02c: vphaddd %ymm2,%ymm0,%ymm0
  0.56%  │  0x00007fb6a0bff031: vextracti128 $0x1,%ymm0,%xmm2
  1.10%  │  0x00007fb6a0bff037: vpaddd %xmm2,%xmm0,%xmm0
  0.58%  │  0x00007fb6a0bff03b: vmovd  %r10d,%xmm2
  0.15%  │  0x00007fb6a0bff040: vpaddd %xmm0,%xmm2,%xmm2
  0.50%  │  0x00007fb6a0bff044: vmovd  %xmm2,%r10d
  0.44%  │  0x00007fb6a0bff049: vphaddd %ymm4,%ymm4,%ymm1
  0.15%  │  0x00007fb6a0bff04e: vphaddd %ymm0,%ymm1,%ymm1
  1.56%  │  0x00007fb6a0bff053: vextracti128 $0x1,%ymm1,%xmm0
  1.29%  │  0x00007fb6a0bff059: vpaddd %xmm0,%xmm1,%xmm1
  0.60%  │  0x00007fb6a0bff05d: vmovd  %r10d,%xmm0
         │  0x00007fb6a0bff062: vpaddd %xmm1,%xmm0,%xmm0
  0.69%  │  0x00007fb6a0bff066: vmovd  %xmm0,%r10d
  0.56%  │  0x00007fb6a0bff06b: vphaddd %ymm3,%ymm3,%ymm0
  0.06%  │  0x00007fb6a0bff070: vphaddd %ymm2,%ymm0,%ymm0
  0.48%  │  0x00007fb6a0bff075: vextracti128 $0x1,%ymm0,%xmm2
  0.95%  │  0x00007fb6a0bff07b: vpaddd %xmm2,%xmm0,%xmm0
  0.66%  │  0x00007fb6a0bff07f: vmovd  %r10d,%xmm2
         │  0x00007fb6a0bff084: vpaddd %xmm0,%xmm2,%xmm2
  0.68%  │  0x00007fb6a0bff088: vmovd  %xmm2,%r10d
  0.58%  │  0x00007fb6a0bff08d: vphaddd %ymm7,%ymm7,%ymm0
         │  0x00007fb6a0bff092: vphaddd %ymm2,%ymm0,%ymm0
  1.23%  │  0x00007fb6a0bff097: vextracti128 $0x1,%ymm0,%xmm2
  2.03%  │  0x00007fb6a0bff09d: vpaddd %xmm2,%xmm0,%xmm0
  0.50%  │  0x00007fb6a0bff0a1: vmovd  %r10d,%xmm2
         │  0x00007fb6a0bff0a6: vpaddd %xmm0,%xmm2,%xmm2
  0.69%  │  0x00007fb6a0bff0aa: vmovd  %xmm2,%r10d
  0.68%  │  0x00007fb6a0bff0af: vmovdqu 0xe8(%rsp),%ymm1
         │  0x00007fb6a0bff0b8: vphaddd %ymm1,%ymm1,%ymm0
  0.58%  │  0x00007fb6a0bff0bd: vphaddd %ymm2,%ymm0,%ymm0
  0.71%  │  0x00007fb6a0bff0c2: vextracti128 $0x1,%ymm0,%xmm2
  1.93%  │  0x00007fb6a0bff0c8: vpaddd %xmm2,%xmm0,%xmm0
  0.50%  │  0x00007fb6a0bff0cc: vmovd  %r10d,%xmm2
         │  0x00007fb6a0bff0d1: vpaddd %xmm0,%xmm2,%xmm2
  0.58%  │  0x00007fb6a0bff0d5: vmovd  %xmm2,%r10d
  0.56%  │  0x00007fb6a0bff0da: vmovdqu 0xc8(%rsp),%ymm1
         │  0x00007fb6a0bff0e3: vphaddd %ymm1,%ymm1,%ymm0
  0.64%  │  0x00007fb6a0bff0e8: vphaddd %ymm2,%ymm0,%ymm0
  0.37%  │  0x00007fb6a0bff0ed: vextracti128 $0x1,%ymm0,%xmm2
  1.87%  │  0x00007fb6a0bff0f3: vpaddd %xmm2,%xmm0,%xmm0
  0.64%  │  0x00007fb6a0bff0f7: vmovd  %r10d,%xmm2
         │  0x00007fb6a0bff0fc: vpaddd %xmm0,%xmm2,%xmm2
  0.56%  │  0x00007fb6a0bff100: vmovd  %xmm2,%r10d
  0.69%  │  0x00007fb6a0bff105: vmovdqu 0xa8(%rsp),%ymm1
         │  0x00007fb6a0bff10e: vphaddd %ymm1,%ymm1,%ymm0
  0.60%  │  0x00007fb6a0bff113: vphaddd %ymm5,%ymm0,%ymm0
         │  0x00007fb6a0bff118: vextracti128 $0x1,%ymm0,%xmm5
  0.64%  │  0x00007fb6a0bff11e: vpaddd %xmm5,%xmm0,%xmm0
         │  0x00007fb6a0bff122: vmovd  %r10d,%xmm5
         │  0x00007fb6a0bff127: vpaddd %xmm0,%xmm5,%xmm5
         │  0x00007fb6a0bff12b: vmovd  %xmm5,%r10d
  0.69%  │  0x00007fb6a0bff130: vmovdqu 0x88(%rsp),%ymm0
         │  0x00007fb6a0bff139: vphaddd %ymm0,%ymm0,%ymm1
  0.60%  │  0x00007fb6a0bff13e: vphaddd %ymm10,%ymm1,%ymm1  ;   {no_reloc}
         │  0x00007fb6a0bff143: vextracti128 $0x1,%ymm1,%xmm10
  1.87%  │  0x00007fb6a0bff149: vpaddd %xmm10,%xmm1,%xmm1
  0.64%  │  0x00007fb6a0bff14e: vmovd  %r10d,%xmm10
         │  0x00007fb6a0bff153: vpaddd %xmm1,%xmm10,%xmm10
  0.58%  │  0x00007fb6a0bff157: vmovd  %xmm10,%r10d
  0.50%  │  0x00007fb6a0bff15c: vmovdqu 0x68(%rsp),%ymm0
         │  0x00007fb6a0bff162: vphaddd %ymm0,%ymm0,%ymm14
  0.60%  │  0x00007fb6a0bff167: vphaddd %ymm11,%ymm14,%ymm14
  1.20%  │  0x00007fb6a0bff16c: vextracti128 $0x1,%ymm14,%xmm11
  1.91%  │  0x00007fb6a0bff172: vpaddd %xmm11,%xmm14,%xmm14
  0.71%  │  0x00007fb6a0bff177: vmovd  %r10d,%xmm11
         │  0x00007fb6a0bff17c: vpaddd %xmm14,%xmm11,%xmm11
  0.85%  │  0x00007fb6a0bff181: vmovd  %xmm11,%r10d
  0.48%  │  0x00007fb6a0bff186: vmovdqu 0x48(%rsp),%ymm1
         │  0x00007fb6a0bff18c: vphaddd %ymm1,%ymm1,%ymm0
  0.68%  │  0x00007fb6a0bff191: vphaddd %ymm2,%ymm0,%ymm0
  0.15%  │  0x00007fb6a0bff196: vextracti128 $0x1,%ymm0,%xmm2
  1.27%  │  0x00007fb6a0bff19c: vpaddd %xmm2,%xmm0,%xmm0
  0.42%  │  0x00007fb6a0bff1a0: vmovd  %r10d,%xmm2
         │  0x00007fb6a0bff1a5: vpaddd %xmm0,%xmm2,%xmm2
  0.44%  │  0x00007fb6a0bff1a9: vmovd  %xmm2,%r10d
  0.69%  │  0x00007fb6a0bff1ae: vmovdqu 0x28(%rsp),%ymm0
         │  0x00007fb6a0bff1b4: vphaddd %ymm0,%ymm0,%ymm2
  0.56%  │  0x00007fb6a0bff1b9: vphaddd %ymm13,%ymm2,%ymm2
         │  0x00007fb6a0bff1be: vextracti128 $0x1,%ymm2,%xmm13
  1.47%  │  0x00007fb6a0bff1c4: vpaddd %xmm13,%xmm2,%xmm2
  0.68%  │  0x00007fb6a0bff1c9: vmovd  %r10d,%xmm13
         │  0x00007fb6a0bff1ce: vpaddd %xmm2,%xmm13,%xmm13
  0.62%  │  0x00007fb6a0bff1d2: vmovd  %xmm13,%edx        ;*iadd {reexecute=0 rethrow=0 return_oop=0}
         │                                                ; - org.sample.DotProduct::dotProduct at 36 (line 32)
         │                                                ; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
  0.48%  │  0x00007fb6a0bff1d6: add    $0x80,%ebx         ;*iinc {reexecute=0 rethrow=0 return_oop=0}
         │                                                ; - org.sample.DotProduct::dotProduct at 38 (line 31)
         │                                                ; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
         │  0x00007fb6a0bff1dc: cmp    %esi,%ebx
         ╰  0x00007fb6a0bff1de: jl     0x00007fb6a0bfee20  ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
                                                          ; - org.sample.DotProduct::dotProduct at 10 (line 31)
                                                          ; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
            0x00007fb6a0bff1e4: mov    %r9d,%r10d
            0x00007fb6a0bff1e7: add    $0xfffffff9,%r10d
  0.08%     0x00007fb6a0bff1eb: cmp    %r10d,%r9d
            0x00007fb6a0bff1ee: mov    $0x80000000,%r9d
            0x00007fb6a0bff1f4: cmovl  %r9d,%r10d
            0x00007fb6a0bff1f8: cmp    %r10d,%ebx
            0x00007fb6a0bff1fb: jl     0x00007fb6a0bfecb0
            0x00007fb6a0bff201: jmpq   0x00007fb6a0bfece6
            0x00007fb6a0bff206: xor    %edx,%edx
            0x00007fb6a0bff208: jmpq   0x00007fb6a0bfed09
            0x00007fb6a0bff20d: mov    $0x1,%ebp          ;*aload_1 {reexecute=0 rethrow=0 return_oop=0}
                                                          ; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 36 (line 122)
            0x00007fb6a0bff212: vzeroupper 
....................................................................................................
 81.11%  <total for region 1>


More information about the hotspot-compiler-dev mailing list