RFR(S): 8200477: Integer dot product no longer autovectorised
Roland Westrelin
rwestrel at redhat.com
Mon Apr 23 16:18:57 UTC 2018
> I've tried to get this to work, both on AArch64 and x86, but without
> any success. No matter what I do I do not get any vector insns.
> Please tell me what you do to run this JMH test and see any vector
> instructions. An example of what you see would be nice. Does it
> need AVX, or some special level of SSE?
java -XX:-UseCountedLoopSafepoints -jar target/benchmarks.jar -prof perfasm -wi 5 -i 5 -f 1 org.sample.DotProduct.dotProduct
should work whether or not you have applied the patch for this RFR. See
below for sample output.
Roland.
Hottest code regions (>10.00% "cycles" events):
....[Hottest Region 1]..............................................................................
c2, level 4, org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub, version 770 (961 bytes)
0x00007fb6a0bfee00: cmp %esi,%r9d
0x00007fb6a0bfee03: mov $0x80000000,%r10d
0.10% 0x00007fb6a0bfee09: cmovl %r10d,%esi
0x00007fb6a0bfee0d: cmp %esi,%ebx
0x00007fb6a0bfee0f: jge 0x00007fb6a0bfece6
0x00007fb6a0bfee15: data16 data16 nopw 0x0(%rax,%rax,1)
;*iload_2 {reexecute=0 rethrow=0 return_oop=0}
; - org.sample.DotProduct::dotProduct at 13 (line 31)
; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
↗ 0x00007fb6a0bfee20: vmovdqu 0x1f0(%r11,%rbx,4),%ymm0
0.10% │ 0x00007fb6a0bfee2a: vpmulld 0x1f0(%rdi,%rbx,4),%ymm0,%ymm0
0.83% │ 0x00007fb6a0bfee34: vmovdqu %ymm0,0x28(%rsp)
0.06% │ 0x00007fb6a0bfee3a: vmovdqu 0x1d0(%r11,%rbx,4),%ymm0
0.66% │ 0x00007fb6a0bfee44: vpmulld 0x1d0(%rdi,%rbx,4),%ymm0,%ymm0
0.12% │ 0x00007fb6a0bfee4e: vmovdqu %ymm0,0x48(%rsp)
0.42% │ 0x00007fb6a0bfee54: vmovdqu 0x1b0(%r11,%rbx,4),%ymm0
│ 0x00007fb6a0bfee5e: vpmulld 0x1b0(%rdi,%rbx,4),%ymm0,%ymm0
0.83% │ 0x00007fb6a0bfee68: vmovdqu %ymm0,0x68(%rsp)
0.08% │ 0x00007fb6a0bfee6e: vmovdqu 0x190(%r11,%rbx,4),%ymm0
│ 0x00007fb6a0bfee78: vpmulld 0x190(%rdi,%rbx,4),%ymm0,%ymm0
1.23% │ 0x00007fb6a0bfee82: vmovdqu %ymm0,0x88(%rsp)
0.14% │ 0x00007fb6a0bfee8b: vmovdqu 0x170(%r11,%rbx,4),%ymm0
│ 0x00007fb6a0bfee95: vpmulld 0x170(%rdi,%rbx,4),%ymm0,%ymm0
1.64% │ 0x00007fb6a0bfee9f: vmovdqu %ymm0,0xa8(%rsp)
0.19% │ 0x00007fb6a0bfeea8: vmovdqu 0x150(%r11,%rbx,4),%ymm0
0.35% │ 0x00007fb6a0bfeeb2: vpmulld 0x150(%rdi,%rbx,4),%ymm0,%ymm0
0.25% │ 0x00007fb6a0bfeebc: vmovdqu %ymm0,0xc8(%rsp)
0.44% │ 0x00007fb6a0bfeec5: vmovdqu 0x130(%r11,%rbx,4),%ymm0
0.02% │ 0x00007fb6a0bfeecf: vpmulld 0x130(%rdi,%rbx,4),%ymm0,%ymm0
0.68% │ 0x00007fb6a0bfeed9: vmovdqu %ymm0,0xe8(%rsp)
0.10% │ 0x00007fb6a0bfeee2: vmovdqu 0x110(%r11,%rbx,4),%ymm0
│ 0x00007fb6a0bfeeec: vpmulld 0x110(%rdi,%rbx,4),%ymm0,%ymm7
0.66% │ 0x00007fb6a0bfeef6: vmovdqu 0xf0(%r11,%rbx,4),%ymm0
│ 0x00007fb6a0bfef00: vpmulld 0xf0(%rdi,%rbx,4),%ymm0,%ymm3
0.68% │ 0x00007fb6a0bfef0a: vmovdqu 0x10(%r11,%rbx,4),%ymm0
│ 0x00007fb6a0bfef11: vpmulld 0x10(%rdi,%rbx,4),%ymm0,%ymm15
2.06% │ 0x00007fb6a0bfef18: vmovdqu 0x30(%r11,%rbx,4),%ymm0
0.02% │ 0x00007fb6a0bfef1f: vpmulld 0x30(%rdi,%rbx,4),%ymm0,%ymm1
0.81% │ 0x00007fb6a0bfef26: vmovdqu 0x50(%r11,%rbx,4),%ymm0
│ 0x00007fb6a0bfef2d: vpmulld 0x50(%rdi,%rbx,4),%ymm0,%ymm12
0.89% │ 0x00007fb6a0bfef34: vmovdqu 0x70(%r11,%rbx,4),%ymm0
│ 0x00007fb6a0bfef3b: vpmulld 0x70(%rdi,%rbx,4),%ymm0,%ymm9
0.91% │ 0x00007fb6a0bfef42: vmovdqu 0x90(%r11,%rbx,4),%ymm0
│ 0x00007fb6a0bfef4c: vpmulld 0x90(%rdi,%rbx,4),%ymm0,%ymm8
0.91% │ 0x00007fb6a0bfef56: vmovdqu 0xb0(%r11,%rbx,4),%ymm0
2.74% │ 0x00007fb6a0bfef60: vpmulld 0xb0(%rdi,%rbx,4),%ymm0,%ymm6
2.01% │ 0x00007fb6a0bfef6a: vmovdqu 0xd0(%r11,%rbx,4),%ymm0 ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ ; - org.sample.DotProduct::dotProduct at 34 (line 32)
│ ; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
│ 0x00007fb6a0bfef74: vpmulld 0xd0(%rdi,%rbx,4),%ymm0,%ymm4
0.85% │ 0x00007fb6a0bfef7e: vphaddd %ymm15,%ymm15,%ymm0
0.04% │ 0x00007fb6a0bfef83: vphaddd %ymm2,%ymm0,%ymm0
0.50% │ 0x00007fb6a0bfef88: vextracti128 $0x1,%ymm0,%xmm2
│ 0x00007fb6a0bfef8e: vpaddd %xmm2,%xmm0,%xmm0
0.27% │ 0x00007fb6a0bfef92: vmovd %edx,%xmm2
│ 0x00007fb6a0bfef96: vpaddd %xmm0,%xmm2,%xmm2
0.06% │ 0x00007fb6a0bfef9a: vmovd %xmm2,%r10d
│ 0x00007fb6a0bfef9f: vphaddd %ymm1,%ymm1,%ymm0
0.60% │ 0x00007fb6a0bfefa4: vphaddd %ymm2,%ymm0,%ymm0
0.54% │ 0x00007fb6a0bfefa9: vextracti128 $0x1,%ymm0,%xmm2
0.83% │ 0x00007fb6a0bfefaf: vpaddd %xmm2,%xmm0,%xmm0
0.50% │ 0x00007fb6a0bfefb3: vmovd %r10d,%xmm2
0.14% │ 0x00007fb6a0bfefb8: vpaddd %xmm0,%xmm2,%xmm2
0.37% │ 0x00007fb6a0bfefbc: vmovd %xmm2,%r10d
0.60% │ 0x00007fb6a0bfefc1: vphaddd %ymm12,%ymm12,%ymm1
0.17% │ 0x00007fb6a0bfefc6: vphaddd %ymm0,%ymm1,%ymm1
0.64% │ 0x00007fb6a0bfefcb: vextracti128 $0x1,%ymm1,%xmm0
1.45% │ 0x00007fb6a0bfefd1: vpaddd %xmm0,%xmm1,%xmm1
0.50% │ 0x00007fb6a0bfefd5: vmovd %r10d,%xmm0
│ 0x00007fb6a0bfefda: vpaddd %xmm1,%xmm0,%xmm0
0.42% │ 0x00007fb6a0bfefde: vmovd %xmm0,%r10d
0.50% │ 0x00007fb6a0bfefe3: vphaddd %ymm9,%ymm9,%ymm0
0.21% │ 0x00007fb6a0bfefe8: vphaddd %ymm2,%ymm0,%ymm0
0.42% │ 0x00007fb6a0bfefed: vextracti128 $0x1,%ymm0,%xmm2
0.91% │ 0x00007fb6a0bfeff3: vpaddd %xmm2,%xmm0,%xmm0
0.25% │ 0x00007fb6a0bfeff7: vmovd %r10d,%xmm2
0.08% │ 0x00007fb6a0bfeffc: vpaddd %xmm0,%xmm2,%xmm2
0.54% │ 0x00007fb6a0bff000: vmovd %xmm2,%r10d
0.56% │ 0x00007fb6a0bff005: vphaddd %ymm8,%ymm8,%ymm1
0.12% │ 0x00007fb6a0bff00a: vphaddd %ymm0,%ymm1,%ymm1
0.73% │ 0x00007fb6a0bff00f: vextracti128 $0x1,%ymm1,%xmm0
1.02% │ 0x00007fb6a0bff015: vpaddd %xmm0,%xmm1,%xmm1
0.56% │ 0x00007fb6a0bff019: vmovd %r10d,%xmm0
│ 0x00007fb6a0bff01e: vpaddd %xmm1,%xmm0,%xmm0
0.69% │ 0x00007fb6a0bff022: vmovd %xmm0,%r10d
0.35% │ 0x00007fb6a0bff027: vphaddd %ymm6,%ymm6,%ymm0
0.15% │ 0x00007fb6a0bff02c: vphaddd %ymm2,%ymm0,%ymm0
0.56% │ 0x00007fb6a0bff031: vextracti128 $0x1,%ymm0,%xmm2
1.10% │ 0x00007fb6a0bff037: vpaddd %xmm2,%xmm0,%xmm0
0.58% │ 0x00007fb6a0bff03b: vmovd %r10d,%xmm2
0.15% │ 0x00007fb6a0bff040: vpaddd %xmm0,%xmm2,%xmm2
0.50% │ 0x00007fb6a0bff044: vmovd %xmm2,%r10d
0.44% │ 0x00007fb6a0bff049: vphaddd %ymm4,%ymm4,%ymm1
0.15% │ 0x00007fb6a0bff04e: vphaddd %ymm0,%ymm1,%ymm1
1.56% │ 0x00007fb6a0bff053: vextracti128 $0x1,%ymm1,%xmm0
1.29% │ 0x00007fb6a0bff059: vpaddd %xmm0,%xmm1,%xmm1
0.60% │ 0x00007fb6a0bff05d: vmovd %r10d,%xmm0
│ 0x00007fb6a0bff062: vpaddd %xmm1,%xmm0,%xmm0
0.69% │ 0x00007fb6a0bff066: vmovd %xmm0,%r10d
0.56% │ 0x00007fb6a0bff06b: vphaddd %ymm3,%ymm3,%ymm0
0.06% │ 0x00007fb6a0bff070: vphaddd %ymm2,%ymm0,%ymm0
0.48% │ 0x00007fb6a0bff075: vextracti128 $0x1,%ymm0,%xmm2
0.95% │ 0x00007fb6a0bff07b: vpaddd %xmm2,%xmm0,%xmm0
0.66% │ 0x00007fb6a0bff07f: vmovd %r10d,%xmm2
│ 0x00007fb6a0bff084: vpaddd %xmm0,%xmm2,%xmm2
0.68% │ 0x00007fb6a0bff088: vmovd %xmm2,%r10d
0.58% │ 0x00007fb6a0bff08d: vphaddd %ymm7,%ymm7,%ymm0
│ 0x00007fb6a0bff092: vphaddd %ymm2,%ymm0,%ymm0
1.23% │ 0x00007fb6a0bff097: vextracti128 $0x1,%ymm0,%xmm2
2.03% │ 0x00007fb6a0bff09d: vpaddd %xmm2,%xmm0,%xmm0
0.50% │ 0x00007fb6a0bff0a1: vmovd %r10d,%xmm2
│ 0x00007fb6a0bff0a6: vpaddd %xmm0,%xmm2,%xmm2
0.69% │ 0x00007fb6a0bff0aa: vmovd %xmm2,%r10d
0.68% │ 0x00007fb6a0bff0af: vmovdqu 0xe8(%rsp),%ymm1
│ 0x00007fb6a0bff0b8: vphaddd %ymm1,%ymm1,%ymm0
0.58% │ 0x00007fb6a0bff0bd: vphaddd %ymm2,%ymm0,%ymm0
0.71% │ 0x00007fb6a0bff0c2: vextracti128 $0x1,%ymm0,%xmm2
1.93% │ 0x00007fb6a0bff0c8: vpaddd %xmm2,%xmm0,%xmm0
0.50% │ 0x00007fb6a0bff0cc: vmovd %r10d,%xmm2
│ 0x00007fb6a0bff0d1: vpaddd %xmm0,%xmm2,%xmm2
0.58% │ 0x00007fb6a0bff0d5: vmovd %xmm2,%r10d
0.56% │ 0x00007fb6a0bff0da: vmovdqu 0xc8(%rsp),%ymm1
│ 0x00007fb6a0bff0e3: vphaddd %ymm1,%ymm1,%ymm0
0.64% │ 0x00007fb6a0bff0e8: vphaddd %ymm2,%ymm0,%ymm0
0.37% │ 0x00007fb6a0bff0ed: vextracti128 $0x1,%ymm0,%xmm2
1.87% │ 0x00007fb6a0bff0f3: vpaddd %xmm2,%xmm0,%xmm0
0.64% │ 0x00007fb6a0bff0f7: vmovd %r10d,%xmm2
│ 0x00007fb6a0bff0fc: vpaddd %xmm0,%xmm2,%xmm2
0.56% │ 0x00007fb6a0bff100: vmovd %xmm2,%r10d
0.69% │ 0x00007fb6a0bff105: vmovdqu 0xa8(%rsp),%ymm1
│ 0x00007fb6a0bff10e: vphaddd %ymm1,%ymm1,%ymm0
0.60% │ 0x00007fb6a0bff113: vphaddd %ymm5,%ymm0,%ymm0
│ 0x00007fb6a0bff118: vextracti128 $0x1,%ymm0,%xmm5
0.64% │ 0x00007fb6a0bff11e: vpaddd %xmm5,%xmm0,%xmm0
│ 0x00007fb6a0bff122: vmovd %r10d,%xmm5
│ 0x00007fb6a0bff127: vpaddd %xmm0,%xmm5,%xmm5
│ 0x00007fb6a0bff12b: vmovd %xmm5,%r10d
0.69% │ 0x00007fb6a0bff130: vmovdqu 0x88(%rsp),%ymm0
│ 0x00007fb6a0bff139: vphaddd %ymm0,%ymm0,%ymm1
0.60% │ 0x00007fb6a0bff13e: vphaddd %ymm10,%ymm1,%ymm1 ; {no_reloc}
│ 0x00007fb6a0bff143: vextracti128 $0x1,%ymm1,%xmm10
1.87% │ 0x00007fb6a0bff149: vpaddd %xmm10,%xmm1,%xmm1
0.64% │ 0x00007fb6a0bff14e: vmovd %r10d,%xmm10
│ 0x00007fb6a0bff153: vpaddd %xmm1,%xmm10,%xmm10
0.58% │ 0x00007fb6a0bff157: vmovd %xmm10,%r10d
0.50% │ 0x00007fb6a0bff15c: vmovdqu 0x68(%rsp),%ymm0
│ 0x00007fb6a0bff162: vphaddd %ymm0,%ymm0,%ymm14
0.60% │ 0x00007fb6a0bff167: vphaddd %ymm11,%ymm14,%ymm14
1.20% │ 0x00007fb6a0bff16c: vextracti128 $0x1,%ymm14,%xmm11
1.91% │ 0x00007fb6a0bff172: vpaddd %xmm11,%xmm14,%xmm14
0.71% │ 0x00007fb6a0bff177: vmovd %r10d,%xmm11
│ 0x00007fb6a0bff17c: vpaddd %xmm14,%xmm11,%xmm11
0.85% │ 0x00007fb6a0bff181: vmovd %xmm11,%r10d
0.48% │ 0x00007fb6a0bff186: vmovdqu 0x48(%rsp),%ymm1
│ 0x00007fb6a0bff18c: vphaddd %ymm1,%ymm1,%ymm0
0.68% │ 0x00007fb6a0bff191: vphaddd %ymm2,%ymm0,%ymm0
0.15% │ 0x00007fb6a0bff196: vextracti128 $0x1,%ymm0,%xmm2
1.27% │ 0x00007fb6a0bff19c: vpaddd %xmm2,%xmm0,%xmm0
0.42% │ 0x00007fb6a0bff1a0: vmovd %r10d,%xmm2
│ 0x00007fb6a0bff1a5: vpaddd %xmm0,%xmm2,%xmm2
0.44% │ 0x00007fb6a0bff1a9: vmovd %xmm2,%r10d
0.69% │ 0x00007fb6a0bff1ae: vmovdqu 0x28(%rsp),%ymm0
│ 0x00007fb6a0bff1b4: vphaddd %ymm0,%ymm0,%ymm2
0.56% │ 0x00007fb6a0bff1b9: vphaddd %ymm13,%ymm2,%ymm2
│ 0x00007fb6a0bff1be: vextracti128 $0x1,%ymm2,%xmm13
1.47% │ 0x00007fb6a0bff1c4: vpaddd %xmm13,%xmm2,%xmm2
0.68% │ 0x00007fb6a0bff1c9: vmovd %r10d,%xmm13
│ 0x00007fb6a0bff1ce: vpaddd %xmm2,%xmm13,%xmm13
0.62% │ 0x00007fb6a0bff1d2: vmovd %xmm13,%edx ;*iadd {reexecute=0 rethrow=0 return_oop=0}
│ ; - org.sample.DotProduct::dotProduct at 36 (line 32)
│ ; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
0.48% │ 0x00007fb6a0bff1d6: add $0x80,%ebx ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ; - org.sample.DotProduct::dotProduct at 38 (line 31)
│ ; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
│ 0x00007fb6a0bff1dc: cmp %esi,%ebx
╰ 0x00007fb6a0bff1de: jl 0x00007fb6a0bfee20 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
; - org.sample.DotProduct::dotProduct at 10 (line 31)
; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 17 (line 119)
0x00007fb6a0bff1e4: mov %r9d,%r10d
0x00007fb6a0bff1e7: add $0xfffffff9,%r10d
0.08% 0x00007fb6a0bff1eb: cmp %r10d,%r9d
0x00007fb6a0bff1ee: mov $0x80000000,%r9d
0x00007fb6a0bff1f4: cmovl %r9d,%r10d
0x00007fb6a0bff1f8: cmp %r10d,%ebx
0x00007fb6a0bff1fb: jl 0x00007fb6a0bfecb0
0x00007fb6a0bff201: jmpq 0x00007fb6a0bfece6
0x00007fb6a0bff206: xor %edx,%edx
0x00007fb6a0bff208: jmpq 0x00007fb6a0bfed09
0x00007fb6a0bff20d: mov $0x1,%ebp ;*aload_1 {reexecute=0 rethrow=0 return_oop=0}
; - org.sample.generated.DotProduct_dotProduct_jmhTest::dotProduct_thrpt_jmhStub at 36 (line 122)
0x00007fb6a0bff212: vzeroupper
....................................................................................................
81.11% <total for region 1>
More information about the hotspot-compiler-dev
mailing list