RFR: 8256488: [aarch64] Use ldpq/stpq instead of ld4/st4 for small copies in StubGenerator::copy_memory

Eugene Astigeevich github.com+42899633+eastig at openjdk.java.net
Mon Nov 23 21:07:05 UTC 2020


On Sun, 22 Nov 2020 19:56:50 GMT, Eugene Astigeevich <github.com+42899633+eastig at openjdk.org> wrote:

>>> Thank you! Please allow for a few business days to verify that your employer has signed the OCA. Also, please note that pull requests that are pending an OCA check will not usually be evaluated, so your patience is appreciated!
>> 
>> Evgeny is part of the Amazon Corretto team and is covered by Amazon's OCA.
>
> JMH microbenchmark results for testByte:
> |Benchmark|Length|Count|Units|ld4 vs simd_off|ldpq vs simd_off|ldpq vs ld4|Maximum Relative Error|
> |-|-|-|-|-|-|-|-|
> |ArrayCopyAligned.testByte|65|25|ns/op|49.34%|-1.91%|-34.32%|0.37%|
> |ArrayCopyAligned.testByte|66|25|ns/op|49.18%|-1.95%|-34.28%|0.27%|
> |ArrayCopyAligned.testByte|67|25|ns/op|49.29%|-1.82%|-34.24%|0.38%|
> |ArrayCopyAligned.testByte|68|25|ns/op|51.10%|-2.61%|-35.55%|0.59%|
> |ArrayCopyAligned.testByte|69|25|ns/op|49.22%|-1.82%|-34.21%|0.36%|
> |ArrayCopyAligned.testByte|70|25|ns/op|49.38%|-1.72%|-34.21%|0.38%|
> |ArrayCopyAligned.testByte|71|25|ns/op|49.34%|-2.06%|-34.42%|0.30%|
> |ArrayCopyAligned.testByte|72|25|ns/op|50.97%|-2.78%|-35.60%|0.65%|
> |ArrayCopyAligned.testByte|73|25|ns/op|50.01%|-1.62%|-34.42%|0.37%|
> |ArrayCopyAligned.testByte|74|25|ns/op|49.81%|-1.84%|-34.48%|0.35%|
> |ArrayCopyAligned.testByte|75|25|ns/op|49.85%|-1.86%|-34.51%|0.35%|
> |ArrayCopyAligned.testByte|76|25|ns/op|51.33%|-2.54%|-35.60%|0.59%|
> |ArrayCopyAligned.testByte|77|25|ns/op|49.72%|-1.81%|-34.42%|0.41%|
> |ArrayCopyAligned.testByte|78|25|ns/op|49.87%|-1.74%|-34.44%|0.37%|
> |ArrayCopyAligned.testByte|79|25|ns/op|49.67%|-1.91%|-34.47%|0.46%|
> |ArrayCopyAligned.testByte|80|25|ns/op|51.35%|-2.77%|-35.76%|0.65%|
> |ArrayCopyAligned.testByte|81|25|ns/op|8.70%|-29.07%|-34.75%|0.35%|
> |ArrayCopyAligned.testByte|82|25|ns/op|13.64%|-25.96%|-34.85%|0.44%|
> |ArrayCopyAligned.testByte|83|25|ns/op|12.04%|-26.80%|-34.66%|0.37%|
> |ArrayCopyAligned.testByte|84|25|ns/op|13.63%|-26.54%|-35.35%|0.46%|
> |ArrayCopyAligned.testByte|85|25|ns/op|11.52%|-27.18%|-34.71%|0.52%|
> |ArrayCopyAligned.testByte|86|25|ns/op|11.59%|-27.15%|-34.71%|0.29%|
> |ArrayCopyAligned.testByte|87|25|ns/op|10.47%|-27.82%|-34.66%|0.29%|
> |ArrayCopyAligned.testByte|88|25|ns/op|8.69%|-29.65%|-35.27%|0.20%|
> |ArrayCopyAligned.testByte|89|25|ns/op|8.70%|-28.86%|-34.56%|0.66%|
> |ArrayCopyAligned.testByte|90|25|ns/op|13.01%|-26.28%|-34.77%|0.28%|
> |ArrayCopyAligned.testByte|91|25|ns/op|10.96%|-27.62%|-34.77%|0.34%|
> |ArrayCopyAligned.testByte|92|25|ns/op|13.26%|-26.76%|-35.33%|0.32%|
> |ArrayCopyAligned.testByte|93|25|ns/op|10.67%|-27.61%|-34.59%|0.63%|
> |ArrayCopyAligned.testByte|94|25|ns/op|11.05%|-27.62%|-34.83%|0.33%|
> |ArrayCopyAligned.testByte|95|25|ns/op|6.69%|-30.16%|-34.54%|0.61%|
> |ArrayCopyAligned.testByte|96|25|ns/op|8.70%|-30.14%|-35.73%|0.23%|
> |ArrayCopyUnalignedBoth.testByte|65|25|ns/op|37.93%|2.64%|-25.59%|0.92%|
> |ArrayCopyUnalignedBoth.testByte|66|25|ns/op|37.58%|-1.15%|-28.15%|0.57%|
> |ArrayCopyUnalignedBoth.testByte|67|25|ns/op|39.73%|7.31%|-23.20%|1.03%|
> |ArrayCopyUnalignedBoth.testByte|68|25|ns/op|37.07%|3.08%|-24.80%|0.88%|
> |ArrayCopyUnalignedBoth.testByte|69|25|ns/op|37.80%|3.15%|-25.15%|1.16%|
> |ArrayCopyUnalignedBoth.testByte|70|25|ns/op|37.48%|-1.18%|-28.12%|0.74%|
> |ArrayCopyUnalignedBoth.testByte|71|25|ns/op|39.83%|7.74%|-22.95%|1.00%|
> |ArrayCopyUnalignedBoth.testByte|72|25|ns/op|37.29%|3.87%|-24.34%|1.03%|
> |ArrayCopyUnalignedBoth.testByte|73|25|ns/op|37.71%|3.00%|-25.21%|0.89%|
> |ArrayCopyUnalignedBoth.testByte|74|25|ns/op|37.51%|-1.04%|-28.03%|0.79%|
> |ArrayCopyUnalignedBoth.testByte|75|25|ns/op|39.83%|7.33%|-23.24%|1.05%|
> |ArrayCopyUnalignedBoth.testByte|76|25|ns/op|37.47%|3.41%|-24.78%|0.97%|
> |ArrayCopyUnalignedBoth.testByte|77|25|ns/op|37.59%|3.71%|-24.63%|0.96%|
> |ArrayCopyUnalignedBoth.testByte|78|25|ns/op|39.23%|-5.11%|-31.84%|0.18%|
> |ArrayCopyUnalignedBoth.testByte|79|25|ns/op|40.30%|-5.81%|-32.86%|0.19%|
> |ArrayCopyUnalignedBoth.testByte|80|25|ns/op|37.41%|-4.85%|-30.75%|0.22%|
> |ArrayCopyUnalignedBoth.testByte|81|25|ns/op|-3.82%|-33.50%|-30.86%|0.17%|
> |ArrayCopyUnalignedBoth.testByte|82|25|ns/op|-4.27%|-34.19%|-31.26%|0.23%|
> |ArrayCopyUnalignedBoth.testByte|83|25|ns/op|-3.83%|-34.43%|-31.82%|0.23%|
> |ArrayCopyUnalignedBoth.testByte|84|25|ns/op|-4.29%|-33.78%|-30.81%|0.14%|
> |ArrayCopyUnalignedBoth.testByte|85|25|ns/op|-4.13%|-33.67%|-30.82%|0.15%|
> |ArrayCopyUnalignedBoth.testByte|86|25|ns/op|-7.46%|-36.44%|-31.31%|0.28%|
> |ArrayCopyUnalignedBoth.testByte|87|25|ns/op|-3.85%|-34.39%|-31.76%|0.18%|
> |ArrayCopyUnalignedBoth.testByte|88|25|ns/op|-4.30%|-33.77%|-30.79%|0.19%|
> |ArrayCopyUnalignedBoth.testByte|89|25|ns/op|-4.12%|-33.74%|-30.90%|0.16%|
> |ArrayCopyUnalignedBoth.testByte|90|25|ns/op|-7.51%|-36.41%|-31.25%|0.63%|
> |ArrayCopyUnalignedBoth.testByte|91|25|ns/op|-4.19%|-34.64%|-31.77%|0.16%|
> |ArrayCopyUnalignedBoth.testByte|92|25|ns/op|-7.45%|-35.98%|-30.83%|0.42%|
> |ArrayCopyUnalignedBoth.testByte|93|25|ns/op|-7.21%|-35.76%|-30.76%|0.47%|
> |ArrayCopyUnalignedBoth.testByte|94|25|ns/op|-9.69%|-38.64%|-32.05%|0.38%|
> |ArrayCopyUnalignedBoth.testByte|95|25|ns/op|-3.85%|-35.64%|-33.06%|0.37%|
> |ArrayCopyUnalignedBoth.testByte|96|25|ns/op|-4.89%|-34.30%|-30.93%|0.25%|
> |ArrayCopyUnalignedDst.testByte|65|25|ns/op|48.48%|18.07%|-20.48%|1.29%|
> |ArrayCopyUnalignedDst.testByte|66|25|ns/op|48.79%|17.99%|-20.70%|1.58%|
> |ArrayCopyUnalignedDst.testByte|67|25|ns/op|49.03%|16.96%|-21.52%|3.64%|
> |ArrayCopyUnalignedDst.testByte|68|25|ns/op|49.99%|23.55%|-17.63%|0.97%|
> |ArrayCopyUnalignedDst.testByte|69|25|ns/op|49.03%|22.42%|-17.86%|1.33%|
> |ArrayCopyUnalignedDst.testByte|70|25|ns/op|49.19%|22.68%|-17.77%|1.23%|
> |ArrayCopyUnalignedDst.testByte|71|25|ns/op|48.99%|16.72%|-21.66%|3.30%|
> |ArrayCopyUnalignedDst.testByte|72|25|ns/op|50.08%|24.67%|-16.93%|1.02%|
> |ArrayCopyUnalignedDst.testByte|73|25|ns/op|49.69%|22.92%|-17.88%|1.29%|
> |ArrayCopyUnalignedDst.testByte|74|25|ns/op|49.57%|23.24%|-17.60%|1.14%|
> |ArrayCopyUnalignedDst.testByte|75|25|ns/op|49.84%|18.77%|-20.74%|3.32%|
> |ArrayCopyUnalignedDst.testByte|76|25|ns/op|50.06%|24.72%|-16.89%|1.09%|
> |ArrayCopyUnalignedDst.testByte|77|25|ns/op|49.70%|23.13%|-17.75%|1.24%|
> |ArrayCopyUnalignedDst.testByte|78|25|ns/op|49.70%|23.31%|-17.63%|1.37%|
> |ArrayCopyUnalignedDst.testByte|79|25|ns/op|49.83%|-2.56%|-34.97%|0.55%|
> |ArrayCopyUnalignedDst.testByte|80|25|ns/op|49.84%|-3.07%|-35.31%|0.27%|
> |ArrayCopyUnalignedDst.testByte|81|25|ns/op|8.70%|-28.50%|-34.22%|0.20%|
> |ArrayCopyUnalignedDst.testByte|82|25|ns/op|13.63%|-24.95%|-33.95%|0.48%|
> |ArrayCopyUnalignedDst.testByte|83|25|ns/op|12.38%|-26.46%|-34.56%|0.25%|
> |ArrayCopyUnalignedDst.testByte|84|25|ns/op|13.63%|-26.45%|-35.27%|0.39%|
> |ArrayCopyUnalignedDst.testByte|85|25|ns/op|10.67%|-27.24%|-34.26%|0.23%|
> |ArrayCopyUnalignedDst.testByte|86|25|ns/op|11.70%|-26.56%|-34.25%|0.20%|
> |ArrayCopyUnalignedDst.testByte|87|25|ns/op|10.51%|-27.65%|-34.53%|0.27%|
> |ArrayCopyUnalignedDst.testByte|88|25|ns/op|8.69%|-29.76%|-35.38%|0.17%|
> |ArrayCopyUnalignedDst.testByte|89|25|ns/op|8.69%|-28.64%|-34.35%|0.24%|
> |ArrayCopyUnalignedDst.testByte|90|25|ns/op|13.03%|-25.69%|-34.25%|0.26%|
> |ArrayCopyUnalignedDst.testByte|91|25|ns/op|11.09%|-27.20%|-34.47%|0.26%|
> |ArrayCopyUnalignedDst.testByte|92|25|ns/op|13.46%|-26.68%|-35.38%|0.20%|
> |ArrayCopyUnalignedDst.testByte|93|25|ns/op|10.75%|-27.34%|-34.39%|0.22%|
> |ArrayCopyUnalignedDst.testByte|94|25|ns/op|11.07%|-27.00%|-34.27%|0.27%|
> |ArrayCopyUnalignedDst.testByte|95|25|ns/op|6.67%|-30.77%|-35.11%|0.25%|
> |ArrayCopyUnalignedDst.testByte|96|25|ns/op|8.70%|-30.01%|-35.61%|0.17%|
> |ArrayCopyUnalignedSrc.testByte|65|25|ns/op|38.80%|-4.97%|-31.53%|0.15%|
> |ArrayCopyUnalignedSrc.testByte|66|25|ns/op|38.86%|-4.86%|-31.49%|0.16%|
> |ArrayCopyUnalignedSrc.testByte|67|25|ns/op|41.44%|-5.85%|-33.44%|0.48%|
> |ArrayCopyUnalignedSrc.testByte|68|25|ns/op|40.06%|-4.59%|-31.88%|0.16%|
> |ArrayCopyUnalignedSrc.testByte|69|25|ns/op|38.98%|-4.64%|-31.39%|0.29%|
> |ArrayCopyUnalignedSrc.testByte|70|25|ns/op|39.00%|-4.60%|-31.37%|0.26%|
> |ArrayCopyUnalignedSrc.testByte|71|25|ns/op|41.20%|-5.49%|-33.07%|0.27%|
> |ArrayCopyUnalignedSrc.testByte|72|25|ns/op|40.06%|-4.56%|-31.86%|0.21%|
> |ArrayCopyUnalignedSrc.testByte|73|25|ns/op|38.57%|-4.92%|-31.38%|0.19%|
> |ArrayCopyUnalignedSrc.testByte|74|25|ns/op|38.70%|-4.83%|-31.38%|0.25%|
> |ArrayCopyUnalignedSrc.testByte|75|25|ns/op|41.26%|-5.52%|-33.12%|0.18%|
> |ArrayCopyUnalignedSrc.testByte|76|25|ns/op|39.51%|-4.77%|-31.74%|0.20%|
> |ArrayCopyUnalignedSrc.testByte|77|25|ns/op|38.54%|-4.81%|-31.29%|0.32%|
> |ArrayCopyUnalignedSrc.testByte|78|25|ns/op|38.29%|-5.12%|-31.39%|0.22%|
> |ArrayCopyUnalignedSrc.testByte|79|25|ns/op|40.90%|-5.56%|-32.97%|0.33%|
> |ArrayCopyUnalignedSrc.testByte|80|25|ns/op|40.10%|-4.82%|-32.06%|0.22%|
> |ArrayCopyUnalignedSrc.testByte|81|25|ns/op|-3.84%|-34.15%|-31.52%|0.18%|
> |ArrayCopyUnalignedSrc.testByte|82|25|ns/op|-3.89%|-34.12%|-31.45%|0.28%|
> |ArrayCopyUnalignedSrc.testByte|83|25|ns/op|-3.85%|-36.20%|-33.64%|0.35%|
> |ArrayCopyUnalignedSrc.testByte|84|25|ns/op|-3.85%|-34.71%|-32.09%|0.34%|
> |ArrayCopyUnalignedSrc.testByte|85|25|ns/op|-3.83%|-34.11%|-31.49%|0.29%|
> |ArrayCopyUnalignedSrc.testByte|86|25|ns/op|-3.83%|-34.18%|-31.56%|0.38%|
> |ArrayCopyUnalignedSrc.testByte|87|25|ns/op|-3.84%|-36.04%|-33.48%|0.20%|
> |ArrayCopyUnalignedSrc.testByte|88|25|ns/op|-3.84%|-34.65%|-32.04%|0.15%|
> |ArrayCopyUnalignedSrc.testByte|89|25|ns/op|-3.84%|-34.03%|-31.39%|0.16%|
> |ArrayCopyUnalignedSrc.testByte|90|25|ns/op|-4.32%|-34.37%|-31.40%|0.19%|
> |ArrayCopyUnalignedSrc.testByte|91|25|ns/op|-3.84%|-36.08%|-33.52%|0.36%|
> |ArrayCopyUnalignedSrc.testByte|92|25|ns/op|-3.84%|-34.41%|-31.79%|0.38%|
> |ArrayCopyUnalignedSrc.testByte|93|25|ns/op|-3.85%|-34.04%|-31.40%|0.19%|
> |ArrayCopyUnalignedSrc.testByte|94|25|ns/op|-3.82%|-34.07%|-31.45%|0.20%|
> |ArrayCopyUnalignedSrc.testByte|95|25|ns/op|-3.84%|-36.01%|-33.45%|0.32%|
> |ArrayCopyUnalignedSrc.testByte|96|25|ns/op|-3.88%|-34.93%|-32.30%|0.32%|

JMH microbenchmark results for testChar (times are in ns/op, so a negative percentage means ldpq is faster than ld4):
|Benchmark|Length|Count|Units|ldpq vs ld4|Maximum Relative Error|
|-|-|-|-|-|-|
|ArrayCopyAligned.testChar|33|25|ns/op|-29.41%|0.73%|
|ArrayCopyAligned.testChar|34|25|ns/op|-30.14%|0.99%|
|ArrayCopyAligned.testChar|35|25|ns/op|-29.37%|0.44%|
|ArrayCopyAligned.testChar|36|25|ns/op|-29.85%|0.70%|
|ArrayCopyAligned.testChar|37|25|ns/op|-29.33%|0.65%|
|ArrayCopyAligned.testChar|38|25|ns/op|-29.69%|0.52%|
|ArrayCopyAligned.testChar|39|25|ns/op|-29.44%|0.79%|
|ArrayCopyAligned.testChar|40|25|ns/op|-29.82%|0.82%|
|ArrayCopyAligned.testChar|41|25|ns/op|-29.62%|0.74%|
|ArrayCopyAligned.testChar|42|25|ns/op|-29.88%|0.61%|
|ArrayCopyAligned.testChar|43|25|ns/op|-29.19%|0.64%|
|ArrayCopyAligned.testChar|44|25|ns/op|-29.89%|0.71%|
|ArrayCopyAligned.testChar|45|25|ns/op|-29.52%|0.80%|
|ArrayCopyAligned.testChar|46|25|ns/op|-29.71%|0.58%|
|ArrayCopyAligned.testChar|47|25|ns/op|-29.49%|0.71%|
|ArrayCopyAligned.testChar|48|25|ns/op|-29.89%|0.91%|
|ArrayCopyUnalignedBoth.testChar|33|25|ns/op|-29.04%|0.87%|
|ArrayCopyUnalignedBoth.testChar|34|25|ns/op|-29.21%|0.70%|
|ArrayCopyUnalignedBoth.testChar|35|25|ns/op|-27.70%|1.22%|
|ArrayCopyUnalignedBoth.testChar|36|25|ns/op|-28.68%|1.86%|
|ArrayCopyUnalignedBoth.testChar|37|25|ns/op|-27.81%|1.43%|
|ArrayCopyUnalignedBoth.testChar|38|25|ns/op|-29.54%|0.61%|
|ArrayCopyUnalignedBoth.testChar|39|25|ns/op|-29.89%|0.85%|
|ArrayCopyUnalignedBoth.testChar|40|25|ns/op|-30.97%|0.68%|
|ArrayCopyUnalignedBoth.testChar|41|25|ns/op|-29.96%|0.78%|
|ArrayCopyUnalignedBoth.testChar|42|25|ns/op|-30.79%|0.81%|
|ArrayCopyUnalignedBoth.testChar|43|25|ns/op|-29.57%|0.58%|
|ArrayCopyUnalignedBoth.testChar|44|25|ns/op|-31.02%|0.34%|
|ArrayCopyUnalignedBoth.testChar|45|25|ns/op|-30.05%|0.75%|
|ArrayCopyUnalignedBoth.testChar|46|25|ns/op|-30.56%|0.55%|
|ArrayCopyUnalignedBoth.testChar|47|25|ns/op|-30.39%|0.52%|
|ArrayCopyUnalignedBoth.testChar|48|25|ns/op|-30.94%|0.38%|
|ArrayCopyUnalignedDst.testChar|33|25|ns/op|-19.97%|1.08%|
|ArrayCopyUnalignedDst.testChar|34|25|ns/op|-16.05%|0.89%|
|ArrayCopyUnalignedDst.testChar|35|25|ns/op|-20.83%|1.26%|
|ArrayCopyUnalignedDst.testChar|36|25|ns/op|-16.09%|0.77%|
|ArrayCopyUnalignedDst.testChar|37|25|ns/op|-20.11%|1.24%|
|ArrayCopyUnalignedDst.testChar|38|25|ns/op|-15.26%|0.91%|
|ArrayCopyUnalignedDst.testChar|39|25|ns/op|-29.54%|0.56%|
|ArrayCopyUnalignedDst.testChar|40|25|ns/op|-29.53%|0.77%|
|ArrayCopyUnalignedDst.testChar|41|25|ns/op|-29.52%|0.87%|
|ArrayCopyUnalignedDst.testChar|42|25|ns/op|-29.45%|0.77%|
|ArrayCopyUnalignedDst.testChar|43|25|ns/op|-29.57%|1.06%|
|ArrayCopyUnalignedDst.testChar|44|25|ns/op|-29.69%|0.61%|
|ArrayCopyUnalignedDst.testChar|45|25|ns/op|-29.52%|0.83%|
|ArrayCopyUnalignedDst.testChar|46|25|ns/op|-29.31%|0.48%|
|ArrayCopyUnalignedDst.testChar|47|25|ns/op|-29.64%|0.50%|
|ArrayCopyUnalignedDst.testChar|48|25|ns/op|-29.75%|0.22%|
|ArrayCopyUnalignedSrc.testChar|33|25|ns/op|-29.33%|0.76%|
|ArrayCopyUnalignedSrc.testChar|34|25|ns/op|-30.11%|0.39%|
|ArrayCopyUnalignedSrc.testChar|35|25|ns/op|-29.54%|0.80%|
|ArrayCopyUnalignedSrc.testChar|36|25|ns/op|-30.07%|0.36%|
|ArrayCopyUnalignedSrc.testChar|37|25|ns/op|-29.41%|0.40%|
|ArrayCopyUnalignedSrc.testChar|38|25|ns/op|-29.95%|0.32%|
|ArrayCopyUnalignedSrc.testChar|39|25|ns/op|-29.39%|0.82%|
|ArrayCopyUnalignedSrc.testChar|40|25|ns/op|-29.85%|0.69%|
|ArrayCopyUnalignedSrc.testChar|41|25|ns/op|-28.93%|0.67%|
|ArrayCopyUnalignedSrc.testChar|42|25|ns/op|-29.50%|0.70%|
|ArrayCopyUnalignedSrc.testChar|43|25|ns/op|-28.95%|0.71%|
|ArrayCopyUnalignedSrc.testChar|44|25|ns/op|-29.75%|0.66%|
|ArrayCopyUnalignedSrc.testChar|45|25|ns/op|-29.02%|0.87%|
|ArrayCopyUnalignedSrc.testChar|46|25|ns/op|-29.76%|0.69%|
|ArrayCopyUnalignedSrc.testChar|47|25|ns/op|-29.37%|0.50%|
|ArrayCopyUnalignedSrc.testChar|48|25|ns/op|-29.71%|0.73%|

-------------

PR: https://git.openjdk.java.net/jdk/pull/1293


More information about the hotspot-compiler-dev mailing list