[vectorIntrinsics] RFR: 8262498: More than 50% performance degradation of pow operator due to call with svml intrinsic after JDK-8261267
Sandhya Viswanathan
sviswanathan at openjdk.java.net
Thu Mar 4 00:57:52 UTC 2021
On Sat, 27 Feb 2021 13:31:08 GMT, Jie Fu <jiefu at openjdk.org> wrote:
> Hi all,
>
> The performance of the Vector API's pow operator has decreased by more than 50% for micro-benchmarks such as:
> Double128Vector.POW
> Double256Vector.POW
> DoubleMaxVector.POW
> DoubleScalar.POW
> Float128Vector.POW
> Float256Vector.POW
> FloatMaxVector.POW
>
> Experiments show that svml's pow intrinsics are slow (except for the 512-bit ones).
> So only 512-bit vectors should be intrinsified with svml; intrinsification of pow for the other vector sizes should be disabled.
>
> Here is the effect of this fix.
> Before | After
> ------------------------------------------------------------------------------------------------------------------------------------------
> Benchmark (size) Mode Cnt Score Error Units | Benchmark (size) Mode Cnt Score Error Units
> Double128Vector.POW 1024 thrpt 5 14.895 ± 0.070 ops/ms | Double128Vector.POW 1024 thrpt 5 31.897 ± 0.203 ops/ms
> Double256Vector.POW 1024 thrpt 5 15.650 ± 1.274 ops/ms | Double256Vector.POW 1024 thrpt 5 36.690 ± 2.848 ops/ms
> Double512Vector.POW 1024 thrpt 5 263.472 ± 0.062 ops/ms | Double512Vector.POW 1024 thrpt 5 261.681 ± 13.817 ops/ms
> Double64Vector.POW 1024 thrpt 5 17.881 ± 0.244 ops/ms | Double64Vector.POW 1024 thrpt 5 17.734 ± 0.184 ops/ms
> DoubleMaxVector.POW 1024 thrpt 5 263.613 ± 0.132 ops/ms | DoubleMaxVector.POW 1024 thrpt 5 263.085 ± 0.167 ops/ms
> DoubleScalar.POW 1024 thrpt 5 45.268 ± 0.043 ops/ms | DoubleScalar.POW 1024 thrpt 5 45.220 ± 0.013 ops/ms
> Float128Vector.POW 1024 thrpt 5 13.761 ± 0.092 ops/ms | Float128Vector.POW 1024 thrpt 5 28.578 ± 0.213 ops/ms
> Float256Vector.POW 1024 thrpt 5 13.131 ± 0.101 ops/ms | Float256Vector.POW 1024 thrpt 5 29.414 ± 0.370 ops/ms
> Float512Vector.POW 1024 thrpt 5 624.449 ± 267.160 ops/ms | Float512Vector.POW 1024 thrpt 5 649.519 ± 2.295 ops/ms
> Float64Vector.POW 1024 thrpt 5 10.888 ± 0.069 ops/ms | Float64Vector.POW 1024 thrpt 5 26.376 ± 0.601 ops/ms
> FloatMaxVector.POW 1024 thrpt 5 658.723 ± 2.445 ops/ms | FloatMaxVector.POW 1024 thrpt 5 663.723 ± 2.852 ops/ms
> FloatScalar.POW 1024 thrpt 5 30.682 ± 0.095 ops/ms | FloatScalar.POW 1024 thrpt 5 30.678 ± 0.074 ops/ms
>
> Thanks.
> Best regards,
> Jie
@DamonFool It would be good to fix this in the platform-specific part, as follows:
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index bd2e58b..cc776bd 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -7352,12 +7352,6 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::_vector_atan_double64 = CAST_FROM_FN_PTR(address, __svml_atan1_ha_l9);
StubRoutines::_vector_atan_double128 = CAST_FROM_FN_PTR(address, __svml_atan2_ha_l9);
StubRoutines::_vector_atan_double256 = CAST_FROM_FN_PTR(address, __svml_atan4_ha_l9);
- StubRoutines::_vector_pow_float64 = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
- StubRoutines::_vector_pow_float128 = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
- StubRoutines::_vector_pow_float256 = CAST_FROM_FN_PTR(address, __svml_powf8_ha_l9);
- StubRoutines::_vector_pow_double64 = CAST_FROM_FN_PTR(address, __svml_pow1_ha_l9);
- StubRoutines::_vector_pow_double128 = CAST_FROM_FN_PTR(address, __svml_pow2_ha_l9);
- StubRoutines::_vector_pow_double256 = CAST_FROM_FN_PTR(address, __svml_pow4_ha_l9);
StubRoutines::_vector_hypot_float64 = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
StubRoutines::_vector_hypot_float128 = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
StubRoutines::_vector_hypot_float256 = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_l9);
@@ -7461,12 +7455,6 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::_vector_atan_double64 = CAST_FROM_FN_PTR(address, __svml_atan1_ha_e9);
StubRoutines::_vector_atan_double128 = CAST_FROM_FN_PTR(address, __svml_atan2_ha_e9);
StubRoutines::_vector_atan_double256 = CAST_FROM_FN_PTR(address, __svml_atan4_ha_e9);
- StubRoutines::_vector_pow_float64 = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
- StubRoutines::_vector_pow_float128 = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
- StubRoutines::_vector_pow_float256 = CAST_FROM_FN_PTR(address, __svml_powf8_ha_e9);
- StubRoutines::_vector_pow_double64 = CAST_FROM_FN_PTR(address, __svml_pow1_ha_e9);
- StubRoutines::_vector_pow_double128 = CAST_FROM_FN_PTR(address, __svml_pow2_ha_e9);
- StubRoutines::_vector_pow_double256 = CAST_FROM_FN_PTR(address, __svml_pow4_ha_e9);
StubRoutines::_vector_hypot_float64 = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
StubRoutines::_vector_hypot_float128 = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
StubRoutines::_vector_hypot_float256 = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_e9);
@@ -7551,10 +7539,6 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::_vector_hypot_float128 = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_ex);
StubRoutines::_vector_hypot_double64 = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_ex);
StubRoutines::_vector_hypot_double128 = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_ex);
- StubRoutines::_vector_pow_float64 = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);
- StubRoutines::_vector_pow_float128 = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);
- StubRoutines::_vector_pow_double64 = CAST_FROM_FN_PTR(address, __svml_pow1_ha_ex);
- StubRoutines::_vector_pow_double128 = CAST_FROM_FN_PTR(address, __svml_pow2_ha_ex);
StubRoutines::_vector_cbrt_float64 = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);
StubRoutines::_vector_cbrt_float128 = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);
StubRoutines::_vector_cbrt_double64 = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_ex);
-------------
PR: https://git.openjdk.java.net/panama-vector/pull/42
More information about the panama-dev mailing list