[vector] Perf difference between vector-unstable and vectorIntrinsics

Vladimir Ivanov vladimir.x.ivanov at oracle.com
Mon Jul 6 21:17:02 UTC 2020


Hi Zhuoren,

I haven't investigated what actually causes the difference, but 
seeing the reduceLanes() calls [1] after the loop, I suspect it is caused 
by inlining issues. (You can verify that by looking at the 
-XX:+PrintInlining output.)

In the vectorIntrinsics branch, a recently integrated fix makes 
inlining of vector operations more robust:
   https://hg.openjdk.java.net/panama/dev/rev/5b601a43ac88

Best regards,
Vladimir Ivanov

[1]
 >          float sum = vecSum.reduceLanes(VectorOperators.ADD);
 >          float xSquare = xSquareV.reduceLanes(VectorOperators.ADD);
 >          float ySquare = ySquareV.reduceLanes(VectorOperators.ADD);


On 06.07.2020 14:42, Wang Zhuo(Zhuoren) wrote:
> Hi, I am implementing Cosine Distance using the Vector API, and I found that the performance of my algorithm on vector-unstable is much better than on vectorIntrinsics.
> On vectorIntrinsics:
> normal time used:965
> vector time used:2529
> On vector-unstable:
> normal time used:968
> vector time used:226
> The numbers are times (in ms); smaller is better.
> I wonder whether there are differences between the two branches that cause this perf difference?
> The test code is below, please check.
> Command I used to run:
> java  --add-modules=jdk.incubator.vector  -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0  VectorSimilarity
> 
> import jdk.incubator.vector.*;
> import java.util.Base64;
> import java.util.concurrent.TimeUnit;
> public class VectorSimilarity {
>      static final VectorSpecies<Float> SPECIES256 = FloatVector.SPECIES_256;
>      static final VectorSpecies<Float> SPECIES512 = FloatVector.SPECIES_512;
>      private static String x1 = "L5GSwXhHpEH05mNBHnmcQMTw3EBnagFCW1DGQHe/nUFO1B1BlJOpwCBJ9j" +
>                  "+RkY1BzqKeQSglN0Gy7krB5CSfQFzxB8Djn5nB2KNFwKcSRMGYzRQ7qMGWQZ0FF0FTceDAIKjxv/zhdkHFZMHB6hU4QZbo2cCAryRB+7OOQCxbfEHRtBlBxPG6P0BYSD+Pgz9BqzOLv/nVO8C9x5/BQOY/wTTIx0GfW1BBGv2lQQwdDcGCqBfB12t/QKUBoEEejIXBPN9kQWsFbEGsGcnBkqJkwKhLgr/IQZxAelAWQfcYpcFQv0HBeiGCQWExhEDrKAnBpAwBQV4bVcFpGNjAyDsNQVOc+0CSc4nBgG/ZQQGRccEXts9BKhYzQNK5+MAlU0DBzPGWwPGRCcEZC5/ADxOcv7lUkEBomM5BuqKiwV2MU8HNGHDBSB84QZRSyMB8RZlBVFdZQXSVgcBTQQBCdWa/QBQ0qkGILUW/6NA9QQnkmsG+5PPBj0UowT6nYD9cwpjAS/w5wTbX2UH8Gb5AR/HUQMTNAMJ9MN9AgHoqPbbUyUFbe47BBHANQWZJBsGBuPlBy94EQADeXsG5eOtBnA+yQCRka8EMcGLBjuoRwb4k7sAasB5Bmk/UwaI1akErp6xBq5G5wNo1E8KHa7tB3IiKQTCffcHphK1BTgJzwVY3JEEip/VAlmgXQSeKCsLEABs/n1/xwL5u58CgQY49ahUWQoAJjj1hhqBASXrrQb6nM0H2fY+/thtbQAQobMAohvXAxM3xv7xyqD+MvpDBrlDiQfBvPcGA8X5AQE4SwXhGx7+uLA1AxY8xu2mVjEE7KlFBArveQFNMtUD3N7DB12BbQcyH4cFhSw3Bu5VWQeTW0z9o03TBxtMlQctp/8E/lLVAGUtTwZsGJMKv/R5A1HKVQV6RhsC1Ji5AcXLFQJd6f0HbB+e+ZDi8wV9tQ0FwCN/B+A89v2DrU0Bcpc5BglTeQH5dT0HePS9Al4XPwdA6YEFlueXAbWKSQSBWzkBy2RnCt9Yawl9b77+xgxBC9eCqQd8f0kFoBG9BVxrkQZh2QkHNW/zBEQiawLJEocDhutTA8zEYwbIvEUIO1T9BmlOTwIhbNEDhrtlAVk9BQARQaj89NQNC6usGwDfQrkBSJrlAON7FQQ8FqsEEc/TAY3zeQYsqUEHV8QPBHJoYQQdn5kGyCiJBlDMYQBBNoUFrxbw/NlmPP3B24j6ChIdBXk2bwdxdDMFQw1rA4hybQXTchr8d9wvBuCbLQSMKmMBH4RpBQIXePa5DT8IjgvtBgAetQZgGgMEprc1BAOeSPJ5XpEEMa0NBgX4uwX7XIsG2Ie0688iqQSpJPsCAy9LBAGHkPw==";    private static String x2 = 
"5R3ZwGPrxEFMKyNBLFSeQdYav0BQtDFCur7WQAgRYEGHFYC/MKZtvkiFUT+RNXfBVsGBP2KWSUCmAUTBIf+EQG57kMCtXo7BV1DuwLd98r+YzRQ7qKXNwBMSPUFNQffBPrxeQYw1t7/7JjFAKNaXP+cMSEG6GI5BuEx0wUANDMEvDqdAT9YEworQTEEiVBZBiMejQP7t67+iRwzB3HadQB1be0Ei5g5BMt+cQXvYTUHwZsLAuoy3QfrR6EFrIiHB5X8Dwc8XbUH8Yr8/AvGEwa5GkUH3F5tAP8YJQTiDyz+gKsRAFl/rwDxJuUAPyyxBvg2gQU6bjMEPEa7Bz6wYQpQy7MDF5LvB8HP+QCJdicHQDjpC6RpWQcGeY8FMK6vBoeUjQcPYmUG2QmRBBI0nwScESsGMAcxBvRmawRL2A8IByKNAgTQBQuxdDEGq8JBBHJWmQSBDfz8sLe9BE3gFwTdCPkHEaMxBhX8Xwe7BCcE/783Bt6EHwdpbpkHc5L/BCPzRwUdIQUEd/k3AoGNcQQwNmMEyuKRBtnWlwdCBAUI5Y5DBwOZYvdI+MsEu/ixBnpMrwRtYt8FECytC6JjEQW3RHcBtfn3B+sgQQcyQKcEI5ytByvw2wPZdaUH+aqLAQFQ+QPi4REBF/9lBCvJNQTdlEcIAMbzBtD+hwZWufsAEjus/YRyjwR1YuMHj0ZhBa4w+QORAhMEq9qdB/L8JQrjhyUAJBeBAKqoIQUnAq0GsLFdBkfrvQHc1zMHH6THBeggSwaJIOsAawwBBDDWqPwrAlkBYDqe/maUcQabhwsFF2VBBxY8xu5aMQUFDkHVBKhRRwHhgWsEA5jXBlh9NQVMaT0CWlhTAroaFQRyciUHQlp7BF4trQa8unsE4TfI+9XLJQDNpLcIXLZdAuX2MwShiTsFcQh5BrHMqQVI1+UBWe4fBAzi0wfe11UFAIjq9Y1iAQDxrTsEY6plB/JiXQfjFwkHkYGRBVNOhwCMxtEFbqZTA378WQeA/Sb+FrSXCqlYywtb5SsDcqlZBk1EtQZ/RREHZIxG/kcv8QekDIkHPsDXCBL4VQHN8CMGtNvvAC3YwweUuAkKkJCnANEtVQG9z/0DrwyTBQ9hnwWX3kMEdLB1CvIlKwQ0IO0HK1ErBvdRQQVpjMMCJDI/Bb4X8QYVipEGpG2nBeLGUvmBlBT7ISgRB4iGAQUunkkFDFLm/HNaqPzKTVkCITJG/XzlYwbj0XcGD60PBbpLwQbvrs8Az8RXB4ubxQXh/HEDtXLU/kONrwVBs4MGc2X1BJaHkQd0ByEAKXLJBTq7JwPPkJUGJIIRBlh57wX3FjcC2Ie060Qc6Qal5xcCfqQrCl7edQQ==";
>      static float[] v1 = parseBase64ToVector(x1);
>      static float[] v2 = parseBase64ToVector(x2);
>      public static float[] parseArray(byte[] input) { // Decodes each consecutive group of 4 bytes as one little-endian 32-bit float; returns null when input is null.
>          if (input == null) {
>              return null;
>          }
>          float[] floatArr = new float[input.length / 4]; // integer division: up to 3 trailing bytes are ignored
>          for (int i = 0; i < floatArr.length; i++) {
>              int l;
>              l = input[i << 2]; // byte 0 (least significant); the byte is sign-extended when widened to int
>              l &= 0xff; // clear the sign-extension bits, keeping only bits 0-7
>              l |= ((long) input[(i << 2) + 1] << 8); // byte 1 into bits 8-15; the compound assignment narrows the long back to int
>              l &= 0xffff; // drop sign-extension bits above bit 15
>              l |= ((long) input[(i << 2) + 2] << 16); // byte 2 into bits 16-23
>              l &= 0xffffff; // drop sign-extension bits above bit 23
>              l |= ((long) input[(i << 2) + 3] << 24); // byte 3 (most significant) into bits 24-31; no mask needed after narrowing to int
>              floatArr[i] = Float.intBitsToFloat(l); // reinterpret the assembled 32-bit pattern as a float
>          }
>          return floatArr;
>      }
>      public static float[] parseBase64ToVector(String vectorBase64) { // Base64-decodes the string and converts the resulting bytes to floats via parseArray.
>          return parseArray(Base64.getDecoder().decode(vectorBase64));
>      }    public static float getCosineSimilaritySIMD(float[] queryVector, float[] vector) { // Cosine similarity of the two arrays using SPECIES256 FloatVector operations plus a scalar tail loop. NOTE: also overwrites both input arrays (see intoArray calls below).
>          FloatVector vecX, vecY, vecSum, xSquareV, ySquareV;
>          vecSum = FloatVector.zero(SPECIES256); // per-lane accumulator for x*y
>          xSquareV = FloatVector.zero(SPECIES256); // per-lane accumulator for x*x
>          ySquareV = FloatVector.zero(SPECIES256);; // per-lane accumulator for y*y
>          int i= 0;
>          for (i = 0; i + (SPECIES256.length()) <= queryVector.length; i += SPECIES256.length()) { // runs while a full vector of lanes remains; bound uses queryVector.length only -- assumes vector is at least as long (TODO confirm)
>              vecX = FloatVector.fromArray(SPECIES256, queryVector, i);
>              vecY = FloatVector.fromArray(SPECIES256, vector, i);
>              vecSum = vecX.mul(vecY).add(vecSum); // first accumulation of x*y (separate multiply and add)
>              vecSum = vecX.fma(vecY, vecSum); // second accumulation of the same x*y via fused multiply-add; vecSum now grows by 2*x*y per iteration
>              xSquareV = vecX.fma(vecX, xSquareV); // x*x accumulated via fma ...
>              ySquareV = vecY.fma(vecY, ySquareV); // y*y accumulated via fma ...
>              xSquareV = vecX.mul(vecX).add(xSquareV); // ... and x*x accumulated again via mul+add
>              ySquareV = vecY.mul(vecY).add(ySquareV); // ... and y*y accumulated again via mul+add
>              vecX.intoArray(vector, i); // stores the lanes loaded from queryVector into vector
>              vecY.intoArray(queryVector, i); // stores the lanes loaded from vector into queryVector: together these swap the two arrays' contents over this range on every call
>          }
>          float sum = vecSum.reduceLanes(VectorOperators.ADD); // horizontal add of all lanes
>          float xSquare = xSquareV.reduceLanes(VectorOperators.ADD);
>          float ySquare = ySquareV.reduceLanes(VectorOperators.ADD);
>          for (; i < queryVector.length; i++) { // scalar tail for elements that did not fill a whole vector. NOTE(review): tail terms are added once while vectorized terms were accumulated twice, so the doubling only cancels in the final ratio when there is no tail
>              sum += queryVector[i] * vector[i];
>              xSquare += queryVector[i] * queryVector[i];
>              ySquare += vector[i] * vector[i];
>          }
>          if (ySquare < 1e-8) { // treat a (near-)zero ySquare as similarity 0; xSquare is not checked
>              return 0;
>          }
>          return (float)(sum / Math.sqrt(xSquare * ySquare)); // Math.sqrt works in double; result is narrowed back to float
>      }    public static float getCosineSimilarityScalar(float[] queryVector, float[] vector) { // Scalar reference implementation of the same cosine similarity; does not modify its inputs.
>          float sum = 0;
>          float xSquare = 0;
>          float ySquare = 0;
>          for (int i = 0; i < queryVector.length; i++) { // bound uses queryVector.length only -- assumes vector is at least as long (TODO confirm)
>              //queryVector[i] = vector[i];
>              sum += (float)(queryVector[i] * vector[i]); // the (float) cast is redundant: float * float is already float
>              xSquare += (float)(queryVector[i] * queryVector[i]);
>              ySquare += (float)(vector[i] * vector[i]);
>          }
>          if (ySquare < 1e-8) { // treat a (near-)zero ySquare as similarity 0; xSquare is not checked
>              return 0;
>          }
>          return (float)(sum / Math.sqrt(xSquare * ySquare));
>      }
>      public static void main(String[] args) { // Benchmark driver: warm-up, print both results once, then time 2,000,000 calls of each implementation.
>          long t1, t2;
>          for (int i = 0; i < 100000; i++) { // warm-up: calls both implementations 100,000 times before any timing starts
>              getCosineSimilaritySIMD(v1, v2);
>              getCosineSimilarityScalar(v1, v2);
>          }
>          System.out.println("normal result " + getCosineSimilarityScalar(v1, v2) + " vec result " + getCosineSimilaritySIMD(v1, v2)); // prints both results side by side for a visual comparison
>          t1 = System.currentTimeMillis();
>          for (int i = 0; i < 2000000; i++) { // timed scalar run; return values are discarded
>              getCosineSimilarityScalar(v1, v2);
>          }
>          System.out.println("normal time used:" + (System.currentTimeMillis() - t1)); // elapsed wall-clock milliseconds
>          t2 = System.currentTimeMillis();
>          for (int i = 0; i < 2000000; i++) { // timed vector run; return values are discarded
>              getCosineSimilaritySIMD(v1, v2);
>          }
>          System.out.println("vector time used:" + (System.currentTimeMillis() - t2)); // elapsed wall-clock milliseconds
>      }
> }
> 
> Regards,
> Zhuoren
> 
> 
> 
> 
> 
> 


More information about the panama-dev mailing list