Performance of memory var handles in hot loops

Tue Apr 7 13:55:27 UTC 2020

Can you please paste or link to the code that is doing the memory access?

There are some finicky details that have to be written correctly for C2 
to be able to hoist all the relevant checks. This is due to some 
performance work ongoing in the area. So I'd like to make sure that the 
code you wrote is avoiding the "common traps" when interacting with 
memory access handles.

Thanks
Maurizio

On 07/04/2020 12:57, Antoine Chambille wrote:
> Hi folks,
>
> We are doing preparation work to replace sun.misc.Unsafe with the new
> foreign memory API. The project is an in-memory analytical database.
>
> One microbenchmark gives surprising results: adding a double array into
> another double array. ai = ai + bi for each coordinate i. We test with the
> data stored in java primitive arrays, accessed directly or with array var
> handles, or stored off-heap and accessed either with Unsafe or memory var
> handles. Each time with or without manual loop unrolling.
>
> Here are the results when running on the latest foreign-api branch, on a
> Windows laptop with core i9-8950HK:
>
>
> Benchmark                       Mode  Cnt        Score        Error  Units
> scalarArray                    thrpt    5  5436427.848 ±  71729.532  ops/s
> scalarArrayUnrolled            thrpt    5  7651573.168 ± 289335.078  ops/s
> scalarArrayHandle              thrpt    5  5248177.730 ±  91726.870  ops/s
> scalarArrayHandleUnrolled      thrpt    5  2010681.586 ±  45132.245  ops/s
> scalarArrayLongStride          thrpt    5  1231380.354 ±  47245.382  ops/s
> scalarArrayLongStrideUnrolled  thrpt    5  1681608.204 ±  43972.861  ops/s
> scalarUnsafe                   thrpt    5  1983802.305 ±  79177.977  ops/s
> scalarUnsafeUnrolled           thrpt    5  2991610.868 ±  30353.187  ops/s
> scalarSegment                  thrpt    5   353374.324 ±   5041.406  ops/s
> scalarSegmentUnrolled          thrpt    5    62768.670 ±    834.771  ops/s
>
>
>
> With Java arrays, automatic loop unrolling and automatic vectorization
> deliver good performance. Using var handles preserves that performance
> entirely. When using a long counter for the loop, automatic unrolling and
> vectorization are disabled and performance is divided by four. Manual
> unrolling further improves the performance of arrays except when var
> handles are used.
>
> With Unsafe I think we still have automatic loop unrolling but lose
> automatic vectorization. It's still quite fast, maybe because there are no
> bounds checks.
>
> With memory var handles the performance is an order of magnitude slower
> than the others (two orders with manual unrolling which looks like an
> anomaly). Can this be explained and maybe fixed?
>
>
> Cheers,
> -Antoine
>
>
>
>
>
>
>
> package com.activeviam;
>
> import jdk.incubator.foreign.Foreign;
> import jdk.incubator.foreign.MemoryAddress;
> import jdk.incubator.foreign.MemoryHandles;
> import org.openjdk.jmh.annotations.*;
> import org.openjdk.jmh.runner.Runner;
> import org.openjdk.jmh.runner.options.Options;
> import org.openjdk.jmh.runner.options.OptionsBuilder;
> import sun.misc.Unsafe;
>
> import java.lang.invoke.MethodHandles;
> import java.lang.invoke.VarHandle;
> import java.lang.reflect.Field;
> import java.nio.ByteOrder;
>
> /**
>   * Benchmark the element wise aggregation of an array
>   * of doubles into another array of doubles, using
>   * combinations of  java arrays, byte buffers, standard java code
>   * and the new Vector API.
>   */
> public class AddBenchmark {
>
>      static {
>          System.setProperty("jdk.incubator.foreign.Foreign","permit");
>      }
>      static final Foreign F = Foreign.getInstance();
>
>      static final Unsafe U = getUnsafe();
>      static Unsafe getUnsafe() {
>          try {
>              Field f = Unsafe.class.getDeclaredField("theUnsafe");
>              f.setAccessible(true);
>              return (Unsafe) f.get(null);
>          } catch(Exception e) {
>              throw new RuntimeException(e);
>          }
>      }
>
>      /** Manually launch JMH */
>      public static void main(String[] params) throws Exception {
>          Options opt = new OptionsBuilder()
>              .include(AddBenchmark.class.getSimpleName())
>              .forks(1)
>              .build();
>
>          new Runner(opt).run();
>      }
>
>      final static int SIZE = 1024;
>
>      @State(Scope.Benchmark)
>      public static class Data {
>
>          final double[] inputArray;
>          final double[] outputArray;
>          final long inputAddr;
>          final long outputAddr;
>          final MemoryAddress inputMA;
>          final MemoryAddress outputMA;
>
>
>          public Data() {
>              this.inputArray = new double[SIZE];
>              this.outputArray = new double[SIZE];
>              this.inputAddr = U.allocateMemory(8 * SIZE);
>              this.outputAddr = U.allocateMemory(8 * SIZE);
>              this.inputMA = F.withSize(MemoryAddress.ofLong(inputAddr),
> 8*SIZE);
>              this.outputMA = F.withSize(MemoryAddress.ofLong(outputAddr),
> 8*SIZE);
>          }
>      }
>
>      @Benchmark
>      public void scalarArray(Data state) {
>          final double[] input = state.inputArray;
>          final double[] output = state.outputArray;
>          for(int i = 0; i < SIZE; i++) {
>              output[i] += input[i];
>          }
>      }
>
>      @Benchmark
>      public void scalarArrayUnrolled(Data state) {
>          final double[] input = state.inputArray;
>          final double[] output = state.outputArray;
>          for(int i = 0; i < SIZE; i+=4) {
>              output[i] += input[i];
>              output[i+1] += input[i+1];
>              output[i+2] += input[i+2];
>              output[i+3] += input[i+3];
>          }
>      }
>
>      @Benchmark
>      public void scalarArrayLongStride(Data state) {
>          final double[] input = state.inputArray;
>          final double[] output = state.outputArray;
>
>          // Using a long counter defeats loop unrolling and then vectorization
>          for(long i = 0; i < SIZE; i++) {
>              output[(int)i] += input[(int)i];
>          }
>      }
>
>      @Benchmark
>      public void scalarArrayLongStrideUnrolled(Data state) {
>          final double[] input = state.inputArray;
>          final double[] output = state.outputArray;
>
>          // Using a long counter defeats loop unrolling and then vectorization
>          for(long i = 0; i < SIZE; i+=4) {
>              output[(int)i] += input[(int)i];
>              output[(int)i+1] += input[(int)i+1];
>              output[(int)i+2] += input[(int)i+2];
>              output[(int)i+3] += input[(int)i+3];
>          }
>      }
>
>      static final VarHandle AH =
> MethodHandles.arrayElementVarHandle(double[].class);
>
>      @Benchmark
>      public void scalarArrayHandle(Data state) {
>          final double[] input = state.inputArray;
>          final double[] output = state.outputArray;
>          for(int i = 0; i < input.length; i++) {
>              AH.set(output, i, (double) AH.get(input, i) + (double)
> AH.get(output, i));
>          }
>      }
>
>      @Benchmark
>      public void scalarArrayHandleUnrolled(Data state) {
>          final double[] input = state.inputArray;
>          final double[] output = state.outputArray;
>          for(int i = 0; i < input.length; i+=4) {
>              AH.set(output, i, (double) AH.get(input, i) + (double)
> AH.get(output, i));
>              AH.set(output, i+1, (double) AH.get(input, i+1) + (double)
> AH.get(output, i+1));
>              AH.set(output, i+2, (double) AH.get(input, i+2) + (double)
> AH.get(output, i+2));
>              AH.set(output, i+3, (double) AH.get(input, i+3) + (double)
> AH.get(output, i+3));
>          }
>      }
>
>      @Benchmark
>      public void scalarUnsafe(Data state) {
>          final long inputAddr = state.inputAddr;
>          final long outputAddr = state.outputAddr;
>          for(int i = 0; i < SIZE; i++) {
>              U.putDouble(outputAddr + 8*i, U.getDouble(inputAddr + 8*i) +
> U.getDouble(outputAddr + 8*i));
>          }
>      }
>
>      @Benchmark
>      public void scalarUnsafeUnrolled(Data state) {
>          final long inputAddr = state.inputAddr;
>          final long outputAddr = state.outputAddr;
>          for(int i = 0; i < SIZE; i+=4) {
>              U.putDouble(outputAddr + 8*i, U.getDouble(inputAddr + 8*i) +
> U.getDouble(outputAddr + 8*i));
>              U.putDouble(outputAddr + 8*(i+1), U.getDouble(inputAddr +
> 8*(i+1)) + U.getDouble(outputAddr + 8*(i+1)));
>              U.putDouble(outputAddr + 8*(i+2), U.getDouble(inputAddr +
> 8*(i+2)) + U.getDouble(outputAddr + 8*(i+2)));
>              U.putDouble(outputAddr + 8*(i+3), U.getDouble(inputAddr +
> 8*(i+3)) + U.getDouble(outputAddr + 8*(i+3)));
>          }
>      }
>
>      static final VarHandle MH = MemoryHandles.varHandle(double.class,
> ByteOrder.nativeOrder());
>
>      @Benchmark
>      public void scalarSegment(Data state) {
>          final MemoryAddress ia = state.inputMA;
>          final MemoryAddress oa = state.outputMA;
>          for(int i = 0; i < SIZE; i++) {
>              MH.set(oa.addOffset(8*i),
>                      (double) MH.get(ia.addOffset(8*i)) +
>                              (double) MH.get(oa.addOffset(8*i)));
>          }
>      }
>
>      @Benchmark
>      public void scalarSegmentUnrolled(Data state) {
>          final MemoryAddress ia = state.inputMA;
>          final MemoryAddress oa = state.outputMA;
>          for(int i = 0; i < SIZE; i+=4) {
>              MH.set(oa.addOffset(8*i),
>                      (double) MH.get(ia.addOffset(8*i)) +
>                              (double) MH.get(oa.addOffset(8*i)));
>
>              MH.set(oa.addOffset(8*(i+1)),
>                      (double) MH.get(ia.addOffset(8*(i+1))) +
>                              (double) MH.get(oa.addOffset(8*(i+1))));
>
>              MH.set(oa.addOffset(8*(i+2)),
>                      (double) MH.get(ia.addOffset(8*(i+2))) +
>                              (double) MH.get(oa.addOffset(8*(i+2))));
>
>              MH.set(oa.addOffset(8*(i+3)),
>                      (double) MH.get(ia.addOffset(8*(i+3))) +
>                              (double) MH.get(oa.addOffset(8*(i+3))));
>          }
>      }
>
>
> }