Performance of memory var handles in hot loops

Tue Apr 7 11:57:04 UTC 2020

Hi folks,

We are doing preparation work to replace sun.misc.Unsafe with the new
foreign memory API. The project is an in-memory analytical database.

One microbenchmark gives surprising results: adding a double array into
another double array. ai = ai + bi for each coordinate i. We test with the
data stored in java primitive arrays, accessed directly or with array var
handles, or stored off-heap and accessed either with Unsafe or memory var
handles. Each time with or without manual loop unrolling.

Here are the results when running on the latest foreign-api branch, on a
Windows laptop with core i9-8950HK:

Benchmark                       Mode  Cnt        Score        Error  Units
scalarArray                    thrpt    5  5436427.848 ±  71729.532  ops/s
scalarArrayUnrolled            thrpt    5  7651573.168 ± 289335.078  ops/s
scalarArrayHandle              thrpt    5  5248177.730 ±  91726.870  ops/s
scalarArrayHandleUnrolled      thrpt    5  2010681.586 ±  45132.245  ops/s
scalarArrayLongStride          thrpt    5  1231380.354 ±  47245.382  ops/s
scalarArrayLongStrideUnrolled  thrpt    5  1681608.204 ±  43972.861  ops/s
scalarUnsafe                   thrpt    5  1983802.305 ±  79177.977  ops/s
scalarUnsafeUnrolled           thrpt    5  2991610.868 ±  30353.187  ops/s
scalarSegment                  thrpt    5   353374.324 ±   5041.406  ops/s
scalarSegmentUnrolled          thrpt    5    62768.670 ±    834.771  ops/s

With java arrays, automatic loop unrolling and automatic vectorization
deliver good performance. Using var handles preserves that performance
entirely. When using a long counter for the loop, automatic unrolling and
vectorization are disabled and performance is divided by four. Manual
unrolling further improves the performance of arrays except when var
handles are used.

With Unsafe I think we still have automatic loop unrolling but lose
automatic vectorization. It's still quite fast, maybe because there is zero
boundary check.

With memory var handles the performance is an order of magnitude slower
than the others (two orders with manual unrolling which looks like an
anomaly). Can this be explained and maybe fixed?

Cheers,
-Antoine

package com.activeviam;

import jdk.incubator.foreign.Foreign;
import jdk.incubator.foreign.MemoryAddress;
import jdk.incubator.foreign.MemoryHandles;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import sun.misc.Unsafe;

import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.lang.reflect.Field;
import java.nio.ByteOrder;

/**
 * Benchmark the element wise aggregation of an array
 * of doubles into another array of doubles, using
 * combinations of  java arrays, byte buffers, standard java code
 * and the new Vector API.
 */
public class AddBenchmark {

    static {
        System.setProperty("jdk.incubator.foreign.Foreign","permit");
    }
    static final Foreign F = Foreign.getInstance();

    static final Unsafe U = getUnsafe();
    static Unsafe getUnsafe() {
        try {
            Field f = Unsafe.class.getDeclaredField("theUnsafe");
            f.setAccessible(true);
            return (Unsafe) f.get(null);
        } catch(Exception e) {
            throw new RuntimeException(e);
        }
    }

    /** Manually launch JMH */
    public static void main(String[] params) throws Exception {
        Options opt = new OptionsBuilder()
            .include(AddBenchmark.class.getSimpleName())
            .forks(1)
            .build();

        new Runner(opt).run();
    }

    final static int SIZE = 1024;

    @State(Scope.Benchmark)
    public static class Data {

        final double[] inputArray;
        final double[] outputArray;
        final long inputAddr;
        final long outputAddr;
        final MemoryAddress inputMA;
        final MemoryAddress outputMA;

        public Data() {
            this.inputArray = new double[SIZE];
            this.outputArray = new double[SIZE];
            this.inputAddr = U.allocateMemory(8 * SIZE);
            this.outputAddr = U.allocateMemory(8 * SIZE);
            this.inputMA = F.withSize(MemoryAddress.ofLong(inputAddr),
8*SIZE);
            this.outputMA = F.withSize(MemoryAddress.ofLong(outputAddr),
8*SIZE);
        }
    }

    @Benchmark
    public void scalarArray(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for(int i = 0; i < SIZE; i++) {
            output[i] += input[i];
        }
    }

    @Benchmark
    public void scalarArrayUnrolled(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for(int i = 0; i < SIZE; i+=4) {
            output[i] += input[i];
            output[i+1] += input[i+1];
            output[i+2] += input[i+2];
            output[i+3] += input[i+3];
        }
    }

    @Benchmark
    public void scalarArrayLongStride(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;

        // Using a long counter defeats loop unrolling and then
vectorization
        for(long i = 0; i < SIZE; i++) {
            output[(int)i] += input[(int)i];
        }
    }

    @Benchmark
    public void scalarArrayLongStrideUnrolled(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;

        // Using a long counter defeats loop unrolling and then
vectorization
        for(long i = 0; i < SIZE; i+=4) {
            output[(int)i] += input[(int)i];
            output[(int)i+1] += input[(int)i+1];
            output[(int)i+2] += input[(int)i+2];
            output[(int)i+3] += input[(int)i+3];
        }
    }

    static final VarHandle AH =
MethodHandles.arrayElementVarHandle(double[].class);

    @Benchmark
    public void scalarArrayHandle(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for(int i = 0; i < input.length; i++) {
            AH.set(output, i, (double) AH.get(input, i) + (double)
AH.get(output, i));
        }
    }

    @Benchmark
    public void scalarArrayHandleUnrolled(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for(int i = 0; i < input.length; i+=4) {
            AH.set(output, i, (double) AH.get(input, i) + (double)
AH.get(output, i));
            AH.set(output, i+1, (double) AH.get(input, i+1) + (double)
AH.get(output, i+1));
            AH.set(output, i+2, (double) AH.get(input, i+2) + (double)
AH.get(output, i+2));
            AH.set(output, i+3, (double) AH.get(input, i+3) + (double)
AH.get(output, i+3));
        }
    }

    @Benchmark
    public void scalarUnsafe(Data state) {
        final long inputAddr = state.inputAddr;
        final long outputAddr = state.outputAddr;
        for(int i = 0; i < SIZE; i++) {
            U.putDouble(outputAddr + 8*i, U.getDouble(inputAddr + 8*i) +
U.getDouble(outputAddr + 8*i));
        }
    }

    @Benchmark
    public void scalarUnsafeUnrolled(Data state) {
        final long inputAddr = state.inputAddr;
        final long outputAddr = state.outputAddr;
        for(int i = 0; i < SIZE; i+=4) {
            U.putDouble(outputAddr + 8*i, U.getDouble(inputAddr + 8*i) +
U.getDouble(outputAddr + 8*i));
            U.putDouble(outputAddr + 8*(i+1), U.getDouble(inputAddr +
8*(i+1)) + U.getDouble(outputAddr + 8*(i+1)));
            U.putDouble(outputAddr + 8*(i+2), U.getDouble(inputAddr +
8*(i+2)) + U.getDouble(outputAddr + 8*(i+2)));
            U.putDouble(outputAddr + 8*(i+3), U.getDouble(inputAddr +
8*(i+3)) + U.getDouble(outputAddr + 8*(i+3)));
        }
    }

    static final VarHandle MH = MemoryHandles.varHandle(double.class,
ByteOrder.nativeOrder());

    @Benchmark
    public void scalarSegment(Data state) {
        final MemoryAddress ia = state.inputMA;
        final MemoryAddress oa = state.outputMA;
        for(int i = 0; i < SIZE; i++) {
            MH.set(oa.addOffset(8*i),
                    (double) MH.get(ia.addOffset(8*i)) +
                            (double) MH.get(oa.addOffset(8*i)));
        }
    }

    @Benchmark
    public void scalarSegmentUnrolled(Data state) {
        final MemoryAddress ia = state.inputMA;
        final MemoryAddress oa = state.outputMA;
        for(int i = 0; i < SIZE; i+=4) {
            MH.set(oa.addOffset(8*i),
                    (double) MH.get(ia.addOffset(8*i)) +
                            (double) MH.get(oa.addOffset(8*i)));

            MH.set(oa.addOffset(8*(i+1)),
                    (double) MH.get(ia.addOffset(8*(i+1))) +
                            (double) MH.get(oa.addOffset(8*(i+1))));

            MH.set(oa.addOffset(8*(i+2)),
                    (double) MH.get(ia.addOffset(8*(i+2))) +
                            (double) MH.get(oa.addOffset(8*(i+2))));

            MH.set(oa.addOffset(8*(i+3)),
                    (double) MH.get(ia.addOffset(8*(i+3))) +
                            (double) MH.get(oa.addOffset(8*(i+3))));
        }
    }

}