Performance of memory var handles in hot loops

Tue Apr 7 17:35:39 UTC 2020

So the performance for this use case is indeed better with indexed var
handles, but still several times slower than arrays, array handles or
unsafe.

Anecdotally, manually unrolling the loop improves performance with
direct arrays and unsafe, but reduces performance with var handles.

Benchmark                            Mode  Cnt        Score        Error  Units
scalarIndexedMemoryHandle           thrpt    5   861165.702 ±  24881.228  ops/s
scalarIndexedMemoryHandleUnrolled   thrpt    5   710100.700 ±  10745.695  ops/s
scalarArray                         thrpt    5  5355842.947 ± 156916.658  ops/s
scalarArrayUnrolled                 thrpt    5  7201839.924 ± 187685.786  ops/s
scalarArrayHandle                   thrpt    5  5170506.272 ± 103758.960  ops/s
scalarArrayHandleUnrolled           thrpt    5  1986432.326 ±  41820.975  ops/s
scalarUnsafe                        thrpt    5  1937789.077 ±  27491.449  ops/s
scalarUnsafeUnrolled                thrpt    5  3026376.816 ± 530965.111  ops/s

-Antoine

package com.activeviam;

import jdk.incubator.foreign.*;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import sun.misc.Unsafe;

import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.lang.reflect.Field;
import java.nio.ByteOrder;

/**
 * JMH benchmark of the element-wise aggregation of one sequence of
 * doubles into another ({@code output[i] += input[i]}).
 *
 * <p>The same loop is implemented with plain java arrays, an array
 * element {@link VarHandle}, {@code sun.misc.Unsafe} on off-heap memory
 * and an indexed memory-access {@link VarHandle} on off-heap memory.
 * Every flavour exists in a straight version and in a version manually
 * unrolled four times.
 */
public class AddBenchmark {

    // This static block is declared before F on purpose: static initializers
    // run in textual order, so the property is set before Foreign.getInstance()
    // is called. NOTE(review): presumably this property is what grants access
    // to the restricted Foreign API - confirm against the JDK build in use.
    static {
        System.setProperty("jdk.incubator.foreign.Foreign","permit");
    }
    static final Foreign F = Foreign.getInstance();

    static final Unsafe U = getUnsafe();

    /**
     * Reflectively reads the private static {@code theUnsafe} field of
     * {@link Unsafe}.
     *
     * @return the {@link Unsafe} instance stored in that field
     * @throws IllegalStateException if the field cannot be found or read
     */
    static Unsafe getUnsafe() {
        try {
            Field f = Unsafe.class.getDeclaredField("theUnsafe");
            f.setAccessible(true);
            return (Unsafe) f.get(null);
        } catch (ReflectiveOperationException e) {
            throw new IllegalStateException("Unable to access sun.misc.Unsafe", e);
        }
    }

    /** Manually launch JMH */
    public static void main(String[] params) throws Exception {
        Options opt = new OptionsBuilder()
            .include(AddBenchmark.class.getSimpleName())
            .forks(1)
            .warmupIterations(5)
            .measurementIterations(5)
            .build();

        new Runner(opt).run();
    }

    /**
     * Number of doubles aggregated by each benchmark invocation.
     * The unrolled benchmarks step by 4 without a remainder loop,
     * so this must stay a multiple of 4.
     */
    static final int SIZE = 1024;

    /** Size in bytes of one off-heap buffer holding {@link #SIZE} doubles. */
    static final int SIZE_IN_BYTES = Double.BYTES * SIZE;

    /**
     * Benchmark state: one input and one output buffer for each storage
     * flavour (heap arrays, raw off-heap addresses, and sized
     * {@link MemoryAddress} views over those same off-heap addresses).
     */
    @State(Scope.Benchmark)
    public static class Data {

        final double[] inputArray;
        final double[] outputArray;
        final long inputAddress;
        final long outputAddress;
        final MemoryAddress inputMA;
        final MemoryAddress outputMA;

        public Data() {
            this.inputArray = new double[SIZE];
            this.outputArray = new double[SIZE];
            this.inputAddress = allocateZeroed(SIZE_IN_BYTES);
            this.outputAddress = allocateZeroed(SIZE_IN_BYTES);
            this.inputMA = F.withSize(MemoryAddress.ofLong(inputAddress), SIZE_IN_BYTES);
            this.outputMA = F.withSize(MemoryAddress.ofLong(outputAddress), SIZE_IN_BYTES);
        }

        /**
         * Allocates off-heap memory and fills it with zero bytes.
         *
         * <p>Memory returned by {@code Unsafe.allocateMemory} is
         * uninitialized. Zeroing it gives the off-heap benchmarks the same
         * operands as the (zero-initialized) heap arrays, instead of
         * arbitrary bit patterns that may decode to NaN or subnormal values.
         *
         * @param bytes number of bytes to allocate
         * @return the base address of the zeroed allocation
         */
        private static long allocateZeroed(long bytes) {
            final long address = U.allocateMemory(bytes);
            U.setMemory(address, bytes, (byte) 0);
            return address;
        }

        /** Frees the off-heap buffers once the benchmark is done with this state. */
        @TearDown
        public void release() {
            U.freeMemory(inputAddress);
            U.freeMemory(outputAddress);
        }
    }

    /** Plain array access, one element per iteration. */
    @Benchmark
    public void scalarArray(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for(int i = 0; i < SIZE; i++) {
            output[i] += input[i];
        }
    }

    /** Plain array access, four elements per iteration. */
    @Benchmark
    public void scalarArrayUnrolled(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for(int i = 0; i < SIZE; i+=4) {
            output[i] += input[i];
            output[i+1] += input[i+1];
            output[i+2] += input[i+2];
            output[i+3] += input[i+3];
        }
    }

    /** Var handle giving access to the elements of a {@code double[]}. */
    static final VarHandle AH = MethodHandles.arrayElementVarHandle(double[].class);

    /**
     * Array access through {@link #AH}, one element per iteration.
     * Unlike {@link #scalarArray}, the loop bound is {@code input.length}
     * rather than {@link #SIZE}.
     */
    @Benchmark
    public void scalarArrayHandle(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for(int i = 0; i < input.length; i++) {
            AH.set(output, i, (double) AH.get(input, i) + (double) AH.get(output, i));
        }
    }

    /** Array access through {@link #AH}, four elements per iteration. */
    @Benchmark
    public void scalarArrayHandleUnrolled(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for(int i = 0; i < input.length; i+=4) {
            AH.set(output, i, (double) AH.get(input, i) + (double) AH.get(output, i));
            AH.set(output, i+1, (double) AH.get(input, i+1) + (double) AH.get(output, i+1));
            AH.set(output, i+2, (double) AH.get(input, i+2) + (double) AH.get(output, i+2));
            AH.set(output, i+3, (double) AH.get(input, i+3) + (double) AH.get(output, i+3));
        }
    }

    /**
     * Off-heap access through {@code Unsafe}, one element per iteration.
     * The byte offset {@code 8*i} is computed in int arithmetic and then
     * widened when added to the long base address.
     */
    @Benchmark
    public void scalarUnsafe(Data state) {
        final long ia = state.inputAddress;
        final long oa = state.outputAddress;
        for(int i = 0; i < SIZE; i++) {
            U.putDouble(oa + 8*i, U.getDouble(ia + 8*i) + U.getDouble(oa + 8*i));
        }
    }

    /** Off-heap access through {@code Unsafe}, four elements per iteration. */
    @Benchmark
    public void scalarUnsafeUnrolled(Data state) {
        final long ia = state.inputAddress;
        final long oa = state.outputAddress;
        for(int i = 0; i < SIZE; i+=4) {
            U.putDouble(oa + 8*i, U.getDouble(ia + 8*i) + U.getDouble(oa + 8*i));
            U.putDouble(oa + 8*(i+1), U.getDouble(ia + 8*(i+1)) + U.getDouble(oa + 8*(i+1)));
            U.putDouble(oa + 8*(i+2), U.getDouble(ia + 8*(i+2)) + U.getDouble(oa + 8*(i+2)));
            U.putDouble(oa + 8*(i+3), U.getDouble(ia + 8*(i+3)) + U.getDouble(oa + 8*(i+3)));
        }
    }

    /**
     * Var handle over a sequence layout of java doubles; it takes a
     * {@link MemoryAddress} and a {@code long} element index.
     */
    static final VarHandle IH = MemoryLayout.ofSequence(MemoryLayouts.JAVA_DOUBLE)
            .varHandle(double.class, MemoryLayout.PathElement.sequenceElement());

    /** Off-heap access through the indexed handle {@link #IH}, one element per iteration. */
    @Benchmark
    public void scalarIndexedMemoryHandle(Data state) {
        final MemoryAddress ia = state.inputMA;
        final MemoryAddress oa = state.outputMA;

        for(int i = 0; i < SIZE; i++) {
            IH.set(oa, (long) i, (double) IH.get(ia, (long) i) + (double) IH.get(oa, (long) i));
        }
    }

    /** Off-heap access through the indexed handle {@link #IH}, four elements per iteration. */
    @Benchmark
    public void scalarIndexedMemoryHandleUnrolled(Data state) {
        final MemoryAddress ia = state.inputMA;
        final MemoryAddress oa = state.outputMA;

        for(int i = 0; i < SIZE; i+=4) {
            IH.set(oa, (long) i, (double) IH.get(ia, (long) i) + (double) IH.get(oa, (long) i));
            IH.set(oa, (long) (i+1), (double) IH.get(ia, (long) (i+1)) + (double) IH.get(oa, (long) (i+1)));
            IH.set(oa, (long) (i+2), (double) IH.get(ia, (long) (i+2)) + (double) IH.get(oa, (long) (i+2)));
            IH.set(oa, (long) (i+3), (double) IH.get(ia, (long) (i+3)) + (double) IH.get(oa, (long) (i+3)));
        }
    }

}

On Tue, Apr 7, 2020 at 6:34 PM Antoine Chambille <ach at activeviam.com> wrote:

> Thank you guys, I thought MemoryAddress::addOffset was the optimized case.
>
> Let me try with an indexed var handle.
>
> -Antoine
>
>
>
> On Tue, Apr 7, 2020 at 4:07 PM Maurizio Cimadamore <
> maurizio.cimadamore at oracle.com> wrote:
>
>>
>> On 07/04/2020 15:04, Maurizio Cimadamore wrote:
>> > P.S.
>> >
>> > I'm also pretty sure that, while the code above can match Unsafe for
>> > 'int' carriers, the alignment check introduced for other carriers
>> > might cause some performance degradation. That's another performance
>> > pothole we're aware of.
>>
>> This is not 100% correct - optimizations should work correctly for all
>> carriers, assuming you use VarHandle::get or VarHandle::set. All other
>> VarHandle access primitives will add extra alignment checks which might
>> deteriorate performances.
>>
>> Maurizio
>>
>>
>