Performance of memory var handles in hot loops
Antoine Chambille
ach at activeviam.com
Tue Apr 7 17:35:39 UTC 2020
So the performance for this use case is indeed better with indexed var
handles, but still several times slower than arrays, array handles or
unsafe.
Anecdotally, manually unrolling the loop improves performance with
direct arrays and unsafe, but reduces performance for var handles.
Benchmark                           Mode  Cnt        Score        Error  Units
scalarIndexedMemoryHandle          thrpt    5   861165.702 ±  24881.228  ops/s
scalarIndexedMemoryHandleUnrolled  thrpt    5   710100.700 ±  10745.695  ops/s
scalarArray                        thrpt    5  5355842.947 ± 156916.658  ops/s
scalarArrayUnrolled                thrpt    5  7201839.924 ± 187685.786  ops/s
scalarArrayHandle                  thrpt    5  5170506.272 ± 103758.960  ops/s
scalarArrayHandleUnrolled          thrpt    5  1986432.326 ±  41820.975  ops/s
scalarUnsafe                       thrpt    5  1937789.077 ±  27491.449  ops/s
scalarUnsafeUnrolled               thrpt    5  3026376.816 ± 530965.111  ops/s
-Antoine
package com.activeviam;
import jdk.incubator.foreign.*;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import sun.misc.Unsafe;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.lang.reflect.Field;
import java.nio.ByteOrder;
/**
 * JMH benchmark of the element-wise aggregation of one sequence of
 * {@value #SIZE} doubles into another ({@code output[i] += input[i]}).
 *
 * <p>The same loop is implemented with four access mechanisms, each in a
 * plain and a manually 4x-unrolled variant:
 * <ul>
 *   <li>direct Java array indexing,</li>
 *   <li>an array-element {@link VarHandle},</li>
 *   <li>{@code sun.misc.Unsafe} on raw native addresses,</li>
 *   <li>an indexed memory-access {@link VarHandle} from
 *       {@code jdk.incubator.foreign}.</li>
 * </ul>
 *
 * <p>All buffers start out zero-filled so that every variant performs the
 * same floating-point work.
 */
public class AddBenchmark {

    // Set before F is initialized below. Presumably required to permit use
    // of the restricted Foreign API -- confirm against the JDK build in use.
    static {
        System.setProperty("jdk.incubator.foreign.Foreign", "permit");
    }

    /** Entry point to the incubating foreign-memory API. */
    static final Foreign F = Foreign.getInstance();

    /** The {@code Unsafe} singleton, obtained reflectively. */
    static final Unsafe U = getUnsafe();

    /** Number of double elements in every input and output buffer. */
    static final int SIZE = 1024;

    /** Size in bytes of one buffer of {@link #SIZE} doubles. */
    static final int BYTE_SIZE = Double.BYTES * SIZE;

    /**
     * Reads the private static {@code theUnsafe} field of {@link Unsafe}.
     *
     * @return the {@code Unsafe} instance
     * @throws IllegalStateException if the field cannot be found or read
     */
    static Unsafe getUnsafe() {
        try {
            Field f = Unsafe.class.getDeclaredField("theUnsafe");
            f.setAccessible(true);
            return (Unsafe) f.get(null);
        } catch (ReflectiveOperationException e) {
            throw new IllegalStateException("Unable to access sun.misc.Unsafe.theUnsafe", e);
        }
    }

    /**
     * Manually launch JMH: one fork, five warmup and five measurement
     * iterations, restricted to the benchmarks of this class.
     *
     * @param params ignored
     * @throws Exception if the JMH runner fails
     */
    public static void main(String[] params) throws Exception {
        Options opt = new OptionsBuilder()
                .include(AddBenchmark.class.getSimpleName())
                .forks(1)
                .warmupIterations(5)
                .measurementIterations(5)
                .build();
        new Runner(opt).run();
    }

    /**
     * Benchmark-scoped state holding one input and one output buffer for
     * each storage kind: heap arrays, raw native addresses, and
     * {@link MemoryAddress} views over those same native addresses.
     *
     * <p>The native buffers are not freed here; they live until the JVM exits.
     */
    @State(Scope.Benchmark)
    public static class Data {
        final double[] inputArray;
        final double[] outputArray;
        final long inputAddress;
        final long outputAddress;
        final MemoryAddress inputMA;
        final MemoryAddress outputMA;

        public Data() {
            this.inputArray = new double[SIZE];
            this.outputArray = new double[SIZE];
            this.inputAddress = allocateZeroed(BYTE_SIZE);
            this.outputAddress = allocateZeroed(BYTE_SIZE);
            // The MemoryAddress views alias the native buffers allocated above.
            this.inputMA = F.withSize(MemoryAddress.ofLong(inputAddress), BYTE_SIZE);
            this.outputMA = F.withSize(MemoryAddress.ofLong(outputAddress), BYTE_SIZE);
        }

        /**
         * Allocates {@code bytes} bytes of native memory and fills them with zeros.
         *
         * <p>{@code Unsafe.allocateMemory} returns uninitialized memory. Without
         * the fill, the native benchmarks would add arbitrary bit patterns
         * (possibly NaN or denormal doubles) while the array benchmarks add
         * zeros, so the variants would not be measured on equivalent data.
         *
         * @param bytes number of bytes to allocate
         * @return the base address of the zero-filled allocation
         */
        private static long allocateZeroed(long bytes) {
            final long address = U.allocateMemory(bytes);
            U.setMemory(address, bytes, (byte) 0);
            return address;
        }
    }

    /** Baseline: direct array indexing. */
    @Benchmark
    public void scalarArray(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for (int i = 0; i < SIZE; i++) {
            output[i] += input[i];
        }
    }

    /** Direct array indexing, manually unrolled four times. */
    @Benchmark
    public void scalarArrayUnrolled(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for (int i = 0; i < SIZE; i += 4) {
            output[i] += input[i];
            output[i + 1] += input[i + 1];
            output[i + 2] += input[i + 2];
            output[i + 3] += input[i + 3];
        }
    }

    /** Var handle over the elements of a {@code double[]}. */
    static final VarHandle AH =
            MethodHandles.arrayElementVarHandle(double[].class);

    /**
     * Array access through the array-element var handle {@link #AH}.
     * Note that this loop is bounded by {@code input.length}, not {@link #SIZE}.
     */
    @Benchmark
    public void scalarArrayHandle(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for (int i = 0; i < input.length; i++) {
            AH.set(output, i, (double) AH.get(input, i) + (double) AH.get(output, i));
        }
    }

    /**
     * Array access through {@link #AH}, manually unrolled four times.
     * Bounded by {@code input.length}, like {@link #scalarArrayHandle}.
     */
    @Benchmark
    public void scalarArrayHandleUnrolled(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for (int i = 0; i < input.length; i += 4) {
            AH.set(output, i, (double) AH.get(input, i) + (double) AH.get(output, i));
            AH.set(output, i + 1, (double) AH.get(input, i + 1) + (double) AH.get(output, i + 1));
            AH.set(output, i + 2, (double) AH.get(input, i + 2) + (double) AH.get(output, i + 2));
            AH.set(output, i + 3, (double) AH.get(input, i + 3) + (double) AH.get(output, i + 3));
        }
    }

    /** Native memory access through {@code Unsafe}, at byte offset {@code 8*i}. */
    @Benchmark
    public void scalarUnsafe(Data state) {
        final long ia = state.inputAddress;
        final long oa = state.outputAddress;
        for (int i = 0; i < SIZE; i++) {
            U.putDouble(oa + 8 * i, U.getDouble(ia + 8 * i) + U.getDouble(oa + 8 * i));
        }
    }

    /** Native memory access through {@code Unsafe}, manually unrolled four times. */
    @Benchmark
    public void scalarUnsafeUnrolled(Data state) {
        final long ia = state.inputAddress;
        final long oa = state.outputAddress;
        for (int i = 0; i < SIZE; i += 4) {
            U.putDouble(oa + 8 * i, U.getDouble(ia + 8 * i) + U.getDouble(oa + 8 * i));
            U.putDouble(oa + 8 * (i + 1), U.getDouble(ia + 8 * (i + 1)) + U.getDouble(oa + 8 * (i + 1)));
            U.putDouble(oa + 8 * (i + 2), U.getDouble(ia + 8 * (i + 2)) + U.getDouble(oa + 8 * (i + 2)));
            U.putDouble(oa + 8 * (i + 3), U.getDouble(ia + 8 * (i + 3)) + U.getDouble(oa + 8 * (i + 3)));
        }
    }

    /**
     * Indexed memory-access var handle over a sequence of doubles; its
     * coordinates are a {@link MemoryAddress} and a {@code long} element index.
     */
    static final VarHandle IH =
            MemoryLayout.ofSequence(MemoryLayouts.JAVA_DOUBLE)
                    .varHandle(double.class, MemoryLayout.PathElement.sequenceElement());

    /** Native memory access through the indexed memory var handle {@link #IH}. */
    @Benchmark
    public void scalarIndexedMemoryHandle(Data state) {
        final MemoryAddress ia = state.inputMA;
        final MemoryAddress oa = state.outputMA;
        for (int i = 0; i < SIZE; i++) {
            // The index is passed as a long to match the handle's coordinate type.
            IH.set(oa, (long) i, (double) IH.get(ia, (long) i) + (double) IH.get(oa, (long) i));
        }
    }

    /** Native memory access through {@link #IH}, manually unrolled four times. */
    @Benchmark
    public void scalarIndexedMemoryHandleUnrolled(Data state) {
        final MemoryAddress ia = state.inputMA;
        final MemoryAddress oa = state.outputMA;
        for (int i = 0; i < SIZE; i += 4) {
            IH.set(oa, (long) i, (double) IH.get(ia, (long) i) + (double) IH.get(oa, (long) i));
            IH.set(oa, (long) (i + 1), (double) IH.get(ia, (long) (i + 1)) + (double) IH.get(oa, (long) (i + 1)));
            IH.set(oa, (long) (i + 2), (double) IH.get(ia, (long) (i + 2)) + (double) IH.get(oa, (long) (i + 2)));
            IH.set(oa, (long) (i + 3), (double) IH.get(ia, (long) (i + 3)) + (double) IH.get(oa, (long) (i + 3)));
        }
    }
}
On Tue, Apr 7, 2020 at 6:34 PM Antoine Chambille <ach at activeviam.com> wrote:
> Thank you guys, I thought MemoryAddress::addOffset was the optimized case.
>
> Let me try with an indexed var handle.
>
> -Antoine
>
>
>
> On Tue, Apr 7, 2020 at 4:07 PM Maurizio Cimadamore <
> maurizio.cimadamore at oracle.com> wrote:
>
>>
>> On 07/04/2020 15:04, Maurizio Cimadamore wrote:
>> > P.S.
>> >
>> > I'm also pretty sure that, while the code above can match Unsafe for
>> > 'int' carriers, the alignment check introduced for other carriers
>> > might cause some performance degradation. That's another performance
>> > pothole we're aware of.
>>
>> This is not 100% correct - optimizations should work correctly for all
>> carriers, assuming you use VarHandle::get or VarHandle::set. All other
>> VarHandle access primitives will add extra alignment checks which might
>> deteriorate performances.
>>
>> Maurizio
>>
>>
>
More information about the panama-dev
mailing list