Performance of memory var handles in hot loops
Antoine Chambille
ach at activeviam.com
Tue Apr 7 11:57:04 UTC 2020
Hi folks,
We are doing preparation work to replace sun.misc.Unsafe with the new
foreign memory API. The project is an in-memory analytical database.
One microbenchmark gives surprising results: adding a double array into
another double array. ai = ai + bi for each coordinate i. We test with the
data stored in java primitive arrays, accessed directly or with array var
handles, or stored off-heap and accessed either with Unsafe or memory var
handles. Each time with or without manual loop unrolling.
Here are the results when running on the latest foreign-api branch, on a
Windows laptop with core i9-8950HK:
Benchmark Mode Cnt Score Error Units
scalarArray thrpt 5 5436427.848 ± 71729.532 ops/s
scalarArrayUnrolled thrpt 5 7651573.168 ± 289335.078 ops/s
scalarArrayHandle thrpt 5 5248177.730 ± 91726.870 ops/s
scalarArrayHandleUnrolled thrpt 5 2010681.586 ± 45132.245 ops/s
scalarArrayLongStride thrpt 5 1231380.354 ± 47245.382 ops/s
scalarArrayLongStrideUnrolled thrpt 5 1681608.204 ± 43972.861 ops/s
scalarUnsafe thrpt 5 1983802.305 ± 79177.977 ops/s
scalarUnsafeUnrolled thrpt 5 2991610.868 ± 30353.187 ops/s
scalarSegment thrpt 5 353374.324 ± 5041.406 ops/s
scalarSegmentUnrolled thrpt 5 62768.670 ± 834.771 ops/s
With java arrays, automatic loop unrolling and automatic vectorization
deliver good performance. Using var handles preserves that performance
entirely. When using a long counter for the loop, automatic unrolling and
vectorization are disabled and performance is divided by four. Manual
unrolling further improves the performance of arrays except when var
handles are used.
With Unsafe I think we still have automatic loop unrolling but lose
automatic vectorization. It's still quite fast, maybe because there is zero
boundary check.
With memory var handles the performance is an order of magnitude slower
than the others (two orders with manual unrolling which looks like an
anomaly). Can this be explained and maybe fixed?
Cheers,
-Antoine
package com.activeviam;
import jdk.incubator.foreign.Foreign;
import jdk.incubator.foreign.MemoryAddress;
import jdk.incubator.foreign.MemoryHandles;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import sun.misc.Unsafe;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.lang.reflect.Field;
import java.nio.ByteOrder;
/**
* Benchmark the element wise aggregation of an array
* of doubles into another array of doubles, using
* combinations of java arrays, byte buffers, standard java code
* and the new Vector API.
*/
public class AddBenchmark {
static {
System.setProperty("jdk.incubator.foreign.Foreign","permit");
}
static final Foreign F = Foreign.getInstance();
static final Unsafe U = getUnsafe();
static Unsafe getUnsafe() {
try {
Field f = Unsafe.class.getDeclaredField("theUnsafe");
f.setAccessible(true);
return (Unsafe) f.get(null);
} catch(Exception e) {
throw new RuntimeException(e);
}
}
/** Manually launch JMH */
public static void main(String[] params) throws Exception {
Options opt = new OptionsBuilder()
.include(AddBenchmark.class.getSimpleName())
.forks(1)
.build();
new Runner(opt).run();
}
final static int SIZE = 1024;
@State(Scope.Benchmark)
public static class Data {
final double[] inputArray;
final double[] outputArray;
final long inputAddr;
final long outputAddr;
final MemoryAddress inputMA;
final MemoryAddress outputMA;
public Data() {
this.inputArray = new double[SIZE];
this.outputArray = new double[SIZE];
this.inputAddr = U.allocateMemory(8 * SIZE);
this.outputAddr = U.allocateMemory(8 * SIZE);
this.inputMA = F.withSize(MemoryAddress.ofLong(inputAddr),
8*SIZE);
this.outputMA = F.withSize(MemoryAddress.ofLong(outputAddr),
8*SIZE);
}
}
@Benchmark
public void scalarArray(Data state) {
final double[] input = state.inputArray;
final double[] output = state.outputArray;
for(int i = 0; i < SIZE; i++) {
output[i] += input[i];
}
}
@Benchmark
public void scalarArrayUnrolled(Data state) {
final double[] input = state.inputArray;
final double[] output = state.outputArray;
for(int i = 0; i < SIZE; i+=4) {
output[i] += input[i];
output[i+1] += input[i+1];
output[i+2] += input[i+2];
output[i+3] += input[i+3];
}
}
@Benchmark
public void scalarArrayLongStride(Data state) {
final double[] input = state.inputArray;
final double[] output = state.outputArray;
// Using a long counter defeats loop unrolling and then
vectorization
for(long i = 0; i < SIZE; i++) {
output[(int)i] += input[(int)i];
}
}
@Benchmark
public void scalarArrayLongStrideUnrolled(Data state) {
final double[] input = state.inputArray;
final double[] output = state.outputArray;
// Using a long counter defeats loop unrolling and then
vectorization
for(long i = 0; i < SIZE; i+=4) {
output[(int)i] += input[(int)i];
output[(int)i+1] += input[(int)i+1];
output[(int)i+2] += input[(int)i+2];
output[(int)i+3] += input[(int)i+3];
}
}
static final VarHandle AH =
MethodHandles.arrayElementVarHandle(double[].class);
@Benchmark
public void scalarArrayHandle(Data state) {
final double[] input = state.inputArray;
final double[] output = state.outputArray;
for(int i = 0; i < input.length; i++) {
AH.set(output, i, (double) AH.get(input, i) + (double)
AH.get(output, i));
}
}
@Benchmark
public void scalarArrayHandleUnrolled(Data state) {
final double[] input = state.inputArray;
final double[] output = state.outputArray;
for(int i = 0; i < input.length; i+=4) {
AH.set(output, i, (double) AH.get(input, i) + (double)
AH.get(output, i));
AH.set(output, i+1, (double) AH.get(input, i+1) + (double)
AH.get(output, i+1));
AH.set(output, i+2, (double) AH.get(input, i+2) + (double)
AH.get(output, i+2));
AH.set(output, i+3, (double) AH.get(input, i+3) + (double)
AH.get(output, i+3));
}
}
@Benchmark
public void scalarUnsafe(Data state) {
final long inputAddr = state.inputAddr;
final long outputAddr = state.outputAddr;
for(int i = 0; i < SIZE; i++) {
U.putDouble(outputAddr + 8*i, U.getDouble(inputAddr + 8*i) +
U.getDouble(outputAddr + 8*i));
}
}
@Benchmark
public void scalarUnsafeUnrolled(Data state) {
final long inputAddr = state.inputAddr;
final long outputAddr = state.outputAddr;
for(int i = 0; i < SIZE; i+=4) {
U.putDouble(outputAddr + 8*i, U.getDouble(inputAddr + 8*i) +
U.getDouble(outputAddr + 8*i));
U.putDouble(outputAddr + 8*(i+1), U.getDouble(inputAddr +
8*(i+1)) + U.getDouble(outputAddr + 8*(i+1)));
U.putDouble(outputAddr + 8*(i+2), U.getDouble(inputAddr +
8*(i+2)) + U.getDouble(outputAddr + 8*(i+2)));
U.putDouble(outputAddr + 8*(i+3), U.getDouble(inputAddr +
8*(i+3)) + U.getDouble(outputAddr + 8*(i+3)));
}
}
static final VarHandle MH = MemoryHandles.varHandle(double.class,
ByteOrder.nativeOrder());
@Benchmark
public void scalarSegment(Data state) {
final MemoryAddress ia = state.inputMA;
final MemoryAddress oa = state.outputMA;
for(int i = 0; i < SIZE; i++) {
MH.set(oa.addOffset(8*i),
(double) MH.get(ia.addOffset(8*i)) +
(double) MH.get(oa.addOffset(8*i)));
}
}
@Benchmark
public void scalarSegmentUnrolled(Data state) {
final MemoryAddress ia = state.inputMA;
final MemoryAddress oa = state.outputMA;
for(int i = 0; i < SIZE; i+=4) {
MH.set(oa.addOffset(8*i),
(double) MH.get(ia.addOffset(8*i)) +
(double) MH.get(oa.addOffset(8*i)));
MH.set(oa.addOffset(8*(i+1)),
(double) MH.get(ia.addOffset(8*(i+1))) +
(double) MH.get(oa.addOffset(8*(i+1))));
MH.set(oa.addOffset(8*(i+2)),
(double) MH.get(ia.addOffset(8*(i+2))) +
(double) MH.get(oa.addOffset(8*(i+2))));
MH.set(oa.addOffset(8*(i+3)),
(double) MH.get(ia.addOffset(8*(i+3))) +
(double) MH.get(oa.addOffset(8*(i+3))));
}
}
}
More information about the panama-dev
mailing list