Vector API performance variation with arrays, byte arrays or byte buffers
Antoine Chambille
ach at activeviam.com
Tue Mar 10 14:51:54 UTC 2020
Hi folks,
First, the new Vector API is -awesome- and it makes Java the best language
for writing data parallel algorithms, a remarkable turnaround. It reminds
me of when Java 5 became the best language for concurrent programming.
I'm benchmarking a use case where you aggregate an array of doubles
element-wise into another array of doubles ( a[i] += b[i] for each index ).
There are large performance variations depending on whether the data is
held in arrays, byte arrays or byte buffers. Disabling bounds checking
removes some of the overhead but not all. I'm sharing the JMH
microbenchmark below if that can help.
Here are the results of running the benchmark on my laptop with Windows 10
and an Intel Core i9-8950HK @ 2.90GHz
-Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=2
Benchmark Mode Cnt Score Error Units
standardArrayArray thrpt 5 4657680.731 ± 22775.673 ops/s
standardArrayBuffer thrpt 5 1074170.758 ± 28116.666 ops/s
standardBufferArray thrpt 5 1066531.757 ± 39990.913 ops/s
standardBufferBuffer thrpt 5 801500.523 ± 19984.247 ops/s
vectorArrayArray thrpt 5 7107822.743 ± 454478.273 ops/s
vectorArrayBuffer thrpt 5 1922263.407 ± 29921.036 ops/s
vectorBufferArray thrpt 5 2732335.558 ± 81958.886 ops/s
vectorBufferBuffer thrpt 5 1833276.409 ± 59682.441 ops/s
vectorByteArrayByteArray thrpt 5 4618267.357 ± 127141.691 ops/s
-Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0
Benchmark Mode Cnt Score Error Units
standardArrayArray thrpt 5 4692286.894 ± 67785.058 ops/s
standardArrayBuffer thrpt 5 1073420.025 ± 28216.922 ops/s
standardBufferArray thrpt 5 1066385.323 ± 15700.653 ops/s
standardBufferBuffer thrpt 5 797741.269 ± 15881.590 ops/s
vectorArrayArray thrpt 5 8351594.873 ± 153608.251 ops/s
vectorArrayBuffer thrpt 5 3107638.739 ± 223093.281 ops/s
vectorBufferArray thrpt 5 3653867.093 ± 75307.265 ops/s
vectorBufferBuffer thrpt 5 2224031.876 ± 49263.778 ops/s
vectorByteArrayByteArray thrpt 5 4761018.920 ± 264243.227 ops/s
cheers,
-Antoine
package com.activeviam;
import jdk.incubator.vector.DoubleVector;
import jdk.incubator.vector.VectorSpecies;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
 * Benchmarks the element-wise aggregation of an array of doubles into
 * another array of doubles ({@code output[i] += input[i]}), using
 * combinations of Java arrays, byte arrays and direct byte buffers,
 * written both as plain scalar loops and with the Vector API.
 *
 * <p>All byte buffers are configured with the platform's native byte order
 * so that the scalar ({@code getDouble}/{@code putDouble}) and the vector
 * ({@code fromByteBuffer}/{@code intoByteBuffer}) benchmarks operate on the
 * same memory layout and are directly comparable.
 */
public class AggregationBenchmark {

    /** Vector species used by every vectorized benchmark (widest available shape). */
    static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_MAX;

    /** Byte order shared by the buffers and by every vector buffer access. */
    static final ByteOrder NATIVE_ORDER = ByteOrder.nativeOrder();

    /**
     * Manually launches JMH on the benchmarks of this class, in a single fork.
     *
     * @param params command line arguments (unused)
     * @throws Exception if the JMH runner fails
     */
    public static void main(String[] params) throws Exception {
        Options opt = new OptionsBuilder()
                .include(AggregationBenchmark.class.getSimpleName())
                .forks(1)
                .build();
        new Runner(opt).run();
    }

    /**
     * Benchmark state: one input and one output container of {@link #SIZE}
     * doubles for each storage kind (double array, byte array, direct buffer).
     */
    @State(Scope.Benchmark)
    public static class Data {

        /** Number of doubles held by each container. */
        static final int SIZE = 1024;

        final double[] inputArray;
        final double[] outputArray;
        final byte[] inputByteArray;
        final byte[] outputByteArray;
        final ByteBuffer inputBuffer;
        final ByteBuffer outputBuffer;

        /**
         * Allocates all containers.
         *
         * @throws IllegalStateException if {@link #SIZE} is not a multiple of
         *         the lane count of {@link AggregationBenchmark#SPECIES}
         */
        public Data() {
            // The vector loops have no tail handling: they step by a whole
            // vector and would run past the end of the data otherwise.
            if (SIZE % SPECIES.length() != 0) {
                throw new IllegalStateException(
                        "SIZE (" + SIZE + ") must be a multiple of the vector lane count ("
                                + SPECIES.length() + ")");
            }
            this.inputArray = new double[SIZE];
            this.outputArray = new double[SIZE];
            this.inputByteArray = new byte[Double.BYTES * SIZE];
            this.outputByteArray = new byte[Double.BYTES * SIZE];
            // A new ByteBuffer is always BIG_ENDIAN. Switch to native order so
            // the scalar benchmarks use the same layout as the vector ones
            // instead of a byte-swapping access path.
            this.inputBuffer = ByteBuffer.allocateDirect(Double.BYTES * SIZE).order(NATIVE_ORDER);
            this.outputBuffer = ByteBuffer.allocateDirect(Double.BYTES * SIZE).order(NATIVE_ORDER);
        }
    }

    /** Scalar loop: double array aggregated into double array. */
    @Benchmark
    public void standardArrayArray(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for (int i = 0; i < input.length; i++) {
            output[i] += input[i];
        }
    }

    /** Scalar loop: double array aggregated into byte buffer. */
    @Benchmark
    public void standardArrayBuffer(Data state) {
        final double[] input = state.inputArray;
        final ByteBuffer output = state.outputBuffer;
        for (int i = 0; i < input.length; i++) {
            // i << 3 converts the element index into a byte offset.
            output.putDouble(i << 3, output.getDouble(i << 3) + input[i]);
        }
    }

    /** Scalar loop: byte buffer aggregated into double array. */
    @Benchmark
    public void standardBufferArray(Data state) {
        final ByteBuffer input = state.inputBuffer;
        final double[] output = state.outputArray;
        // i is a byte offset; i >>> 3 converts it back into an element index.
        for (int i = 0; i < input.capacity(); i += 8) {
            output[i >>> 3] += input.getDouble(i);
        }
    }

    /** Scalar loop: byte buffer aggregated into byte buffer. */
    @Benchmark
    public void standardBufferBuffer(Data state) {
        final ByteBuffer input = state.inputBuffer;
        final ByteBuffer output = state.outputBuffer;
        for (int i = 0; i < input.capacity(); i += 8) {
            output.putDouble(i, output.getDouble(i) + input.getDouble(i));
        }
    }

    /** Vector loop: double array aggregated into double array. */
    @Benchmark
    public void vectorArrayArray(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for (int i = 0; i < input.length; i += SPECIES.length()) {
            DoubleVector a = DoubleVector.fromArray(SPECIES, input, i);
            DoubleVector b = DoubleVector.fromArray(SPECIES, output, i);
            a = a.add(b);
            a.intoArray(output, i);
        }
    }

    /** Vector loop: byte array aggregated into byte array. */
    @Benchmark
    public void vectorByteArrayByteArray(Data state) {
        final byte[] input = state.inputByteArray;
        final byte[] output = state.outputByteArray;
        // i is a byte offset and advances by one full vector of doubles.
        for (int i = 0; i < input.length; i += 8 * SPECIES.length()) {
            DoubleVector a = DoubleVector.fromByteArray(SPECIES, input, i);
            DoubleVector b = DoubleVector.fromByteArray(SPECIES, output, i);
            a = a.add(b);
            a.intoByteArray(output, i);
        }
    }

    /** Vector loop: byte buffer aggregated into byte buffer. */
    @Benchmark
    public void vectorBufferBuffer(Data state) {
        final ByteBuffer input = state.inputBuffer;
        final ByteBuffer output = state.outputBuffer;
        for (int i = 0; i < input.capacity(); i += 8 * SPECIES.length()) {
            DoubleVector a = DoubleVector.fromByteBuffer(SPECIES, input, i, NATIVE_ORDER);
            DoubleVector b = DoubleVector.fromByteBuffer(SPECIES, output, i, NATIVE_ORDER);
            a = a.add(b);
            a.intoByteBuffer(output, i, NATIVE_ORDER);
        }
    }

    /** Vector loop: double array aggregated into byte buffer. */
    @Benchmark
    public void vectorArrayBuffer(Data state) {
        final double[] input = state.inputArray;
        final ByteBuffer output = state.outputBuffer;
        for (int i = 0; i < input.length; i += SPECIES.length()) {
            DoubleVector a = DoubleVector.fromArray(SPECIES, input, i);
            // i << 3 converts the element index into a byte offset.
            DoubleVector b = DoubleVector.fromByteBuffer(SPECIES, output, i << 3, NATIVE_ORDER);
            a = a.add(b);
            a.intoByteBuffer(output, i << 3, NATIVE_ORDER);
        }
    }

    /** Vector loop: byte buffer aggregated into double array. */
    @Benchmark
    public void vectorBufferArray(Data state) {
        final ByteBuffer input = state.inputBuffer;
        final double[] output = state.outputArray;
        for (int i = 0; i < input.capacity(); i += 8 * SPECIES.length()) {
            DoubleVector a = DoubleVector.fromByteBuffer(SPECIES, input, i, NATIVE_ORDER);
            // i >>> 3 converts the byte offset into an element index.
            DoubleVector b = DoubleVector.fromArray(SPECIES, output, i >>> 3);
            a = a.add(b);
            a.intoArray(output, i >>> 3);
        }
    }
}
More information about the panama-dev
mailing list