Vector API performance variation with arrays, byte arrays or byte buffers

Antoine Chambille ach at activeviam.com
Tue Mar 10 14:51:54 UTC 2020


Hi folks,

First, the new Vector API is -awesome- and it makes Java the best language
for writing data parallel algorithms, a remarkable turnaround. It reminds
me of when Java 5 became the best language for concurrent programming.

I'm benchmarking a use case where you aggregate one array of doubles
element-wise into another array of doubles ( a[i] += b[i] for each index i ).
There are large performance variations depending on whether the data is
held in arrays, byte arrays or byte buffers. Disabling bounds checking
removes some of the overhead but not all. I'm sharing the JMH
microbenchmark below in case it helps.



Here are the results of running the benchmark on my laptop with Windows 10
and an Intel Core i9-8950HK @ 2.90GHz:


-Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=2

Benchmark                  Mode  Cnt        Score        Error  Units
standardArrayArray        thrpt    5  4657680.731 ±  22775.673  ops/s
standardArrayBuffer       thrpt    5  1074170.758 ±  28116.666  ops/s
standardBufferArray       thrpt    5  1066531.757 ±  39990.913  ops/s
standardBufferBuffer      thrpt    5   801500.523 ±  19984.247  ops/s
vectorArrayArray          thrpt    5  7107822.743 ± 454478.273  ops/s
vectorArrayBuffer         thrpt    5  1922263.407 ±  29921.036  ops/s
vectorBufferArray         thrpt    5  2732335.558 ±  81958.886  ops/s
vectorBufferBuffer        thrpt    5  1833276.409 ±  59682.441  ops/s
vectorByteArrayByteArray  thrpt    5  4618267.357 ± 127141.691  ops/s



-Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0

Benchmark                  Mode  Cnt        Score        Error  Units
standardArrayArray        thrpt    5  4692286.894 ±  67785.058  ops/s
standardArrayBuffer       thrpt    5  1073420.025 ±  28216.922  ops/s
standardBufferArray       thrpt    5  1066385.323 ±  15700.653  ops/s
standardBufferBuffer      thrpt    5   797741.269 ±  15881.590  ops/s
vectorArrayArray          thrpt    5  8351594.873 ± 153608.251  ops/s
vectorArrayBuffer         thrpt    5  3107638.739 ± 223093.281  ops/s
vectorBufferArray         thrpt    5  3653867.093 ±  75307.265  ops/s
vectorBufferBuffer        thrpt    5  2224031.876 ±  49263.778  ops/s
vectorByteArrayByteArray  thrpt    5  4761018.920 ± 264243.227  ops/s



cheers,
-Antoine








package com.activeviam;

import jdk.incubator.vector.DoubleVector;
import jdk.incubator.vector.VectorSpecies;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

/**
 * Benchmarks the element-wise aggregation of one sequence of doubles into
 * another ({@code output[i] += input[i]}), using combinations of Java
 * {@code double[]} arrays, {@code byte[]} arrays and direct byte buffers,
 * written both as plain scalar loops and with the new Vector API.
 *
 * <p>Every benchmark processes {@link Data#SIZE} doubles per invocation.
 * The vector loops advance by whole vectors and have no scalar tail, so
 * they rely on {@link Data#SIZE} being a multiple of the lane count of
 * {@link #SPECIES}.
 */
public class AggregationBenchmark {

    /**
     * Byte order used for every byte-buffer access, scalar or vectorized,
     * so that all buffer benchmarks read and write the same encoding.
     */
    private static final ByteOrder NATIVE_ORDER = ByteOrder.nativeOrder();

    /** Vector shape used by all vectorized benchmarks. */
    static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_MAX;

    /**
     * Manually launches JMH on the benchmarks of this class, in a single fork.
     *
     * @param params command line arguments (unused)
     * @throws Exception if the JMH runner fails
     */
    public static void main(String[] params) throws Exception {
        Options opt = new OptionsBuilder()
            .include(AggregationBenchmark.class.getSimpleName())
            .forks(1)
            .build();

        new Runner(opt).run();
    }


    /**
     * Shared benchmark state: one input and one output container of each
     * kind, each able to hold {@link #SIZE} doubles.
     */
    @State(Scope.Benchmark)
    public static class Data {
        /** Number of doubles held by each container. */
        static final int SIZE = 1024;
        final double[] inputArray;
        final double[] outputArray;
        final byte[] inputByteArray;
        final byte[] outputByteArray;
        final ByteBuffer inputBuffer;
        final ByteBuffer outputBuffer;

        public Data() {
            this.inputArray = new double[SIZE];
            this.outputArray = new double[SIZE];
            this.inputByteArray = new byte[Double.BYTES * SIZE];
            this.outputByteArray = new byte[Double.BYTES * SIZE];
            // A freshly allocated ByteBuffer is BIG_ENDIAN. The vectorized
            // benchmarks access the buffers in native order, so the buffers
            // are switched to native order here; otherwise the scalar
            // getDouble/putDouble benchmarks would use a different encoding
            // (and extra byte swapping on little-endian hardware) than the
            // vector benchmarks they are compared against.
            this.inputBuffer = ByteBuffer.allocateDirect(Double.BYTES * SIZE).order(NATIVE_ORDER);
            this.outputBuffer = ByteBuffer.allocateDirect(Double.BYTES * SIZE).order(NATIVE_ORDER);
        }
    }

    /** Scalar loop, {@code double[]} input into {@code double[]} output. */
    @Benchmark
    public void standardArrayArray(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;
        for (int i = 0; i < input.length; i++) {
            output[i] += input[i];
        }
    }

    /** Scalar loop, {@code double[]} input into byte buffer output. */
    @Benchmark
    public void standardArrayBuffer(Data state) {
        final double[] input = state.inputArray;
        final ByteBuffer output = state.outputBuffer;
        for (int i = 0; i < input.length; i++) {
            // i << 3 converts the element index into a byte offset.
            output.putDouble(i << 3, output.getDouble(i << 3) + input[i]);
        }
    }

    /** Scalar loop, byte buffer input into {@code double[]} output. */
    @Benchmark
    public void standardBufferArray(Data state) {
        final ByteBuffer input = state.inputBuffer;
        final double[] output = state.outputArray;
        // i is a byte offset; i >>> 3 is the matching element index.
        for (int i = 0; i < input.capacity(); i += Double.BYTES) {
            output[i >>> 3] += input.getDouble(i);
        }
    }

    /** Scalar loop, byte buffer input into byte buffer output. */
    @Benchmark
    public void standardBufferBuffer(Data state) {
        final ByteBuffer input = state.inputBuffer;
        final ByteBuffer output = state.outputBuffer;
        for (int i = 0; i < input.capacity(); i += Double.BYTES) {
            output.putDouble(i, output.getDouble(i) + input.getDouble(i));
        }
    }

    /** Vector API, {@code double[]} input into {@code double[]} output. */
    @Benchmark
    public void vectorArrayArray(Data state) {
        final double[] input = state.inputArray;
        final double[] output = state.outputArray;

        // No scalar tail: relies on input.length being a multiple of the lane count.
        for (int i = 0; i < input.length; i += SPECIES.length()) {
            DoubleVector a = DoubleVector.fromArray(SPECIES, input, i);
            DoubleVector b = DoubleVector.fromArray(SPECIES, output, i);
            a = a.add(b);
            a.intoArray(output, i);
        }
    }

    /** Vector API, {@code byte[]} input into {@code byte[]} output. */
    @Benchmark
    public void vectorByteArrayByteArray(Data state) {
        final byte[] input = state.inputByteArray;
        final byte[] output = state.outputByteArray;

        // i is a byte offset advancing by one whole vector per iteration.
        for (int i = 0; i < input.length; i += Double.BYTES * SPECIES.length()) {
            DoubleVector a = DoubleVector.fromByteArray(SPECIES, input, i);
            DoubleVector b = DoubleVector.fromByteArray(SPECIES, output, i);
            a = a.add(b);
            a.intoByteArray(output, i);
        }
    }

    /** Vector API, byte buffer input into byte buffer output. */
    @Benchmark
    public void vectorBufferBuffer(Data state) {
        final ByteBuffer input = state.inputBuffer;
        final ByteBuffer output = state.outputBuffer;
        // i is a byte offset advancing by one whole vector per iteration.
        for (int i = 0; i < input.capacity(); i += Double.BYTES * SPECIES.length()) {
            DoubleVector a = DoubleVector.fromByteBuffer(SPECIES, input, i, NATIVE_ORDER);
            DoubleVector b = DoubleVector.fromByteBuffer(SPECIES, output, i, NATIVE_ORDER);
            a = a.add(b);
            a.intoByteBuffer(output, i, NATIVE_ORDER);
        }
    }

    /** Vector API, {@code double[]} input into byte buffer output. */
    @Benchmark
    public void vectorArrayBuffer(Data state) {
        final double[] input = state.inputArray;
        final ByteBuffer output = state.outputBuffer;

        // i is an element index; i << 3 is the matching byte offset in the buffer.
        for (int i = 0; i < input.length; i += SPECIES.length()) {
            DoubleVector a = DoubleVector.fromArray(SPECIES, input, i);
            DoubleVector b = DoubleVector.fromByteBuffer(SPECIES, output, i << 3, NATIVE_ORDER);
            a = a.add(b);
            a.intoByteBuffer(output, i << 3, NATIVE_ORDER);
        }
    }

    /** Vector API, byte buffer input into {@code double[]} output. */
    @Benchmark
    public void vectorBufferArray(Data state) {
        final ByteBuffer input = state.inputBuffer;
        final double[] output = state.outputArray;
        // i is a byte offset; i >>> 3 is the matching element index in the array.
        for (int i = 0; i < input.capacity(); i += Double.BYTES * SPECIES.length()) {
            DoubleVector a = DoubleVector.fromByteBuffer(SPECIES, input, i, NATIVE_ORDER);
            DoubleVector b = DoubleVector.fromArray(SPECIES, output, i >>> 3);
            a = a.add(b);
            a.intoArray(output, i >>> 3);
        }
    }

}


More information about the panama-dev mailing list