Calculating integral images / cumulative add
Paul Sandoz
paul.sandoz at oracle.com
Wed Jan 6 17:52:35 UTC 2021
Hi Stefan,
No current plans for a scan (or prefix sum). You would have to roll your own, composed from other methods [1], which I think would be a good experiment and help inform what we could add to the API/implementation (Note: support for masks are not yet optimal, esp. on AVX512).
FWIW the closest I have come to loops with some form of loop-based data dependency is vectorizing Java’s array hash code implementation. I’ll share that [2] just in case you are interested (I plan to update the relevant code in the repository to that of below.)
Paul.
[1] See Lemire’s work
https://lemire.me/blog/2017/09/27/stream-vbyte-breaking-new-speed-records-for-integer-compression/ <https://lemire.me/blog/2017/09/27/stream-vbyte-breaking-new-speed-records-for-integer-compression/>
[2]
package jmh;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector", "-Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=2"})
public class TestHashcode {
static final VectorSpecies<Integer> INT_256_SPECIES = IntVector.SPECIES_256;
static final VectorSpecies<Byte> BYTE_64_SPECIES = ByteVector.SPECIES_64;
static final VectorSpecies<Byte> BYTE_128_SPECIES = ByteVector.SPECIES_128;
static final VectorSpecies<Byte> BYTE_256_SPECIES = ByteVector.SPECIES_256;
static final int COEFF_31_TO_8;
static final int COEFF_31_TO_16;
static final int COEFF_31_TO_24;
static final int COEFF_31_TO_32;
static final IntVector H_COEFF_31_TO_8;
static final IntVector H_COEFF_31_TO_16;
static final IntVector H_COEFF_31_TO_24;
static final IntVector H_COEFF_31_TO_32;
static final IntVector H_COEFF_8;
static final IntVector H_COEFF_16;
static final IntVector H_COEFF_24;
static final IntVector H_COEFF_32;
static {
int[] a = new int[INT_256_SPECIES.length() * 4];
a[a.length - 1] = 1;
for (int i = 1; i < a.length; i++) {
a[a.length - 1 - i] = a[a.length - 1 - i + 1] * 31;
}
COEFF_31_TO_8 = a[24] * 31;
COEFF_31_TO_16 = a[16] * 31;
COEFF_31_TO_24 = a[8] * 31;
COEFF_31_TO_32 = a[0] * 31;
H_COEFF_31_TO_8 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_8);
H_COEFF_31_TO_16 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_16);
H_COEFF_31_TO_24 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_24);
H_COEFF_31_TO_32 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_32);
H_COEFF_8 = IntVector.fromArray(INT_256_SPECIES, a, 24);
H_COEFF_16 = IntVector.fromArray(INT_256_SPECIES, a, 16);
H_COEFF_24 = IntVector.fromArray(INT_256_SPECIES, a, 8);
H_COEFF_32 = IntVector.fromArray(INT_256_SPECIES, a, 0);
}
@Param("1024")
int size;
byte[] a;
@Setup
public void init() {
a = new byte[size];
ThreadLocalRandom.current().nextBytes(a);
}
@Benchmark
public int vector64ReduceInLoop() {
int h = 1;
int i = 0;
// Force into registers
IntVector c_H_COEFF_8 = H_COEFF_8.add(0);
for (; i < BYTE_64_SPECIES.loopBound(a.length); i += BYTE_64_SPECIES.length()) {
ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i);
IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h = h * COEFF_31_TO_8 + x.mul(c_H_COEFF_8).reduceLanes(VectorOperators.ADD);
}
for (; i < a.length; i++) {
h = 31 * h + a[i];
}
return h;
}
@Benchmark
public int vector64() {
IntVector h = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0);
int i = 0;
// Force into registers
IntVector c_H_COEFF_8 = H_COEFF_8.add(0);
IntVector c_H_COEFF_31_TO_8 = H_COEFF_31_TO_8.add(0);
for (; i < BYTE_64_SPECIES.loopBound(a.length); i += BYTE_64_SPECIES.length()) {
ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i);
IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h = h.mul(c_H_COEFF_31_TO_8).add(x.mul(c_H_COEFF_8));
}
int sh = h.reduceLanes(VectorOperators.ADD);
for (; i < a.length; i++) {
sh = 31 * sh + a[i];
}
return sh;
}
@Benchmark
public int vector64Unrolledx2() {
IntVector h1 = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0);
IntVector h2 = IntVector.zero(INT_256_SPECIES);
int i = 0;
// Force into registers
IntVector c_H_COEFF_16 = H_COEFF_16.add(0);
IntVector c_H_COEFF_8 = H_COEFF_8.add(0);
IntVector c_H_COEFF_31_TO_16 = H_COEFF_31_TO_16.add(0);
for (; i < (a.length & ~(BYTE_128_SPECIES.length() - 1)); i += BYTE_128_SPECIES.length()) {
ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i);
IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h1 = h1.mul(c_H_COEFF_31_TO_16).add(x.mul(c_H_COEFF_16));
b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length());
x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h2 = h2.mul(c_H_COEFF_31_TO_16).add(x.mul(c_H_COEFF_8));
}
int sh = h1.reduceLanes(VectorOperators.ADD) + h2.reduceLanes(VectorOperators.ADD);
for (; i < a.length; i++) {
sh = 31 * sh + a[i];
}
return sh;
}
@Benchmark
public int vector64Unrolledx4() {
IntVector h1 = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0);
IntVector h2 = IntVector.zero(INT_256_SPECIES);
IntVector h3 = IntVector.zero(INT_256_SPECIES);
IntVector h4 = IntVector.zero(INT_256_SPECIES);
int i = 0;
// Force into registers
IntVector c_H_COEFF_8 = H_COEFF_8.add(0);
IntVector c_H_COEFF_16 = H_COEFF_16.add(0);
IntVector c_H_COEFF_24 = H_COEFF_24.add(0);
IntVector c_H_COEFF_32 = H_COEFF_32.add(0);
IntVector c_H_COEFF_31_TO_32 = H_COEFF_31_TO_32.add(0);
for (; i < (a.length & ~(BYTE_256_SPECIES.length() - 1)); i += BYTE_256_SPECIES.length()) {
ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i);
IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h1 = h1.mul(c_H_COEFF_31_TO_32).add(x.mul(c_H_COEFF_32));
b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length());
x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h2 = h2.mul(c_H_COEFF_31_TO_32).add(x.mul(c_H_COEFF_24));
b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length() * 2);
x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h3 = h3.mul(c_H_COEFF_31_TO_32).add(x.mul(c_H_COEFF_16));
b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length() * 3);
x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h4 = h4.mul(c_H_COEFF_31_TO_32).add(x.mul(c_H_COEFF_8));
}
int sh = h1.reduceLanes(VectorOperators.ADD) +
h2.reduceLanes(VectorOperators.ADD) +
h3.reduceLanes(VectorOperators.ADD) +
h4.reduceLanes(VectorOperators.ADD);
for (; i < a.length; i++) {
sh = 31 * sh + a[i];
}
return sh;
}
@Benchmark
public int vector128Unrolledx2() {
IntVector h1 = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0);
IntVector h2 = IntVector.fromArray(INT_256_SPECIES, new int[]{0, 0, 0, 0, 0, 0, 0, 0}, 0);
int i = 0;
// Force into registers
IntVector c_H_COEFF_16_P0 = H_COEFF_16.add(0);
IntVector c_H_COEFF_16_P1 = H_COEFF_8.add(0);
IntVector c_H_COEFF_31_TO_16 = H_COEFF_31_TO_16.add(0);
for (; i < (a.length & ~(BYTE_128_SPECIES.length() - 1)); i += BYTE_128_SPECIES.length()) {
ByteVector b = ByteVector.fromArray(BYTE_128_SPECIES, a, i);
IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h1 = h1.mul(c_H_COEFF_31_TO_16).add(x.mul(c_H_COEFF_16_P0));
x = (IntVector) b.castShape(INT_256_SPECIES, 1);
h2 = h2.mul(c_H_COEFF_31_TO_16).add(x.mul(c_H_COEFF_16_P1));
}
int sh = h1.reduceLanes(VectorOperators.ADD) + h2.reduceLanes(VectorOperators.ADD);
for (; i < a.length; i++) {
sh = 31 * sh + a[i];
}
return sh;
}
@Benchmark
public int vector256Unrolledx4() {
IntVector h1 = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0);
IntVector h2 = IntVector.zero(INT_256_SPECIES);
IntVector h3 = IntVector.zero(INT_256_SPECIES);
IntVector h4 = IntVector.zero(INT_256_SPECIES);
int i = 0;
// Force into registers
IntVector c_H_COEFF_8 = H_COEFF_8.add(0);
IntVector c_H_COEFF_16 = H_COEFF_16.add(0);
IntVector c_H_COEFF_24 = H_COEFF_24.add(0);
IntVector c_H_COEFF_32 = H_COEFF_32.add(0);
IntVector c_H_COEFF_31_TO_32 = H_COEFF_31_TO_32.add(0);
for (; i < (a.length & ~(BYTE_256_SPECIES.length() - 1)); i += BYTE_256_SPECIES.length()) {
ByteVector b = ByteVector.fromArray(BYTE_256_SPECIES, a, i);
IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
h1 = h1.mul(c_H_COEFF_31_TO_32).add(x.mul(c_H_COEFF_32));
x = (IntVector) b.castShape(INT_256_SPECIES, 1);
h2 = h2.mul(c_H_COEFF_31_TO_32).add(x.mul(c_H_COEFF_24));
x = (IntVector) b.castShape(INT_256_SPECIES, 2);
h3 = h3.mul(c_H_COEFF_31_TO_32).add(x.mul(c_H_COEFF_16));
x = (IntVector) b.castShape(INT_256_SPECIES, 3);
h4 = h4.mul(c_H_COEFF_31_TO_32).add(x.mul(c_H_COEFF_8));
}
int sh = h1.reduceLanes(VectorOperators.ADD) +
h2.reduceLanes(VectorOperators.ADD) +
h3.reduceLanes(VectorOperators.ADD) +
h4.reduceLanes(VectorOperators.ADD);
for (; i < a.length; i++) {
sh = 31 * sh + a[i];
}
return sh;
}
@Benchmark
public int scalar() {
return Arrays.hashCode(a);
}
@Benchmark
public int scalarUnrolled() {
if (a == null)
return 0;
int h = 1;
int i = 0;
for (; i < (a.length & ~(8 - 1)); i += 8) {
h = h * 31 * 31 * 31 * 31 * 31 * 31 * 31 * 31 +
a[i + 0] * 31 * 31 * 31 * 31 * 31 * 31 * 31 +
a[i + 1] * 31 * 31 * 31 * 31 * 31 * 31 +
a[i + 2] * 31 * 31 * 31 * 31 * 31 +
a[i + 3] * 31 * 31 * 31 * 31 +
a[i + 4] * 31 * 31 * 31 +
a[i + 5] * 31 * 31 +
a[i + 6] * 31 +
a[i + 7];
}
for (; i < a.length; i++) {
h = 31 * h + a[i];
}
return h;
}
}
> On Jan 4, 2021, at 5:51 PM, Stefan Reich <stefan.reich.maker.of.eye at googlemail.com> wrote:
>
> Oh I just saw this in the presentation
> <http://cr.openjdk.java.net/~jrose/pres/201907-Vectors.pdf>:
>
> ▪ Segmented scan (reduce with partials and mask-driven reset)
>
> That's probably what I want... so it's not there yet?
>
> On Tue, 5 Jan 2021 at 02:31, Stefan Reich <
> stefan.reich.maker.of.eye at googlemail.com> wrote:
>
>> Hi,
>>
>> I am trying to vectorize my routine that constructs an integral image / "summed-area
>> table" <https://en.wikipedia.org/wiki/Summed-area_table>.
>>
>> int i = 0;
>> for (int y = 0; y < h; y++) {
>> int sum = 0;
>> for (int x = 0; x < w; x++) {
>> sum += image[i];
>> data[i] = y > 0 ? sum + data[i-w] : sum;
>> i++;
>> }
>> }
>>
>> In order to use the Vector API, I'd need an operation that does a
>> cumulative summing over the lanes, e.g. for a 4-component vector:
>>
>> b[0] = a[0]
>> b[1] = a[0]+a[1]
>> b[2] = a[0]+a[1]+a[2]
>> b[3] = a[0]+a[1]+a[2]+a[3]
>>
>> I don't think this is available in the current Vector API, is it?
>>
>> Thanks,
>> Stefan
>>
>> --
>> Stefan Reich
>> BotCompany.de // Java-based operating systems
>>
>
>
> --
> Stefan Reich
> BotCompany.de // Java-based operating systems
More information about the panama-dev
mailing list