[vectorIntrinsics] C2 is fragile
Eugene Kluchnikov
eustas.ru at gmail.com
Sun Mar 14 22:58:06 UTC 2021
I observe unexpected behaviour: changing implementation details in one
method causes severe performance degradation in another (independent) method.
Similar things seem to happen when a sampling profiling agent is activated.
Example begin >>>
// Vector species (lane type + bit width) used by the two methods in this example.
// 256-bit float species; also determines STEP.
private static final VectorSpecies<Float> VFP = FloatVector.SPECIES_256;
// 256-bit int species; used for INTEGER_MASK and for the rowOffset loads.
private static final VectorSpecies<Integer> VIP = IntVector.SPECIES_256;
// 128-bit int species; used by sumAbs.
private static final VectorSpecies<Integer> VI4 = IntVector.SPECIES_128;
// Number of float lanes in VFP; updateGeGeneric advances its loop index by this amount.
static final int STEP = VFP.length();
/**
 * Adds up {@code count} int vectors of species VI4 read from {@code sum} and stores
 * the lane-wise total into {@code dst} starting at index 0.
 *
 * <p>The i-th vector is loaded from {@code sum} at offset {@code regionX[i] * 4}.
 * Returns without writing to {@code dst} when {@code count > regionX.length}.
 *
 * @param sum source array the vectors are loaded from
 * @param count number of leading entries of {@code regionX} to accumulate
 * @param regionX indices; each is multiplied by 4 to form an offset into {@code sum}
 * @param dst destination array receiving the accumulated lanes at offset 0
 */
static void sumAbs(int[] sum, int count, int[] regionX, int[] dst) {
// Bail out early so the loop below never indexes past the end of regionX.
if (count > regionX.length) return;
// The accumulator is seeded from entry 0, so the loop starts at 1.
// NOTE(review): regionX[0] is read even when count <= 0 (an empty regionX would throw
// here) - confirm callers always pass count >= 1.
IntVector acc = IntVector.fromArray(VI4, sum, regionX[0] * 4);
for (int i = 1; i < count; i++) {
acc = acc.add(IntVector.fromArray(VI4, sum, regionX[i] * 4));
}
acc.intoArray(dst, 0);
}
// Largest value representable in 23 bits: (1 << 23) - 1 = 0x7FFFFF.
// NOTE(review): this field and the two vectors below are not declared final - confirm
// whether that is intentional.
private static int MAX_INT = (1 << 23) - 1;
// MAX_INT in every int lane; updateGeGeneric ANDs the reinterpreted float bits with it.
private static IntVector INTEGER_MASK = IntVector.broadcast(VIP, MAX_INT);
// MAX_INT + 1 (i.e. 2^23) in every float lane; added to x before the bit reinterpretation.
// NOTE(review): presumably the "add 2^23, keep the low 23 bits" float-to-int trick, which
// would only be valid for inputs in [0, 2^23) - confirm.
private static FloatVector IMPLICIT_ONE = FloatVector.broadcast(VFP,
MAX_INT + 1);
// x >= (d - y * ny) / nx
static void updateGeGeneric(int angle, int d, float[] regionY, float[]
regionX0f,
float[] regionX1f, int[] rowOffset, int[] regionX, int count, int kappa) {
FloatVector mNyNx = FloatVector.broadcast(VFP, SinCos.MINUS_COT[angle]);
FloatVector dNx = FloatVector.broadcast(VFP, (float)(d * SinCos.INV_SIN[
angle]));
FloatVector k = FloatVector.broadcast(VFP, kappa);
for (int i = 0; i < count; i += STEP) {
FloatVector y = FloatVector.fromArray(VFP, regionY, i);
FloatVector x0 = FloatVector.fromArray(VFP, regionX0f, i);
FloatVector x1 = FloatVector.fromArray(VFP, regionX1f, i);
// BAD
IntVector off = IntVector.fromArray(VIP, rowOffset, i);
FloatVector x = y.fma(mNyNx, dNx).max(x0).min(x1);
IntVector xi = x.add(IMPLICIT_ONE).viewAsIntegralLanes().and(INTEGER_MASK);
IntVector xOff = xi.add(off);
xOff.intoArray(regionX, i);
// GOOD
//FloatVector x = y.fma(mNyNx, dNx).max(x0).min(x1);
//FloatVector xOff = y.fma(k, x);
//xOff.add(IMPLICIT_ONE).viewAsIntegralLanes().and(INTEGER_MASK).intoArray(regionX,
i);
}
}
<<< Example end
Depending on which alternative is chosen in the method "updateGeGeneric" (BAD or
GOOD), the generated code (hot loop) for the other method, "sumAbs", looks quite
different:
GOOD
0x000000012396a380: mov 0x10(%rcx,%rdi,4),%eax
0x000000012396a384: shl $0x2,%eax
0x000000012396a387: cmp %ebx,%eax
0x000000012396a389: jae 0x000000012396a45b
0x000000012396a38f: mov 0x14(%rcx,%rdi,4),%r10d ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
0x000000012396a394: vpaddd 0x10(%rsi,%rax,4),%xmm0,%xmm0;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
0x000000012396a39a: shl $0x2,%r10d
0x000000012396a39e: xchg %ax,%ax
0x000000012396a3a0: cmp %ebx,%r10d
0x000000012396a3a3: jae 0x000000012396a462
0x000000012396a3a9: mov 0x18(%rcx,%rdi,4),%eax ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
0x000000012396a3ad: vpaddd 0x10(%rsi,%r10,4),%xmm0,%xmm0;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
0x000000012396a3b4: shl $0x2,%eax
0x000000012396a3b7: cmp %ebx,%eax
0x000000012396a3b9: jae 0x000000012396a458
0x000000012396a3bf: mov 0x1c(%rcx,%rdi,4),%r10d ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
0x000000012396a3c4: vpaddd 0x10(%rsi,%rax,4),%xmm0,%xmm0;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
0x000000012396a3ca: shl $0x2,%r10d
0x000000012396a3ce: cmp %ebx,%r10d
0x000000012396a3d1: jae 0x000000012396a466 ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
0x000000012396a3d7: vpaddd 0x10(%rsi,%r10,4),%xmm0,%xmm0;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
0x000000012396a3de: add $0x4,%edi
0x000000012396a3e1: cmp %ebp,%edi
0x000000012396a3e3: jl 0x000000012396a380
;
------------------------------------------------------------------------------------------------------------------------------------
BAD
0x000000011a9b1e30: vmovdqu %xmm0,%xmm1
0x000000011a9b1e34: mov 0x10(%r8,%rbp,4),%r11d ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
0x000000011a9b1e39: shl $0x2,%r11d
0x000000011a9b1e3d: data16 xchg %ax,%ax
0x000000011a9b1e40: cmp %r9d,%r11d
0x000000011a9b1e43: jae 0x000000011a9b20a1 ;*goto
{reexecute=0 rethrow=0 return_oop=0}
; -
ru.eustas.twim.EncoderSimd::sumAbs at 53
0x000000011a9b1e49: mov 0x130(%r15),%rdx
0x000000011a9b1e50: mov %rdx,%rdi
0x000000011a9b1e53: add $0x20,%rdi ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
0x000000011a9b1e57: vmovdqu 0x10(%rcx,%r11,4),%xmm2
0x000000011a9b1e5e: xchg %ax,%ax
0x000000011a9b1e60: cmp 0x140(%r15),%rdi
0x000000011a9b1e67: jae 0x000000011a9b1fb0 ;*goto
{reexecute=0 rethrow=0 return_oop=0}
; -
ru.eustas.twim.EncoderSimd::sumAbs at 53
0x000000011a9b1e6d: mov %rdi,0x130(%r15)
0x000000011a9b1e74: prefetchw 0xc0(%rdi)
0x000000011a9b1e7b: movq $0x1,(%rdx)
0x000000011a9b1e82: prefetchw 0x100(%rdi)
0x000000011a9b1e89: movl $0x1165,0x8(%rdx) ;
{metadata({type array int})}
0x000000011a9b1e90: prefetchw 0x140(%rdi)
0x000000011a9b1e97: movl $0x4,0xc(%rdx)
0x000000011a9b1e9e: prefetchw 0x180(%rdi)
0x000000011a9b1ea5: mov %r12,0x10(%rdx)
0x000000011a9b1ea9: mov %r12,0x18(%rdx) ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
0x000000011a9b1ead: vpaddd %xmm2,%xmm1,%xmm0 ;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
0x000000011a9b1eb1: vmovdqu %xmm0,0x10(%rdx)
0x000000011a9b1eb6: mov 0x130(%r15),%rax
0x000000011a9b1ebd: mov %rax,%r11
0x000000011a9b1ec0: add $0x10,%r11
0x000000011a9b1ec4: cmp 0x140(%r15),%r11
0x000000011a9b1ecb: jae 0x000000011a9b1f45 ;*goto
{reexecute=0 rethrow=0 return_oop=0}
; -
ru.eustas.twim.EncoderSimd::sumAbs at 53
0x000000011a9b1ed1: mov %r11,0x130(%r15)
0x000000011a9b1ed8: prefetchw 0xc0(%r11)
0x000000011a9b1ee0: movq $0x1,(%rax)
0x000000011a9b1ee7: movl $0x18e287,0x8(%rax) ;
{metadata('jdk/incubator/vector/Int128Vector')}
0x000000011a9b1eee: mov %rdx,%r11
0x000000011a9b1ef1: shr $0x3,%r11
0x000000011a9b1ef5: mov %r11d,0xc(%rax) ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
0x000000011a9b1ef9: inc %ebp
0x000000011a9b1efb: cmp %ebx,%ebp
0x000000011a9b1efd: data16 xchg %ax,%ax
0x000000011a9b1f00: jl 0x000000011a9b1e30 ;*goto
{reexecute=0 rethrow=0 return_oop=0}
; -
ru.eustas.twim.EncoderSimd::sumAbs at 53
Best regards,
Eugene.
More information about the panama-dev
mailing list