[vectorIntrinsics] C2 is fragile

Eugene Kluchnikov eustas.ru at gmail.com
Sun Mar 14 22:58:06 UTC 2021


I observe unexpected behaviour: changing implementation details in one
method causes a severe performance regression in another (independent) method.

Similar things seem to happen when a sampling profiling agent is activated.

Example begin >>>
// 256-bit species for the float and int vectors used in updateGeGeneric.
private static final VectorSpecies<Float> VFP = FloatVector.SPECIES_256;
private static final VectorSpecies<Integer> VIP = IntVector.SPECIES_256;
// 128-bit int species used in sumAbs.
private static final VectorSpecies<Integer> VI4 = IntVector.SPECIES_128;

// Number of float lanes in VFP; used as the loop stride in updateGeGeneric.
static final int STEP = VFP.length();

/**
 * Adds up {@code count} 128-bit int vectors read from {@code sum} and stores
 * the total into {@code dst} starting at index 0.
 *
 * The i-th vector is loaded from {@code sum} at element offset
 * {@code regionX[i] * 4}.
 *
 * @param sum     source array the VI4 vectors are loaded from
 * @param count   number of entries of {@code regionX} to use
 * @param regionX indices into {@code sum}; each is multiplied by 4 before use
 * @param dst     destination array; receives one VI4 vector at index 0
 */
static void sumAbs(int[] sum, int count, int[] regionX, int[] dst) {
// Return without writing dst when count exceeds the number of available indices.
if (count > regionX.length) return;
// The first vector is loaded unconditionally, so regionX[0] is read even when count <= 0.
IntVector acc = IntVector.fromArray(VI4, sum, regionX[0] * 4);
// Hot loop: this is the loop whose compiled code is compared in the listings below.
for (int i = 1; i < count; i++) {
acc = acc.add(IntVector.fromArray(VI4, sum, regionX[i] * 4));
}
acc.intoArray(dst, 0);
}

// 2^23 - 1 (0x7FFFFF): all of the low 23 bits set.
// NOTE(review): unlike the species constants, these three fields are not declared final - confirm whether that is intentional.
private static int MAX_INT = (1 << 23) - 1;
// MAX_INT broadcast to every int lane; used as a lane-wise AND mask in updateGeGeneric.
private static IntVector INTEGER_MASK = IntVector.broadcast(VIP, MAX_INT);
// 2^23 (MAX_INT + 1) broadcast to every float lane; added to x in updateGeGeneric before its bits are masked.
private static FloatVector IMPLICIT_ONE = FloatVector.broadcast(VFP,
MAX_INT + 1);

// x >= (d - y * ny) / nx
static void updateGeGeneric(int angle, int d, float[] regionY, float[]
regionX0f,
float[] regionX1f, int[] rowOffset, int[] regionX, int count, int kappa) {
FloatVector mNyNx = FloatVector.broadcast(VFP, SinCos.MINUS_COT[angle]);
FloatVector dNx = FloatVector.broadcast(VFP, (float)(d * SinCos.INV_SIN[
angle]));
FloatVector k = FloatVector.broadcast(VFP, kappa);
for (int i = 0; i < count; i += STEP) {
FloatVector y = FloatVector.fromArray(VFP, regionY, i);
FloatVector x0 = FloatVector.fromArray(VFP, regionX0f, i);
FloatVector x1 = FloatVector.fromArray(VFP, regionX1f, i);

// BAD
IntVector off = IntVector.fromArray(VIP, rowOffset, i);
FloatVector x = y.fma(mNyNx, dNx).max(x0).min(x1);
IntVector xi = x.add(IMPLICIT_ONE).viewAsIntegralLanes().and(INTEGER_MASK);
IntVector xOff = xi.add(off);
xOff.intoArray(regionX, i);

// GOOD
//FloatVector x = y.fma(mNyNx, dNx).max(x0).min(x1);
//FloatVector xOff = y.fma(k, x);
//xOff.add(IMPLICIT_ONE).viewAsIntegralLanes().and(INTEGER_MASK).intoArray(regionX,
i);
}
}
<<< Example end

Depending on the alternative chosen in the method "updateGeGeneric" (BAD or GOOD),
the generated code (hot loop) for the other method, "sumAbs", looks quite
different:

GOOD

  0x000000012396a380:   mov    0x10(%rcx,%rdi,4),%eax
  0x000000012396a384:   shl    $0x2,%eax
  0x000000012396a387:   cmp    %ebx,%eax
  0x000000012396a389:   jae    0x000000012396a45b
  0x000000012396a38f:   mov    0x14(%rcx,%rdi,4),%r10d      ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
  0x000000012396a394:   vpaddd 0x10(%rsi,%rax,4),%xmm0,%xmm0;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
                                                            ; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
  0x000000012396a39a:   shl    $0x2,%r10d
  0x000000012396a39e:   xchg   %ax,%ax
  0x000000012396a3a0:   cmp    %ebx,%r10d
  0x000000012396a3a3:   jae    0x000000012396a462
  0x000000012396a3a9:   mov    0x18(%rcx,%rdi,4),%eax       ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
  0x000000012396a3ad:   vpaddd 0x10(%rsi,%r10,4),%xmm0,%xmm0;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
                                                            ; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
  0x000000012396a3b4:   shl    $0x2,%eax
  0x000000012396a3b7:   cmp    %ebx,%eax
  0x000000012396a3b9:   jae    0x000000012396a458
  0x000000012396a3bf:   mov    0x1c(%rcx,%rdi,4),%r10d      ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
  0x000000012396a3c4:   vpaddd 0x10(%rsi,%rax,4),%xmm0,%xmm0;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
                                                            ; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
  0x000000012396a3ca:   shl    $0x2,%r10d
  0x000000012396a3ce:   cmp    %ebx,%r10d
  0x000000012396a3d1:   jae    0x000000012396a466           ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
  0x000000012396a3d7:   vpaddd 0x10(%rsi,%r10,4),%xmm0,%xmm0;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
                                                            ; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
  0x000000012396a3de:   add    $0x4,%edi
  0x000000012396a3e1:   cmp    %ebp,%edi
  0x000000012396a3e3:   jl     0x000000012396a380

;
------------------------------------------------------------------------------------------------------------------------------------

BAD

  0x000000011a9b1e30:   vmovdqu %xmm0,%xmm1
  0x000000011a9b1e34:   mov    0x10(%r8,%rbp,4),%r11d       ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
  0x000000011a9b1e39:   shl    $0x2,%r11d
  0x000000011a9b1e3d:   data16 xchg %ax,%ax
  0x000000011a9b1e40:   cmp    %r9d,%r11d
  0x000000011a9b1e43:   jae    0x000000011a9b20a1           ;*goto
{reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 53
  0x000000011a9b1e49:   mov    0x130(%r15),%rdx
  0x000000011a9b1e50:   mov    %rdx,%rdi
  0x000000011a9b1e53:   add    $0x20,%rdi                   ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
  0x000000011a9b1e57:   vmovdqu 0x10(%rcx,%r11,4),%xmm2
  0x000000011a9b1e5e:   xchg   %ax,%ax
  0x000000011a9b1e60:   cmp    0x140(%r15),%rdi
  0x000000011a9b1e67:   jae    0x000000011a9b1fb0           ;*goto
{reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 53
  0x000000011a9b1e6d:   mov    %rdi,0x130(%r15)
  0x000000011a9b1e74:   prefetchw 0xc0(%rdi)
  0x000000011a9b1e7b:   movq   $0x1,(%rdx)
  0x000000011a9b1e82:   prefetchw 0x100(%rdi)
  0x000000011a9b1e89:   movl   $0x1165,0x8(%rdx)            ;
{metadata({type array int})}
  0x000000011a9b1e90:   prefetchw 0x140(%rdi)
  0x000000011a9b1e97:   movl   $0x4,0xc(%rdx)
  0x000000011a9b1e9e:   prefetchw 0x180(%rdi)
  0x000000011a9b1ea5:   mov    %r12,0x10(%rdx)
  0x000000011a9b1ea9:   mov    %r12,0x18(%rdx)              ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
  0x000000011a9b1ead:   vpaddd %xmm2,%xmm1,%xmm0            ;*invokestatic
binaryOp {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 633)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 279)
                                                            ; -
jdk.incubator.vector.Int128Vector::lanewise at 3 (line 41)
                                                            ; -
jdk.incubator.vector.IntVector::add at 5 (line 1096)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 45
  0x000000011a9b1eb1:   vmovdqu %xmm0,0x10(%rdx)
  0x000000011a9b1eb6:   mov    0x130(%r15),%rax
  0x000000011a9b1ebd:   mov    %rax,%r11
  0x000000011a9b1ec0:   add    $0x10,%r11
  0x000000011a9b1ec4:   cmp    0x140(%r15),%r11
  0x000000011a9b1ecb:   jae    0x000000011a9b1f45           ;*goto
{reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 53
  0x000000011a9b1ed1:   mov    %r11,0x130(%r15)
  0x000000011a9b1ed8:   prefetchw 0xc0(%r11)
  0x000000011a9b1ee0:   movq   $0x1,(%rax)
  0x000000011a9b1ee7:   movl   $0x18e287,0x8(%rax)          ;
{metadata('jdk/incubator/vector/Int128Vector')}
  0x000000011a9b1eee:   mov    %rdx,%r11
  0x000000011a9b1ef1:   shr    $0x3,%r11
  0x000000011a9b1ef5:   mov    %r11d,0xc(%rax)              ;*invokestatic
store {reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
jdk.incubator.vector.IntVector::intoArray at 42 (line 2962)
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 60
  0x000000011a9b1ef9:   inc    %ebp
  0x000000011a9b1efb:   cmp    %ebx,%ebp
  0x000000011a9b1efd:   data16 xchg %ax,%ax
  0x000000011a9b1f00:   jl     0x000000011a9b1e30           ;*goto
{reexecute=0 rethrow=0 return_oop=0}
                                                            ; -
ru.eustas.twim.EncoderSimd::sumAbs at 53

Best regards,
  Eugene.


More information about the panama-dev mailing list