IntVector.fromValues is not optimized away ?

Mon May 11 21:42:25 UTC 2020

Thanks, very interesting.  Sandhya and her colleagues are better qualified than I to comment accurately as to current behavior.

It does give me hope there is a better way out for fromValues, and in fact we could do that in the template code, at least for fixed sizes.  But as the vector lane count gets larger, the less useful it may become.

FWIW I can reproduce the reduction issue:

  var v1 = IntVector.fromArray(IntVector.SPECIES_64, ia, 0);
  return v1.reduceLanes(VectorOperators.XOR);

  0.07%  │  0x000000010b929d8f:   vmovq  0x10(%r12,%r10,8),%xmm0
  0.24%  │  0x000000010b929d96:   xor    %r11d,%r11d
  7.16%  │  0x000000010b929d99:   vpshufd $0x1,%xmm0,%xmm2
  0.51%  │  0x000000010b929d9e:   vpxor  %xmm0,%xmm2,%xmm2
  0.39%  │  0x000000010b929da2:   vmovd  %r11d,%xmm1
  0.16%  │  0x000000010b929da7:   vpxor  %xmm1,%xmm2,%xmm2
  7.64%  │  0x000000010b929dab:   vmovd  %xmm2,%edx

I think it's a bug in the code gen, which unnecessarily applies the identity value for the last stage of the reduction (I observe the same for the & and + operations).

Paul.

> On May 11, 2020, at 1:35 PM, forax at univ-mlv.fr wrote:
> 
> I tried several different snippets with more or less success and found several other things that should be fixed.
> 
> Option 1
>   var zero = (IntVector)IntVector.SPECIES_64.zero(); 
>   var v1 = zero.withLane(0, i1).withLane(1, i3); 
>   var v2 = zero.withLane(0, i2).withLane(1, i4); 
>   var result = v1.lanewise(VectorOperators.XOR, v2); 
>   return result.lane(0) ^ result.lane(1);
> 
> I get:
>   0x00007fb74c33693c:   mov    0x14(%rsi),%r11d 
>   0x00007fb74c336940:   mov    0x10(%rsi),%r10d 
>   0x00007fb74c336944:   mov    0x18(%rsi),%r9d 
>   0x00007fb74c336948:   mov    0xc(%rsi),%r8d 
>   0x00007fb74c33694c:   movabs $0x71963e868,%rcx            ;   {oop([I{0x000000071963e868})} 
>   0x00007fb74c336956:   vmovq  0x10(%rcx),%xmm0 
>   0x00007fb74c33695b:   vmovdqu %xmm0,%xmm1 
>   0x00007fb74c33695f:   vpinsrd $0x0,%r8d,%xmm1,%xmm1 
>   0x00007fb74c336965:   vpinsrd $0x0,%r10d,%xmm0,%xmm0 
>   0x00007fb74c33696b:   vpinsrd $0x1,%r11d,%xmm1,%xmm1 
>   0x00007fb74c336971:   vpinsrd $0x1,%r9d,%xmm0,%xmm0       ;*invokestatic extract {reexecute=0 rethrow=0 return_oop=0} 
>                                                             ; - jdk.incubator.vector.Int64Vector::laneHelper at 16 (line 482) 
>                                                             ; - jdk.incubator.vector.Int64Vector::lane at 36 (line 476) 
>                                                             ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 67 (line 19) 
>   0x00007fb74c336977:   vpxor  %xmm0,%xmm1,%xmm0            ;*invokestatic binaryOp {reexecute=0 rethrow=0 return_oop=0} 
>                                                             ; - jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 652) 
>                                                             ; - jdk.incubator.vector.Int64Vector::lanewise at 3 (line 277) 
>                                                             ; - jdk.incubator.vector.Int64Vector::lanewise at 3 (line 41) 
>                                                             ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 53 (line 18) 
>   0x00007fb74c33697b:   vmovd  %xmm0,%eax 
>   0x00007fb74c33697f:   vpextrd $0x1,%xmm0,%r10d 
>   0x00007fb74c336985:   xor    %r10d,%eax
> 
> which is not that bad, but at the same time, it seems HotSpot is not able to see that 0x10(%rcx) is zero ?
> 
> Option 2, 3 and 4:
>   using one of
>     var zero = IntVector.zero(SPECIES_64);
>     var zero = (IntVector)IntVector.SPECIES_64.broadcast(0); 
>     var zero = IntVector.broadcast(SPECIES_64, 0); 
>  all generate code that calls into the HotSpot runtime. It's an intrinsic, but my hardware doesn't seem to have an instruction for it, so
>  a call to the HS runtime is generated, which makes it super inefficient.
> It's as if the fallback Java code of the intrinsic was not used (and not inlined).
> 
> So i get a code like:
> 0x00007fa57ff04f7f:   movabs $0x719645ea0,%rdi            ;   {oop(a 'jdk/incubator/vector/IntVector$$Lambda$64+0x0000000800b71550'{0x0000000719645ea0})} 
>   0x00007fa57ff04f89:   movabs $0x719632b80,%rsi            ;   {oop(a 'java/lang/Class'{0x0000000719632b80} = 'jdk/incubator/vector/Int64Vector')} 
>   0x00007fa57ff04f93:   movabs $0x7ffd002a0,%rdx            ;   {oop(a 'java/lang/Class'{0x00000007ffd002a0} = int)} 
>   0x00007fa57ff04f9d:   mov    $0x2,%ecx 
>   0x00007fa57ff04fa2:   xor    %r8d,%r8d 
>   0x00007fa57ff04fa5:   movabs $0x7196327c8,%r9             ;   {oop(a 'jdk/incubator/vector/IntVector$IntSpecies'{0x00000007196327c8})} 
>   0x00007fa57ff04faf:   callq  0x00007fa57843f400           ; ImmutableOopMap {rbp=Oop } 
>                                                             ;*invokestatic broadcastCoerced {reexecute=0 rethrow=0
> 
> Option 5:
>   Use re-interpret shape
>       var zero = (IntVector)IntVector.SPECIES_128.zero(); 
>       var v = zero.withLane(0, i1).withLane(1, i2).withLane(2, i3).withLane(3, i4); 
>       var v1 = (IntVector)v.reinterpretShape(IntVector.SPECIES_64, 0);     <-- here
>       var v2 = (IntVector)v.reinterpretShape(IntVector.SPECIES_64, 1);     <-- and here
>       var result = v1.lanewise(VectorOperators.XOR, v2); 
>       return result.lane(0) ^ result.lane(1);
> 
> Like with fromValues, an array is still created, and there is a bunch of weird code around VectorSupport$VectorPayload::getPayload
> 
> Option 6:
>   do the reduce using reduceLanes,
>       var zero = (IntVector)IntVector.SPECIES_64.zero(); 
>       var v1 = zero.withLane(0, i1).withLane(1, i3); 
>       var v2 = zero.withLane(0, i2).withLane(1, i4); 
>       var result = v1.lanewise(VectorOperators.XOR, v2); 
>       return result.reduceLanes(VectorOperators.XOR);     <-- here
> 
> I get:
> 0x00007f62db4e4f37:   mov    %rbp,0x10(%rsp)              ;*synchronization entry 
>                                                             ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at -1 (line 38) 
>   0x00007f62db4e4f3c:   mov    0x14(%rsi),%r11d 
>   0x00007f62db4e4f40:   mov    0x10(%rsi),%r10d 
>   0x00007f62db4e4f44:   mov    0x18(%rsi),%r9d 
>   0x00007f62db4e4f48:   mov    0xc(%rsi),%r8d 
>   0x00007f62db4e4f4c:   movabs $0x71963e868,%rcx            ;   {oop([I{0x000000071963e868})} 
>   0x00007f62db4e4f56:   vmovq  0x10(%rcx),%xmm0 
>   0x00007f62db4e4f5b:   vmovdqu %xmm0,%xmm1 
>   0x00007f62db4e4f5f:   vpinsrd $0x0,%r8d,%xmm1,%xmm1 
>   0x00007f62db4e4f65:   vpinsrd $0x0,%r10d,%xmm0,%xmm0 
>   0x00007f62db4e4f6b:   vpinsrd $0x1,%r11d,%xmm1,%xmm1 
>   0x00007f62db4e4f71:   vpinsrd $0x1,%r9d,%xmm0,%xmm0 
>   0x00007f62db4e4f77:   vpxor  %xmm0,%xmm1,%xmm0 
>   0x00007f62db4e4f7b:   xor    %r11d,%r11d 
>   0x00007f62db4e4f7e:   vpshufd $0x1,%xmm0,%xmm2 
>   0x00007f62db4e4f83:   vpxor  %xmm0,%xmm2,%xmm2 
>   0x00007f62db4e4f87:   vmovd  %r11d,%xmm1 
>   0x00007f62db4e4f8c:   vpxor  %xmm1,%xmm2,%xmm2 
>   0x00007f62db4e4f90:   vmovd  %xmm2,%eax
> 
> you can notice that after the first vpxor, you have two (not one) other vpxor, if my assembler fu is correct, it's a xor between the vector and 0 because the reduce is done using the neutral element 0 instead of in between the values inside the AVX register.
> 
> if instead of using reduceLanes, I do the loop myself, I get the right code for the reduction
>       var zero = (IntVector)IntVector.SPECIES_64.zero(); 
>       var v1 = zero.withLane(0, i1).withLane(1, i3); 
>       var v2 = zero.withLane(0, i2).withLane(1, i4); 
>       var result = v1.lanewise(VectorOperators.XOR, v2); 
>       var acc = result.lane(0); 
>       for(var i = 1; i < IntVector.SPECIES_64.length(); i++) {     <-- loop instead of reduceLanes
>         acc = acc ^ result.lane(i); 
>       } 
>       return acc;
> 
> I get:
>   0x00007fa378331db7:   mov    %rbp,0x10(%rsp)              ;*synchronization entry 
>                                                             ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at -1 (line 26) 
>   0x00007fa378331dbc:   mov    0x14(%rsi),%r11d 
>   0x00007fa378331dc0:   mov    0x10(%rsi),%r10d 
>   0x00007fa378331dc4:   mov    0x18(%rsi),%r9d 
>   0x00007fa378331dc8:   mov    0xc(%rsi),%r8d 
>   0x00007fa378331dcc:   movabs $0x71963e868,%rcx            ;   {oop([I{0x000000071963e868})} 
>   0x00007fa378331dd6:   vmovq  0x10(%rcx),%xmm0 
>   0x00007fa378331ddb:   vmovdqu %xmm0,%xmm1 
>   0x00007fa378331ddf:   vpinsrd $0x0,%r8d,%xmm1,%xmm1 
>   0x00007fa378331de5:   vpinsrd $0x0,%r10d,%xmm0,%xmm0 
>   0x00007fa378331deb:   vpinsrd $0x1,%r11d,%xmm1,%xmm1 
>   0x00007fa378331df1:   vpinsrd $0x1,%r9d,%xmm0,%xmm0       ;*invokestatic extract {reexecute=0 rethrow=0 return_oop=0} 
>                                                             ; - jdk.incubator.vector.Int64Vector::laneHelper at 16 (line 482) 
>                                                             ; - jdk.incubator.vector.Int64Vector::lane at 30 (line 475) 
>                                                             ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 88 (line 32) 
>   0x00007fa378331df7:   vpxor  %xmm0,%xmm1,%xmm0            ;*invokestatic binaryOp {reexecute=0 rethrow=0 return_oop=0} 
>                                                             ; - jdk.incubator.vector.IntVector::lanewiseTemplate at 244 (line 652) 
>                                                             ; - jdk.incubator.vector.Int64Vector::lanewise at 3 (line 277) 
>                                                             ; - jdk.incubator.vector.Int64Vector::lanewise at 3 (line 41) 
>                                                             ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 53 (line 29) 
>   0x00007fa378331dfb:   vpextrd $0x1,%xmm0,%eax 
>   0x00007fa378331e01:   vmovd  %xmm0,%r10d 
>   0x00007fa378331e06:   xor    %r10d,%eax
> 
> so for reduceLanes I think it's better to
> - loop using xor instead of vpxor
> - not use the neutral element 0
> 
> regards,
> Rémi
> 
> 
> De: "Paul Sandoz" <paul.sandoz at oracle.com>
> À: "Remi Forax" <forax at univ-mlv.fr>
> Cc: "panama-dev at openjdk.java.net'" <panama-dev at openjdk.java.net>
> Envoyé: Lundi 11 Mai 2020 20:02:14
> Objet: Re: IntVector.fromValues is not optimized away ?
> Hi Remi,
> 
> For some reason this method does not defer to the fromArray equivalent.
> 
> Can you try with the following patch?
> 
>   http://cr.openjdk.java.net/~psandoz/panama/vector-from-values-using-from-array/webrev/ <http://cr.openjdk.java.net/~psandoz/panama/vector-from-values-using-from-array/webrev/>
> 
> I shall also investigate further.
> 
> Paul.
> 
> On May 9, 2020, at 11:52 AM, Remi Forax <forax at univ-mlv.fr <mailto:forax at univ-mlv.fr>> wrote:
> 
> Hi all,
> this may be obvious but do we agree that IntVector.fromValues is not optimized, and thus really creates an array, destroying any hope of perf ?
> 
> I'm trying to see the difference between
> 
>    public int hashCode() {
>      return i1 ^ i2 ^ i3 ^ i4;
>    }
> 
> and
> 
>    public int hashCode() {
>      var v1 = IntVector.fromValues(IntVector.SPECIES_64, i1, i3);
>      var v2 = IntVector.fromValues(IntVector.SPECIES_64, i2, i4);
>      var result = v1.lanewise(VectorOperators.XOR, v2);
>      return result.lane(0) ^ result.lane(1);
>    }
> 
> but taking a look at the generated assembly (below), the allocations of the two arrays are still there,
> too bad because the last 6 instructions are more or less what I was expecting.
> 
> 
>  0x00007fbb383324dc:   mov    0x14(%rsi),%r11d             ;*getfield i3 {reexecute=0 rethrow=0 return_oop=0}
>                                                            ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 16 (line 14)
>  0x00007fbb383324e0:   mov    0xc(%rsi),%ebp
>  0x00007fbb383324e3:   mov    0x120(%r15),%r8
>  0x00007fbb383324ea:   mov    %r8,%r10
>  0x00007fbb383324ed:   add    $0x18,%r10
>  0x00007fbb383324f1:   cmp    0x130(%r15),%r10
>  0x00007fbb383324f8:   jae    0x00007fbb383325db
>  0x00007fbb383324fe:   mov    %r10,0x120(%r15)
>  0x00007fbb38332505:   prefetchw 0xc0(%r10)
>  0x00007fbb3833250d:   movq   $0x1,(%r8)
>  0x00007fbb38332514:   prefetchw 0x100(%r10)
>  0x00007fbb3833251c:   movl   $0x70cb1,0x8(%r8)            ;   {metadata({type array int})}
>  0x00007fbb38332524:   prefetchw 0x140(%r10)
>  0x00007fbb3833252c:   movl   $0x2,0xc(%r8)
>  0x00007fbb38332534:   prefetchw 0x180(%r10)
>  0x00007fbb3833253c:   mov    %ebp,0x10(%r8)
>  0x00007fbb38332540:   mov    %r11d,0x14(%r8)              ;*newarray {reexecute=0 rethrow=0 return_oop=0}
>                                                            ; - java.util.Arrays::copyOf at 1 (line 3584)
>                                                            ; - jdk.incubator.vector.IntVector::fromValues at 19 (line 553)
>                                                            ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 20 (line 14)
>  0x00007fbb38332544:   mov    0x18(%rsi),%r9d
>  0x00007fbb38332548:   mov    0x120(%r15),%rax             ;*invokestatic extract {reexecute=0 rethrow=0 return_oop=0}
>                                                            ; - jdk.incubator.vector.Int64Vector::laneHelper at 16 (line 482)
>                                                            ; - jdk.incubator.vector.Int64Vector::lane at 36 (line 476)
>                                                            ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 64 (line 17)
>  0x00007fbb3833254f:   mov    0x10(%rsi),%ebp              ;*getfield i2 {reexecute=0 rethrow=0 return_oop=0}
>                                                            ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 33 (line 15)
>  0x00007fbb38332552:   mov    %rax,%r10
>  0x00007fbb38332555:   add    $0x18,%r10
>  0x00007fbb38332559:   nopl   0x0(%rax)
>  0x00007fbb38332560:   cmp    0x130(%r15),%r10
>  0x00007fbb38332567:   jae    0x00007fbb3833260d
>  0x00007fbb3833256d:   mov    %r10,0x120(%r15)
>  0x00007fbb38332574:   prefetchw 0xc0(%r10)
>  0x00007fbb3833257c:   movq   $0x1,(%rax)
>  0x00007fbb38332583:   prefetchw 0x100(%r10)
>  0x00007fbb3833258b:   movl   $0x70cb1,0x8(%rax)           ;   {metadata({type array int})}
>  0x00007fbb38332592:   prefetchw 0x140(%r10)
>  0x00007fbb3833259a:   movl   $0x2,0xc(%rax)
>  0x00007fbb383325a1:   prefetchw 0x180(%r10)
>  0x00007fbb383325a9:   mov    %ebp,0x10(%rax)
>  0x00007fbb383325ac:   mov    %r9d,0x14(%rax)              ;*newarray {reexecute=0 rethrow=0 return_oop=0}
>                                                            ; - java.util.Arrays::copyOf at 1 (line 3584)
>                                                            ; - jdk.incubator.vector.IntVector::fromValues at 19 (line 553)
>                                                            ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 44 (line 15)
>  0x00007fbb383325b0:   vmovq  0x10(%rax),%xmm0             ;*invokestatic extract {reexecute=0 rethrow=0 return_oop=0}
>                                                            ; - jdk.incubator.vector.Int64Vector::laneHelper at 16 (line 482)
>                                                            ; - jdk.incubator.vector.Int64Vector::lane at 36 (line 476)
>                                                            ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 64 (line 17)
>  0x00007fbb383325b5:   vpxor  0x10(%r8),%xmm0,%xmm0        ;*invokespecial <init> {reexecute=0 rethrow=0 return_oop=0}
>                                                            ; - jdk.internal.vm.vector.VectorSupport$Vector::<init>@2 (line 104)
>                                                            ; - jdk.incubator.vector.Vector::<init>@2 (line 1122)
>                                                            ; - jdk.incubator.vector.AbstractVector::<init>@2 (line 67)
>                                                            ; - jdk.incubator.vector.IntVector::<init>@2 (line 55)
>                                                            ; - jdk.incubator.vector.Int64Vector::<init>@2 (line 58)
>                                                            ; - jdk.incubator.vector.Int64Vector::vectorFactory at 5 (line 169)
>                                                            ; - jdk.incubator.vector.Int64Vector::vectorFactory at 2 (line 41)
>                                                            ; - jdk.incubator.vector.IntVector$IntSpecies::vectorFactory at 5 (line 3718)
>                                                            ; - jdk.incubator.vector.IntVector::fromValues at 22 (line 553)
>                                                            ; - fr.umlv.vector.VectorizedHashCode$Data::hashCode2 at 44 (line 15)
>  0x00007fbb383325bb:   vpextrd $0x1,%xmm0,%r11d
>  0x00007fbb383325c1:   vmovd  %xmm0,%eax
>  0x00007fbb383325c5:   xor    %r11d,%eax
>  0x00007fbb383325c8:   vzeroupper 
> 
> regards,
> Rémi
> 
>