Trying out Vector API with ShiftOr algorithm

Tue Mar 14 09:41:01 UTC 2023

Just wanted to share our early experience with incubator Vector API.

Our team been pondering if we could use vectorization in our search
engine for a while. The key primitive is based on ShiftOR algorithm (
https://www-igm.univ-mlv.fr/~lecroq/string/node6.html) with a few
extensions. In brief with a few bit manipulations + lookup table access
it can match any substring of length of up to 64 bytes case-
insensitively. While vectorizing the main loop is tricky it would seem
that we could use 256 bit vectors to match up to 4 strings in parallel.

For our algorithm the strightforward rewrite to Vector API produced
code that was about x10 times slower for the case of a single string.
To simplify analisys I decided to prepare a pure ShiftOR implementation
in the hope that it would be easier to pinpoint any problems with
implementation or vector API codegen itself. 

For the case of a single string I would have expected roughly the same
performance. Yes, the lookup table with masks gets 4x bigger but it
should still fit in L1 cache easily (8kb), plus we really only using
the lower half (ASCII). In reality I see the following in my JMH
benchmark:

Benchmark                  Mode  Cnt         Score         Error  Units
ShiftOrBench.scalar       thrpt    5  37906697,202 ± 4141514,236  ops/s
ShiftOrBench.vector       thrpt    5  14422386,834 ±  690796,099  ops/s

Sources and assembly listings for hottest regions are attached.

The main loop in scalar version is the following:

long state = -1L;
for (byte ch : haystack) {
    state <<= 1;
    state |= filter[ch];
    if ((state & finishMask) == 0) return true;
}
return false;

Vector version is:

VectorSpecies<Long> species = LongVector.SPECIES_256;
LongVector state = LongVector.broadcast(species, -1L);
LongVector finish = LongVector.fromArray(species, finishMask, 0);
LongVector zero = LongVector.zero(species);
for (byte ch : haystack) {
    state = state.lanewise(VectorOperators.LSHL, 1);
    LongVector mask = LongVector.fromArray(species, filter, (int)ch *
4);
    state = state.or(mask);
    if (state.and(finish).eq(zero).anyTrue()) return true;
}
return false;

Looking at vector ASM I see certain suboptimal things such as this:
   vptest %ymm4,%ymm7
   setne  %r11b
   movzbl %r11b,%r11d
   test   %r11d,%r11d
   jne    0x00007f202472310f

Here we have ZF flag right after vptest, then put it in r11b, extend
it, only to test it again and finally do a conditional jump. There is
more, but I cannot as easily point my finger at something in
particular.

So the question is mostly - are my expectations wrong or is it just a
work in progress for now?

--
Dmitry Olshansky
-------------- next part --------------
A non-text attachment was scrubbed...
Name: shiftor.zip
Type: application/zip
Size: 2741 bytes
Desc: shiftor.zip
URL: <https://mail.openjdk.org/pipermail/panama-dev/attachments/20230314/76633a47/shiftor-0001.zip>
-------------- next part --------------
scalar_thrpt_jmhStub, version 4, compile id 603 

   mov    0x60(%rsp),%rdx
   xchg   %ax,%ax
   movzbl 0x94(%rdx),%r11d             
   test   %r11d,%r11d
   jne    0x00007f8e38706654           
   mov    $0x1,%ebx
   jmp    0x00007f8e3870642e
   mov    $0x1,%r11d                   
   vmovq  %xmm2,%rdx
   movzbl 0x94(%rdx),%r11d             
   mov    0x380(%r15),%r10
   vmovq  %xmm3,%rbx
   add    $0x1,%rbx                    
   test   %eax,(%r10)                  
   xchg   %ax,%ax
   test   %r11d,%r11d
   jne    0x00007f8e38706659           
   vmovq  %xmm1,%rdi                   
   vmovq  %xmm0,%r9
   mov    0xc(%r9),%r11d               
   mov    0x14(%r9),%r8d               
   nopl   0x0(%rax,%rax,1)
   mov    0x8(%r12,%r8,8),%ecx         
   cmp    $0x102f3e0,%ecx              
   jne    0x00007f8e387066df           
   mov    0xc(%r12,%r11,8),%ecx        
   lea    (%r12,%r8,8),%rbp            
   test   %ecx,%ecx
   nopl   0x0(%rax)
   jbe    0x00007f8e38706790           
   mov    %ecx,%r10d
   dec    %r10d
   cmp    %ecx,%r10d
   jae    0x00007f8e3870669d
   vmovq  %rdx,%xmm2
   vmovq  %rdi,%xmm1
   mov    0xc(%rbp),%r14d              
   mov    0xc(%r12,%r14,8),%r9d        
   movsbl 0x10(%r12,%r11,8),%r8d       
   cmp    %r9d,%r8d
   jae    0x00007f8e387066d0
   vmovd  %r11d,%xmm4
   vmovq  %rbx,%xmm3
   mov    0x10(%rbp),%rbx              
   lea    (%r12,%r14,8),%rdx
   mov    $0xfffffffffffffffe,%rdi
   or     0x10(%rdx,%r8,8),%rdi        
   mov    %rdi,%r11
   and    %rbx,%r11
   nopl   0x0(%rax,%rax,1)
   test   %r11,%r11
   je     0x00007f8e387063f8           
   mov    %ecx,%esi
   add    $0xfffffffd,%esi
   cmp    %esi,%r10d
   mov    $0x80000000,%r11d
   cmovl  %r11d,%esi
   vmovd  %xmm4,%r10d
   lea    (%r12,%r10,8),%rax           
   cmp    $0x1,%esi
   jle    0x00007f8e38706710
   shl    %rdi
   mov    $0x1,%r10d
   mov    %esi,%r11d
   sub    %r10d,%r11d
   xor    %r8d,%r8d
   cmp    %r10d,%esi
   cmovl  %r8d,%r11d
   cmp    $0xfa0,%r11d
   mov    $0xfa0,%r8d
   cmova  %r8d,%r11d
   add    %r10d,%r11d
   jmp    0x00007f8e38706523
   nopl   0x0(%rax)
   mov    %r8,%rdi                     
   movslq %r10d,%r13
   movsbl 0x10(%rax,%r13,1),%r8d       
   cmp    %r9d,%r8d
   jae    0x00007f8e38706601
   or     0x10(%rdx,%r8,8),%rdi        
   mov    %rdi,%r8
   and    %rbx,%r8
   test   %r8,%r8
   je     0x00007f8e387063f8           
   movsbl 0x11(%rax,%r13,1),%r8d       
   shl    %rdi                         
   cmp    %r9d,%r8d
   jae    0x00007f8e387065f8
   or     0x10(%rdx,%r8,8),%rdi        
   mov    %rdi,%r8
   and    %rbx,%r8
   test   %r8,%r8
   je     0x00007f8e387063f8           
   movsbl 0x12(%rax,%r13,1),%r8d       
   shl    %rdi                         
   cmp    %r9d,%r8d
   nopl   0x0(%rax,%rax,1)
   jae    0x00007f8e387065fd
   or     0x10(%rdx,%r8,8),%rdi        
   mov    %rdi,%r8
   and    %rbx,%r8
   test   %r8,%r8
   je     0x00007f8e387063f8           
   movsbl 0x13(%rax,%r13,1),%r8d       
   shl    %rdi                         
   cmp    %r9d,%r8d
   jae    0x00007f8e387065f4
   or     0x10(%rdx,%r8,8),%rdi        
   mov    %rdi,%r8
   and    %rbx,%r8
   test   %r8,%r8
   nopw   0x0(%rax,%rax,1)
   je     0x00007f8e387063f8           
   mov    %rdi,%r8
   shl    %r8                          
   add    $0x4,%r10d                   
   cmp    %r11d,%r10d
   jl     0x00007f8e38706520           
   mov    0x380(%r15),%r11             
-------------- next part --------------
vector_thrpt_jmhStub, version 4, compile id 832 

   vzeroupper 
   add    $0xc0,%rsp
   pop    %rbp
   cmp    0x378(%r15),%rsp             
   ja     0x00007f2024723568           
   retq   
   vmovq  %xmm0,%rbx
   vmovq  %xmm1,%r11
   vmovq  %xmm2,%r8
   vmovq  %xmm3,%rdi
   xor    %r9d,%r9d                    
   movzbl 0x94(%rdi),%r10d             
   mov    0x380(%r15),%r9
   add    $0x1,%r14                    
   test   %eax,(%r9)                   
   test   %r10d,%r10d
   jne    0x00007f2024722d57           
   mov    0xc(%r11),%r13d              
   mov    0x18(%r11),%r10d             
   mov    0x8(%r12,%r10,8),%r9d        
   nopw   0x0(%rax,%rax,1)
   cmp    $0x102bab0,%r9d              
   jne    0x00007f2024723298
   lea    (%r12,%r10,8),%rdx           
   mov    0x10(%rdx),%r9d              
   mov    0xc(%r12,%r9,8),%ecx         
   mov    %ecx,%r10d
   add    $0xfffffffd,%r10d
   test   %r10d,%r10d
   jl     0x00007f20247232cc
   cmp    $0x3,%ecx
   je     0x00007f2024723254
   mov    0xc(%r12,%r13,8),%ecx        
   test   %ecx,%ecx
   nopw   0x0(%rax,%rax,1)
   jbe    0x00007f2024722daf           
   vmovdqu 0x10(%r12,%r9,8),%ymm8      
   mov    %ecx,%r10d
   dec    %r10d
   cmp    %ecx,%r10d
   jae    0x00007f2024723310           
   mov    0xc(%rdx),%r9d               
   nopl   0x0(%rax)
   mov    0xc(%r12,%r9,8),%eax         
   add    $0xfffffffd,%eax
   test   %eax,%eax
   jl     0x00007f2024723310
   vmovq  %rdi,%xmm3
   vmovq  %r8,%xmm2
   vmovq  %r11,%xmm1
   vmovq  %rbx,%xmm0
   movsbl 0x10(%r12,%r13,8),%r11d
   shl    $0x2,%r11d
   cmp    %eax,%r11d
   jae    0x00007f202472335c
   lea    (%r12,%r9,8),%rdi            
   vpor   0x10(%rdi,%r11,8),%ymm5,%ymm9
   vpand  %ymm8,%ymm9,%ymm7
   vpcmpeqq %ymm6,%ymm7,%ymm7
   vptest %ymm4,%ymm7
   setne  %r11b
   movzbl %r11b,%r11d
   test   %r11d,%r11d
   nopl   0x0(%rax)
   jne    0x00007f202472337e
   mov    %ecx,%esi
   add    $0xfffffffd,%esi
   cmp    %esi,%r10d
   mov    $0x80000000,%r11d
   cmovl  %r11d,%esi
   lea    (%r12,%r13,8),%rbx
   nopl   0x0(%rax)
   cmp    $0x1,%esi                    
   jle    0x00007f2024723368
   mov    $0x1,%r8d                    
   mov    %esi,%ebp
   sub    %r8d,%ebp
   xor    %r10d,%r10d
   cmp    %r8d,%esi
   cmovl  %r10d,%ebp
   cmp    $0xfa0,%ebp
   mov    $0xfa0,%r10d
   cmova  %r10d,%ebp
   add    %r8d,%ebp                    
   vpsllq $0x1,%ymm9,%ymm7             
   movslq %r8d,%r10                    
   movsbl 0x10(%rbx,%r10,1),%r11d
   shl    $0x2,%r11d
   cmp    %eax,%r11d
   jae    0x00007f2024723099           
   vpor   0x10(%rdi,%r11,8),%ymm7,%ymm9
   vpand  %ymm8,%ymm9,%ymm7
   vpcmpeqq %ymm6,%ymm7,%ymm7
   vptest %ymm4,%ymm7
   setne  %r11b
   movzbl %r11b,%r11d
   test   %r11d,%r11d
   jne    0x00007f202472310f
   movsbl 0x11(%rbx,%r10,1),%r11d      
   vpsllq $0x1,%ymm9,%ymm7             
   shl    $0x2,%r11d
   cmp    %eax,%r11d
   jae    0x00007f202472308a           
   vpor   0x10(%rdi,%r11,8),%ymm7,%ymm9
   vpand  %ymm8,%ymm9,%ymm7
   vpcmpeqq %ymm6,%ymm7,%ymm7
   vptest %ymm4,%ymm7
   setne  %r11b
   movzbl %r11b,%r11d
   test   %r11d,%r11d
   jne    0x00007f2024723100
   movsbl 0x12(%rbx,%r10,1),%r11d      
   vpsllq $0x1,%ymm9,%ymm7             
   shl    $0x2,%r11d
   cmp    %eax,%r11d
   jae    0x00007f202472308f           
   vpor   0x10(%rdi,%r11,8),%ymm7,%ymm9
   vpand  %ymm8,%ymm9,%ymm7
   vpcmpeqq %ymm6,%ymm7,%ymm7
   vptest %ymm4,%ymm7
   setne  %r11b
   movzbl %r11b,%r11d
   test   %r11d,%r11d
   jne    0x00007f2024723105
   movsbl 0x13(%rbx,%r10,1),%r11d      
   vpsllq $0x1,%ymm9,%ymm7             
   shl    $0x2,%r11d
   cmp    %eax,%r11d
   jae    0x00007f2024723095           
   vpor   0x10(%rdi,%r11,8),%ymm7,%ymm9
   vpand  %ymm8,%ymm9,%ymm7
   vpcmpeqq %ymm6,%ymm7,%ymm7
   vptest %ymm4,%ymm7                  
   setne  %r11b
   movzbl %r11b,%r11d
   test   %r11d,%r11d
   jne    0x00007f202472310b           
   add    $0x4,%r8d
   cmp    %ebp,%r8d
   nopl   0x0(%rax,%rax,1)
   jl     0x00007f2024722ef1
   mov    0x380(%r15),%r10             
   test   %eax,(%r10)                  
   cmp    %esi,%r8d
   jl     0x00007f2024722ecf
   cmp    %ecx,%r8d
   nopl   0x0(%rax)
   jge    0x00007f2024722d9b           
   movsbl 0x10(%rbx,%r8,1),%r11d
   vpsllq $0x1,%ymm9,%ymm7
   shl    $0x2,%r11d
   cmp    %eax,%r11d
   jae    0x00007f2024723373