Trying out Vector API with ShiftOr algorithm
Dmitriy Olshanskiy
d.olshanskiy at tinkoff.ru
Tue Mar 14 09:41:01 UTC 2023
Just wanted to share our early experience with incubator Vector API.
Our team been pondering if we could use vectorization in our search
engine for a while. The key primitive is based on ShiftOR algorithm (
https://www-igm.univ-mlv.fr/~lecroq/string/node6.html) with a few
extensions. In brief with a few bit manipulations + lookup table access
it can match any substring of length of up to 64 bytes case-
insensitively. While vectorizing the main loop is tricky it would seem
that we could use 256 bit vectors to match up to 4 strings in parallel.
For our algorithm the strightforward rewrite to Vector API produced
code that was about x10 times slower for the case of a single string.
To simplify analisys I decided to prepare a pure ShiftOR implementation
in the hope that it would be easier to pinpoint any problems with
implementation or vector API codegen itself.
For the case of a single string I would have expected roughly the same
performance. Yes, the lookup table with masks gets 4x bigger but it
should still fit in L1 cache easily (8kb), plus we really only using
the lower half (ASCII). In reality I see the following in my JMH
benchmark:
Benchmark Mode Cnt Score Error Units
ShiftOrBench.scalar thrpt 5 37906697,202 ± 4141514,236 ops/s
ShiftOrBench.vector thrpt 5 14422386,834 ± 690796,099 ops/s
Sources and assembly listings for hottest regions are attached.
The main loop in scalar version is the following:
long state = -1L;
for (byte ch : haystack) {
state <<= 1;
state |= filter[ch];
if ((state & finishMask) == 0) return true;
}
return false;
Vector version is:
VectorSpecies<Long> species = LongVector.SPECIES_256;
LongVector state = LongVector.broadcast(species, -1L);
LongVector finish = LongVector.fromArray(species, finishMask, 0);
LongVector zero = LongVector.zero(species);
for (byte ch : haystack) {
state = state.lanewise(VectorOperators.LSHL, 1);
LongVector mask = LongVector.fromArray(species, filter, (int)ch *
4);
state = state.or(mask);
if (state.and(finish).eq(zero).anyTrue()) return true;
}
return false;
Looking at vector ASM I see certain suboptimal things such as this:
vptest %ymm4,%ymm7
setne %r11b
movzbl %r11b,%r11d
test %r11d,%r11d
jne 0x00007f202472310f
Here we have ZF flag right after vptest, then put it in r11b, extend
it, only to test it again and finally do a conditional jump. There is
more, but I cannot as easily point my finger at something in
particular.
So the question is mostly - are my expectations wrong or is it just a
work in progress for now?
--
Dmitry Olshansky
-------------- next part --------------
A non-text attachment was scrubbed...
Name: shiftor.zip
Type: application/zip
Size: 2741 bytes
Desc: shiftor.zip
URL: <https://mail.openjdk.org/pipermail/panama-dev/attachments/20230314/76633a47/shiftor-0001.zip>
-------------- next part --------------
scalar_thrpt_jmhStub, version 4, compile id 603
mov 0x60(%rsp),%rdx
xchg %ax,%ax
movzbl 0x94(%rdx),%r11d
test %r11d,%r11d
jne 0x00007f8e38706654
mov $0x1,%ebx
jmp 0x00007f8e3870642e
mov $0x1,%r11d
vmovq %xmm2,%rdx
movzbl 0x94(%rdx),%r11d
mov 0x380(%r15),%r10
vmovq %xmm3,%rbx
add $0x1,%rbx
test %eax,(%r10)
xchg %ax,%ax
test %r11d,%r11d
jne 0x00007f8e38706659
vmovq %xmm1,%rdi
vmovq %xmm0,%r9
mov 0xc(%r9),%r11d
mov 0x14(%r9),%r8d
nopl 0x0(%rax,%rax,1)
mov 0x8(%r12,%r8,8),%ecx
cmp $0x102f3e0,%ecx
jne 0x00007f8e387066df
mov 0xc(%r12,%r11,8),%ecx
lea (%r12,%r8,8),%rbp
test %ecx,%ecx
nopl 0x0(%rax)
jbe 0x00007f8e38706790
mov %ecx,%r10d
dec %r10d
cmp %ecx,%r10d
jae 0x00007f8e3870669d
vmovq %rdx,%xmm2
vmovq %rdi,%xmm1
mov 0xc(%rbp),%r14d
mov 0xc(%r12,%r14,8),%r9d
movsbl 0x10(%r12,%r11,8),%r8d
cmp %r9d,%r8d
jae 0x00007f8e387066d0
vmovd %r11d,%xmm4
vmovq %rbx,%xmm3
mov 0x10(%rbp),%rbx
lea (%r12,%r14,8),%rdx
mov $0xfffffffffffffffe,%rdi
or 0x10(%rdx,%r8,8),%rdi
mov %rdi,%r11
and %rbx,%r11
nopl 0x0(%rax,%rax,1)
test %r11,%r11
je 0x00007f8e387063f8
mov %ecx,%esi
add $0xfffffffd,%esi
cmp %esi,%r10d
mov $0x80000000,%r11d
cmovl %r11d,%esi
vmovd %xmm4,%r10d
lea (%r12,%r10,8),%rax
cmp $0x1,%esi
jle 0x00007f8e38706710
shl %rdi
mov $0x1,%r10d
mov %esi,%r11d
sub %r10d,%r11d
xor %r8d,%r8d
cmp %r10d,%esi
cmovl %r8d,%r11d
cmp $0xfa0,%r11d
mov $0xfa0,%r8d
cmova %r8d,%r11d
add %r10d,%r11d
jmp 0x00007f8e38706523
nopl 0x0(%rax)
mov %r8,%rdi
movslq %r10d,%r13
movsbl 0x10(%rax,%r13,1),%r8d
cmp %r9d,%r8d
jae 0x00007f8e38706601
or 0x10(%rdx,%r8,8),%rdi
mov %rdi,%r8
and %rbx,%r8
test %r8,%r8
je 0x00007f8e387063f8
movsbl 0x11(%rax,%r13,1),%r8d
shl %rdi
cmp %r9d,%r8d
jae 0x00007f8e387065f8
or 0x10(%rdx,%r8,8),%rdi
mov %rdi,%r8
and %rbx,%r8
test %r8,%r8
je 0x00007f8e387063f8
movsbl 0x12(%rax,%r13,1),%r8d
shl %rdi
cmp %r9d,%r8d
nopl 0x0(%rax,%rax,1)
jae 0x00007f8e387065fd
or 0x10(%rdx,%r8,8),%rdi
mov %rdi,%r8
and %rbx,%r8
test %r8,%r8
je 0x00007f8e387063f8
movsbl 0x13(%rax,%r13,1),%r8d
shl %rdi
cmp %r9d,%r8d
jae 0x00007f8e387065f4
or 0x10(%rdx,%r8,8),%rdi
mov %rdi,%r8
and %rbx,%r8
test %r8,%r8
nopw 0x0(%rax,%rax,1)
je 0x00007f8e387063f8
mov %rdi,%r8
shl %r8
add $0x4,%r10d
cmp %r11d,%r10d
jl 0x00007f8e38706520
mov 0x380(%r15),%r11
-------------- next part --------------
vector_thrpt_jmhStub, version 4, compile id 832
vzeroupper
add $0xc0,%rsp
pop %rbp
cmp 0x378(%r15),%rsp
ja 0x00007f2024723568
retq
vmovq %xmm0,%rbx
vmovq %xmm1,%r11
vmovq %xmm2,%r8
vmovq %xmm3,%rdi
xor %r9d,%r9d
movzbl 0x94(%rdi),%r10d
mov 0x380(%r15),%r9
add $0x1,%r14
test %eax,(%r9)
test %r10d,%r10d
jne 0x00007f2024722d57
mov 0xc(%r11),%r13d
mov 0x18(%r11),%r10d
mov 0x8(%r12,%r10,8),%r9d
nopw 0x0(%rax,%rax,1)
cmp $0x102bab0,%r9d
jne 0x00007f2024723298
lea (%r12,%r10,8),%rdx
mov 0x10(%rdx),%r9d
mov 0xc(%r12,%r9,8),%ecx
mov %ecx,%r10d
add $0xfffffffd,%r10d
test %r10d,%r10d
jl 0x00007f20247232cc
cmp $0x3,%ecx
je 0x00007f2024723254
mov 0xc(%r12,%r13,8),%ecx
test %ecx,%ecx
nopw 0x0(%rax,%rax,1)
jbe 0x00007f2024722daf
vmovdqu 0x10(%r12,%r9,8),%ymm8
mov %ecx,%r10d
dec %r10d
cmp %ecx,%r10d
jae 0x00007f2024723310
mov 0xc(%rdx),%r9d
nopl 0x0(%rax)
mov 0xc(%r12,%r9,8),%eax
add $0xfffffffd,%eax
test %eax,%eax
jl 0x00007f2024723310
vmovq %rdi,%xmm3
vmovq %r8,%xmm2
vmovq %r11,%xmm1
vmovq %rbx,%xmm0
movsbl 0x10(%r12,%r13,8),%r11d
shl $0x2,%r11d
cmp %eax,%r11d
jae 0x00007f202472335c
lea (%r12,%r9,8),%rdi
vpor 0x10(%rdi,%r11,8),%ymm5,%ymm9
vpand %ymm8,%ymm9,%ymm7
vpcmpeqq %ymm6,%ymm7,%ymm7
vptest %ymm4,%ymm7
setne %r11b
movzbl %r11b,%r11d
test %r11d,%r11d
nopl 0x0(%rax)
jne 0x00007f202472337e
mov %ecx,%esi
add $0xfffffffd,%esi
cmp %esi,%r10d
mov $0x80000000,%r11d
cmovl %r11d,%esi
lea (%r12,%r13,8),%rbx
nopl 0x0(%rax)
cmp $0x1,%esi
jle 0x00007f2024723368
mov $0x1,%r8d
mov %esi,%ebp
sub %r8d,%ebp
xor %r10d,%r10d
cmp %r8d,%esi
cmovl %r10d,%ebp
cmp $0xfa0,%ebp
mov $0xfa0,%r10d
cmova %r10d,%ebp
add %r8d,%ebp
vpsllq $0x1,%ymm9,%ymm7
movslq %r8d,%r10
movsbl 0x10(%rbx,%r10,1),%r11d
shl $0x2,%r11d
cmp %eax,%r11d
jae 0x00007f2024723099
vpor 0x10(%rdi,%r11,8),%ymm7,%ymm9
vpand %ymm8,%ymm9,%ymm7
vpcmpeqq %ymm6,%ymm7,%ymm7
vptest %ymm4,%ymm7
setne %r11b
movzbl %r11b,%r11d
test %r11d,%r11d
jne 0x00007f202472310f
movsbl 0x11(%rbx,%r10,1),%r11d
vpsllq $0x1,%ymm9,%ymm7
shl $0x2,%r11d
cmp %eax,%r11d
jae 0x00007f202472308a
vpor 0x10(%rdi,%r11,8),%ymm7,%ymm9
vpand %ymm8,%ymm9,%ymm7
vpcmpeqq %ymm6,%ymm7,%ymm7
vptest %ymm4,%ymm7
setne %r11b
movzbl %r11b,%r11d
test %r11d,%r11d
jne 0x00007f2024723100
movsbl 0x12(%rbx,%r10,1),%r11d
vpsllq $0x1,%ymm9,%ymm7
shl $0x2,%r11d
cmp %eax,%r11d
jae 0x00007f202472308f
vpor 0x10(%rdi,%r11,8),%ymm7,%ymm9
vpand %ymm8,%ymm9,%ymm7
vpcmpeqq %ymm6,%ymm7,%ymm7
vptest %ymm4,%ymm7
setne %r11b
movzbl %r11b,%r11d
test %r11d,%r11d
jne 0x00007f2024723105
movsbl 0x13(%rbx,%r10,1),%r11d
vpsllq $0x1,%ymm9,%ymm7
shl $0x2,%r11d
cmp %eax,%r11d
jae 0x00007f2024723095
vpor 0x10(%rdi,%r11,8),%ymm7,%ymm9
vpand %ymm8,%ymm9,%ymm7
vpcmpeqq %ymm6,%ymm7,%ymm7
vptest %ymm4,%ymm7
setne %r11b
movzbl %r11b,%r11d
test %r11d,%r11d
jne 0x00007f202472310b
add $0x4,%r8d
cmp %ebp,%r8d
nopl 0x0(%rax,%rax,1)
jl 0x00007f2024722ef1
mov 0x380(%r15),%r10
test %eax,(%r10)
cmp %esi,%r8d
jl 0x00007f2024722ecf
cmp %ecx,%r8d
nopl 0x0(%rax)
jge 0x00007f2024722d9b
movsbl 0x10(%rbx,%r8,1),%r11d
vpsllq $0x1,%ymm9,%ymm7
shl $0x2,%r11d
cmp %eax,%r11d
jae 0x00007f2024723373
More information about the panama-dev
mailing list