[aarch64-port-dev ] RFR: Add support for String.indexOf
Andrew Haley
aph at redhat.com
Tue Aug 19 09:18:47 UTC 2014
Hi,
On 18/08/14 17:27, Edward Nevill wrote:
>
> The following patch adds support for the String.indexOf intrinsic.
>
> It uses a combination of 2 algorithms, a simplified Boyer Moore, and
> a straightforward linear scan.
In what way is this algorithm simplified from standard Boyer-Moore?
How can we have confidence it is correct?
> Boyer Moore is used when the pattern length is >= 8 and the source
> length is >= 4 * pattern length. This heuristic seems to produce the
> best results from benchmarking.
>
> Essentially Boyer Moore generates the best results when the pattern
> length is short relative to the source but is sufficiently long to
> make the skip span worthwhile. As the pattern length grows towards
> the source length the initialisation of the pattern string tends to
> dominate.
>
> The code also contains special case code for 1, 2, 3, and 4 chars in
> the pattern.
>
> Benchmarking shows 1.96 X improvement on my synthetic benchmark
> which searches for patterns of length { 1, 2, 3, 4, 8, 12, 16, 24,
> 32, 64 } in a source string of length { 65, 129, 964 }. None of the
> patterns match in the benchmark but the patterns are contrived so
> that the initial substring of the pattern does match at several
> places in the source string.
>
> On the hotspot indexof test it show 1.89 X improvement.
>
> Tests as usual with jtreg/hotspot.
>
> OK to push?
Some comments inline.
I'm a bit concerned that we always inline this code. AFAICS if
icnt1 == -1 we might as well branch to a stub.
Andrew.
> --- CUT HERE ---
> # HG changeset patch
> # User Edward Nevill edward.nevill at linaro.org
> # Date 1408378229 -3600
> # Mon Aug 18 17:10:29 2014 +0100
> # Node ID 978b74cb5c7b233477d5ce9346c038fbccd80869
> # Parent 2dfe9abe27fe6c9231e64ebd66e7ca3e6fb5b2bf
> Add support for String.indexOf intrinsic
>
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/aarch64.ad
> --- a/src/cpu/aarch64/vm/aarch64.ad Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/aarch64.ad Mon Aug 18 17:10:29 2014 +0100
> @@ -3415,6 +3415,16 @@
> interface(CONST_INTER);
> %}
>
> +operand immI_le_4()
> +%{
> + predicate(n->get_int() <= 4);
> + match(ConI);
> +
> + op_cost(0);
> + format %{ %}
> + interface(CONST_INTER);
> +%}
> +
> operand immI_31()
> %{
> predicate(n->get_int() == 31);
> @@ -11733,6 +11743,44 @@
> ins_pipe(pipe_class_memory);
> %}
>
> +instruct string_indexof(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
> + iRegI_R0 result, iRegI tmp1, iRegI tmp2, iRegI tmp3, iRegI tmp4, rFlagsReg cr)
> +%{
> + match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
> + effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
> + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
> + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result" %}
> +
> + ins_encode %{
> + __ string_indexof($str1$$Register, $str2$$Register,
> + $cnt1$$Register, $cnt2$$Register,
> + $tmp1$$Register, $tmp2$$Register,
> + $tmp3$$Register, $tmp4$$Register,
> + -1, $result$$Register);
> + %}
> + ins_pipe(pipe_class_memory);
> +%}
> +
> +instruct string_indexof_con(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
> + immI_le_4 int_cnt2, iRegI_R0 result, iRegI tmp1, iRegI tmp2,
> + iRegI tmp3, iRegI tmp4, rFlagsReg cr)
> +%{
> + match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
> + effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1,
> + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
> + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result" %}
> +
> + ins_encode %{
> + int icnt2 = (int)$int_cnt2$$constant;
> + __ string_indexof($str1$$Register, $str2$$Register,
> + $cnt1$$Register, zr,
> + $tmp1$$Register, $tmp2$$Register,
> + $tmp3$$Register, $tmp4$$Register,
> + icnt2, $result$$Register);
> + %}
> + ins_pipe(pipe_class_memory);
> +%}
> +
> instruct string_equals(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
> iRegI_R0 result, iRegP_R10 tmp, rFlagsReg cr)
> %{
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/vm_version_aarch64.cpp
> --- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp Mon Aug 18 17:10:29 2014 +0100
> @@ -105,6 +105,7 @@
> FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
> FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
> FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
> + FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
>
> #ifndef BUILTIN_SIM
> unsigned long auxv = getauxval(AT_HWCAP);
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
> --- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon Aug 18 17:10:29 2014 +0100
> @@ -1,3 +1,4 @@
> +/*
> /*
> * Copyright (c) 2013, Red Hat Inc.
> * Copyright (c) 1997, 2012, Oracle and/or its affiliates.
> @@ -3386,6 +3387,338 @@
> }
> }
>
> +// Search for str1 in str2 and return index or -1
> +void MacroAssembler::string_indexof(Register str2, Register str1,
> + Register cnt2, Register cnt1,
> + Register tmp1, Register tmp2,
> + Register tmp3, Register tmp4,
> + int icnt1, Register result) {
> + Label BM, LS, DONE, NOMATCH, MATCH;
> +
> + Register ch1 = rscratch1;
> + Register ch2 = rscratch2;
> + Register cnt1tmp = tmp1;
> + Register cnt2tmp = tmp2;
> + Register cnt1_neg = cnt1;
> + Register cnt2_neg = cnt2;
> + Register result_tmp = tmp4;
> +
> + // Note, inline_string_indexOf() generates checks:
> + // if (substr.count > string.count) return -1;
> + // if (substr.count == 0) return 0;
> +
> +// We have two strings, a source string in str2, cnt2 and a pattern string
> +// in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
> +
> +// For larger pattern and source we use a simplified Boyer Moore algorithm.
> +// With a small pattern and source we use linear scan.
> +
> + if (icnt1 == -1) {
> + cmp(cnt1, 256); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
> + ccmp(cnt1, 8, 0b0000, CC); // Can't handle skip >= 256 because we use
Can this CC be LO, please. This is really important because the ARM
carry flag is inverted compared with e.g. Intel.
> + br(CC, LS); // a byte array.
> + cmp(cnt1, cnt2, LSR, 2); // Source must be 4 * pattern for BM
> + br(CS, LS);
No, you may not give a label the same name as one of the condition
codes. This is an unexploded bomb.
> + }
> +
> +// Larger pattern and source use the following Boyer Moore alogorithm.
> +//
> +// #define ASIZE 128
> +//
> +// int bm(unsigned char *x, int m, unsigned char *y, int n) {
> +// int i, j;
> +// unsigned c;
> +// unsigned char bc[ASIZE];
> +//
> +// /* Preprocessing */
> +// for (i = 0; i < ASIZE; ++i)
> +// bc[i] = 0;
> +// for (i = 0; i < m - 1; ) {
> +// c = x[i];
> +// ++i;
> +// if (c < ASIZE) bc[c] = i;
> +// }
> +//
> +// /* Searching */
> +// j = 0;
> +// while (j <= n - m) {
> +// c = y[i+j];
> +// if (x[m-1] == c)
> +// for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
> +// if (i < 0) return j;
> +// if (c < ASIZE)
> +// j = j - bc[y[j+m-1]] + m;
> +// else
> +// j += 1; // Advance by 1 only if char >= ASIZE
> +// }
> +// }
> +
> + if (icnt1 == -1) {
> + BIND(BM);
> +
> + Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
> + Label BMADV, BMMATCH, BMCHECKEND;
> +
> + Register cnt1end = tmp2;
> + Register str2end = cnt2;
> + Register skipch = tmp2;
> +
> + // Restrict ASIZE to 128 to reduce stack space/initialisation.
> + // The presence of chars >= ASIZE in the target string does not affect
> + // performance, but we must be careful not to initialise them in the stack
> + // array.
> + // The presence of chars >= ASIZE in the source string may adversely affect
> + // performance since we can only advance by one when we encounter one.
> +
> + stp(zr, zr, pre(sp, -128));
> + stp(zr, zr, Address(sp, 1*16));
> + stp(zr, zr, Address(sp, 2*16));
> + stp(zr, zr, Address(sp, 3*16));
> + stp(zr, zr, Address(sp, 4*16));
> + stp(zr, zr, Address(sp, 5*16));
> + stp(zr, zr, Address(sp, 6*16));
> + stp(zr, zr, Address(sp, 7*16));
Why is this not a loop?
> +
> + mov(cnt1tmp, 0);
> + sub(cnt1end, cnt1, 1);
> + BIND(BCLOOP);
> + ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
> + cmp(ch1, 128);
> + add(cnt1tmp, cnt1tmp, 1);
> + br(HS, BCSKIP);
> + strb(cnt1tmp, Address(sp, ch1));
> + BIND(BCSKIP);
> + cmp(cnt1tmp, cnt1end);
> + br(LT, BCLOOP);
> +
> + mov(result_tmp, str2);
> +
> + sub(cnt2, cnt2, cnt1);
> + add(str2end, str2, cnt2, LSL, 1);
> + BIND(BMLOOPSTR2);
> + sub(cnt1tmp, cnt1, 1);
> + ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
> + ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
> + cmp(ch1, skipch);
> + br(NE, BMSKIP);
> + subs(cnt1tmp, cnt1tmp, 1);
> + br(LT, BMMATCH);
> + BIND(BMLOOPSTR1);
> + ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
> + ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
> + cmp(ch1, ch2);
> + br(NE, BMSKIP);
> + subs(cnt1tmp, cnt1tmp, 1);
> + br(GE, BMLOOPSTR1);
> + BIND(BMMATCH);
> + sub(result_tmp, str2, result_tmp);
> + lsr(result, result_tmp, 1);
> + add(sp, sp, 128);
> + b(DONE);
> + BIND(BMADV);
> + add(str2, str2, 2);
> + b(BMCHECKEND);
> + BIND(BMSKIP);
> + cmp(skipch, 128);
> + br(HS, BMADV);
> + ldrb(ch2, Address(sp, skipch));
> + add(str2, str2, cnt1, LSL, 1);
> + sub(str2, str2, ch2, LSL, 1);
> + BIND(BMCHECKEND);
> + cmp(str2, str2end);
> + br(LE, BMLOOPSTR2);
> + add(sp, sp, 128);
> + b(NOMATCH);
> + }
> +
> + BIND(LS);
> + {
> + Label DO1, DO2, DO3;
> +
> + Register str2tmp = tmp2;
> + Register first = tmp3;
> +
> + if (icnt1 == -1)
> + {
> + Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
> +
> + cmp(cnt1, 4);
> + br(LT, DOSHORT);
> +
> + sub(cnt2, cnt2, cnt1);
> + sub(cnt1, cnt1, 4);
> + mov(result_tmp, cnt2);
> +
> + lea(str1, Address(str1, cnt1, Address::uxtw(1)));
> + lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> + sub(cnt1_neg, zr, cnt1, LSL, 1);
> + sub(cnt2_neg, zr, cnt2, LSL, 1);
> + ldr(first, Address(str1, cnt1_neg));
> +
> + BIND(FIRST_LOOP);
> + ldr(ch2, Address(str2, cnt2_neg));
> + cmp(first, ch2);
> + br(EQ, STR1_LOOP);
> + BIND(STR2_NEXT);
> + adds(cnt2_neg, cnt2_neg, 2);
> + br(LE, FIRST_LOOP);
> + b(NOMATCH);
> +
> + BIND(STR1_LOOP);
> + adds(cnt1tmp, cnt1_neg, 8);
> + add(cnt2tmp, cnt2_neg, 8);
> + br(GE, LAST_WORD);
> +
> + BIND(STR1_NEXT);
> + ldr(ch1, Address(str1, cnt1tmp));
> + ldr(ch2, Address(str2, cnt2tmp));
> + cmp(ch1, ch2);
> + br(NE, STR2_NEXT);
> + adds(cnt1tmp, cnt1tmp, 8);
> + add(cnt2tmp, cnt2tmp, 8);
> + br(LT, STR1_NEXT);
> +
> + BIND(LAST_WORD);
> + ldr(ch1, Address(str1));
> + sub(str2tmp, str2, cnt1_neg); // adjust to corresponding
> + ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
> + cmp(ch1, ch2);
> + br(NE, STR2_NEXT);
> + b(MATCH);
> +
> + BIND(DOSHORT);
> + cmp(cnt1, 2);
> + br(LT, DO1);
> + br(GT, DO3);
> + }
> +
> + if (icnt1 == 4) {
> + Label CH1_LOOP;
> +
> + ldr(ch1, str1);
> + sub(cnt2, cnt2, 4);
> + mov(result_tmp, cnt2);
> + lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> + sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> + BIND(CH1_LOOP);
> + ldr(ch2, Address(str2, cnt2_neg));
> + cmp(ch1, ch2);
> + br(EQ, MATCH);
> + adds(cnt2_neg, cnt2_neg, 2);
> + br(LE, CH1_LOOP);
> + b(NOMATCH);
> + }
> +
> + if (icnt1 == -1 || icnt1 == 2) {
> + Label CH1_LOOP;
> +
> + BIND(DO2);
> + ldrw(ch1, str1);
> + sub(cnt2, cnt2, 2);
> + mov(result_tmp, cnt2);
> + lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> + sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> + BIND(CH1_LOOP);
> + ldrw(ch2, Address(str2, cnt2_neg));
> + cmp(ch1, ch2);
> + br(EQ, MATCH);
> + adds(cnt2_neg, cnt2_neg, 2);
> + br(LE, CH1_LOOP);
> + b(NOMATCH);
> + }
> +
> + if (icnt1 == -1 || icnt1 == 3) {
> + Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
> +
> + BIND(DO3);
> + ldrw(first, str1);
> + ldrh(ch1, Address(str1, 4));
> +
> + sub(cnt2, cnt2, 3);
> + mov(result_tmp, cnt2);
> + lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> + sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> + BIND(FIRST_LOOP);
> + ldrw(ch2, Address(str2, cnt2_neg));
> + cmpw(first, ch2);
> + br(EQ, STR1_LOOP);
> + BIND(STR2_NEXT);
> + adds(cnt2_neg, cnt2_neg, 2);
> + br(LE, FIRST_LOOP);
> + b(NOMATCH);
> +
> + BIND(STR1_LOOP);
> + add(cnt2tmp, cnt2_neg, 4);
> + ldrh(ch2, Address(str2, cnt2tmp));
> + cmp(ch1, ch2);
> + br(NE, STR2_NEXT);
> + add(result, result_tmp, cnt2_neg, ASR, 1);
> + b(DONE);
> + }
> +
> + if (icnt1 == -1 || icnt1 == 1) {
> + Label CH1_LOOP, HAS_ZERO;
> + Label DO1_SHORT, DO1_LOOP;
> +
> + BIND(DO1);
> + ldrh(ch1, str1);
> + cmp(cnt2, 4);
> + br(LT, DO1_SHORT);
> +
> + orr(ch1, ch1, ch1, LSL, 16);
> + orr(ch1, ch1, ch1, LSL, 32);
> +
> + sub(cnt2, cnt2, 4);
> + mov(result_tmp, cnt2);
> + lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> + sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> + mov(tmp3, 0x0001000100010001);
> + BIND(CH1_LOOP);
> + ldr(ch2, Address(str2, cnt2_neg));
> + eor(ch2, ch1, ch2);
> + sub(tmp1, ch2, tmp3);
> + orr(tmp2, ch2, 0x7fff7fff7fff7fff);
> + bics(tmp1, tmp1, tmp2);
> + br(NE, HAS_ZERO);
> + adds(cnt2_neg, cnt2_neg, 8);
> + br(LT, CH1_LOOP);
> +
> + cmp(cnt2_neg, 8);
> + mov(cnt2_neg, 0);
> + br(LT, CH1_LOOP);
> + b(NOMATCH);
> +
> + BIND(HAS_ZERO);
> + rev(tmp1, tmp1);
> + clz(tmp1, tmp1);
> + add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
> + add(result, result_tmp, cnt2_neg, ASR, 1);
> + b(DONE);
> +
> + BIND(DO1_SHORT);
> + mov(result_tmp, cnt2);
> + lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> + sub(cnt2_neg, zr, cnt2, LSL, 1);
> + BIND(DO1_LOOP);
> + ldrh(ch2, Address(str2, cnt2_neg));
> + cmpw(ch1, ch2);
> + br(EQ, MATCH);
> + adds(cnt2_neg, cnt2_neg, 2);
> + br(LT, DO1_LOOP);
> + }
> + }
> + BIND(NOMATCH);
> + mov(result, -1);
> + b(DONE);
> + BIND(MATCH);
> + add(result, result_tmp, cnt2_neg, ASR, 1);
> + BIND(DONE);
> +}
> +
> // Compare strings.
> void MacroAssembler::string_compare(Register str1, Register str2,
> Register cnt1, Register cnt2, Register result,
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
> --- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Mon Aug 18 17:10:29 2014 +0100
> @@ -1091,6 +1091,11 @@
> Register len, Register result,
> FloatRegister Vtmp1, FloatRegister Vtmp2,
> FloatRegister Vtmp3, FloatRegister Vtmp4);
> + void string_indexof(Register str1, Register str2,
> + Register cnt1, Register cnt2,
> + Register tmp1, Register tmp2,
> + Register tmp3, Register tmp4,
> + int int_cnt1, Register result);
> };
>
> // Used by aarch64.ad to control code generation
> --- CUT HERE ---
>
>
More information about the aarch64-port-dev
mailing list