[aarch64-port-dev ] RFR: Add support for String.indexOf

Andrew Haley aph at redhat.com
Tue Aug 19 09:18:47 UTC 2014


Hi,

On 18/08/14 17:27, Edward Nevill wrote:
> 
> The following patch adds support for the String.indexOf intrinsic.
> 
> It uses a combination of 2 algorithms, a simplified Boyer Moore, and
> a straightforward linear scan.

In what way is this algorithm simplified from standard Boyer-Moore?
How can we have confidence it is correct?

> Boyer Moore is used when the pattern length is >= 8 and the source
> length is >= 4 * pattern length. This heuristic seems to produce the
> best results from benchmarking.
> 
> Essentially Boyer Moore generates the best results when the pattern
> length is short relative to the source but is sufficiently long to
> make the skip span worthwhile. As the pattern length grows towards
> the source length the initialisation of the pattern string tends to
> dominate.
> 
> The code also contains special case code for 1, 2, 3, and 4 chars in
> the pattern.
> 
> Benchmarking shows 1.96 X improvement on my synthetic benchmark
> which searches for patterns of length { 1, 2, 3, 4, 8, 12, 16, 24,
> 32, 64 } in a source string of length { 65, 129, 964 }. None of the
> patterns match in the benchmark but the patterns are contrived so
> that the initial substring of the pattern does match at several
> places in the source string.
> 
> On the hotspot indexof test it show 1.89 X improvement.
> 
> Tests as usual with jtreg/hotspot.
> 
> OK to push?

Some comments inline.

I'm a bit concerned that we always inline this code.  AFAICS if
icnt1 == -1 we might as well branch to a stub.

Andrew.


> --- CUT HERE ---
> # HG changeset patch
> # User Edward Nevill edward.nevill at linaro.org
> # Date 1408378229 -3600
> #      Mon Aug 18 17:10:29 2014 +0100
> # Node ID 978b74cb5c7b233477d5ce9346c038fbccd80869
> # Parent  2dfe9abe27fe6c9231e64ebd66e7ca3e6fb5b2bf
> Add support for String.indexOf intrinsic
> 
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/aarch64.ad
> --- a/src/cpu/aarch64/vm/aarch64.ad	Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/aarch64.ad	Mon Aug 18 17:10:29 2014 +0100
> @@ -3415,6 +3415,16 @@
>    interface(CONST_INTER);
>  %}
>  
> +operand immI_le_4()
> +%{
> +  predicate(n->get_int() <= 4);
> +  match(ConI);
> +
> +  op_cost(0);
> +  format %{ %}
> +  interface(CONST_INTER);
> +%}
> +
>  operand immI_31()
>  %{
>    predicate(n->get_int() == 31);
> @@ -11733,6 +11743,44 @@
>    ins_pipe(pipe_class_memory);
>  %}
>  
> +instruct string_indexof(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
> +       iRegI_R0 result, iRegI tmp1, iRegI tmp2, iRegI tmp3, iRegI tmp4, rFlagsReg cr)
> +%{
> +  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
> +  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
> +         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
> +  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result" %}
> +
> +  ins_encode %{
> +    __ string_indexof($str1$$Register, $str2$$Register,
> +                      $cnt1$$Register, $cnt2$$Register,
> +                      $tmp1$$Register, $tmp2$$Register,
> +                      $tmp3$$Register, $tmp4$$Register,
> +                      -1, $result$$Register);
> +  %}
> +  ins_pipe(pipe_class_memory);
> +%}
> +
> +instruct string_indexof_con(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
> +                 immI_le_4 int_cnt2, iRegI_R0 result, iRegI tmp1, iRegI tmp2,
> +                 iRegI tmp3, iRegI tmp4, rFlagsReg cr)
> +%{
> +  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
> +  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1,
> +         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
> +  format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result" %}
> +
> +  ins_encode %{
> +    int icnt2 = (int)$int_cnt2$$constant;
> +    __ string_indexof($str1$$Register, $str2$$Register,
> +                      $cnt1$$Register, zr,
> +                      $tmp1$$Register, $tmp2$$Register,
> +                      $tmp3$$Register, $tmp4$$Register,
> +                      icnt2, $result$$Register);
> +  %}
> +  ins_pipe(pipe_class_memory);
> +%}
> +
>  instruct string_equals(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
>                          iRegI_R0 result, iRegP_R10 tmp, rFlagsReg cr)
>  %{
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/vm_version_aarch64.cpp
> --- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Aug 18 17:10:29 2014 +0100
> @@ -105,6 +105,7 @@
>    FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
>    FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
>    FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
> +  FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
>  
>  #ifndef BUILTIN_SIM
>    unsigned long auxv = getauxval(AT_HWCAP);
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
> --- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Aug 18 17:10:29 2014 +0100
> @@ -1,3 +1,4 @@
> +/*
>  /*
>   * Copyright (c) 2013, Red Hat Inc.
>   * Copyright (c) 1997, 2012, Oracle and/or its affiliates.
> @@ -3386,6 +3387,338 @@
>    }
>  }
>  
> +// Search for str1 in str2 and return index or -1
> +void MacroAssembler::string_indexof(Register str2, Register str1,
> +                                    Register cnt2, Register cnt1,
> +                                    Register tmp1, Register tmp2,
> +                                    Register tmp3, Register tmp4,
> +                                    int icnt1, Register result) {
> +  Label BM, LS, DONE, NOMATCH, MATCH;
> +
> +  Register ch1 = rscratch1;
> +  Register ch2 = rscratch2;
> +  Register cnt1tmp = tmp1;
> +  Register cnt2tmp = tmp2;
> +  Register cnt1_neg = cnt1;
> +  Register cnt2_neg = cnt2;
> +  Register result_tmp = tmp4;
> +
> +  // Note, inline_string_indexOf() generates checks:
> +  // if (substr.count > string.count) return -1;
> +  // if (substr.count == 0) return 0;
> +
> +// We have two strings, a source string in str2, cnt2 and a pattern string
> +// in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
> +
> +// For larger pattern and source we use a simplified Boyer Moore algorithm.
> +// With a small pattern and source we use linear scan.
> +
> +  if (icnt1 == -1) {
> +    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
> +    ccmp(cnt1, 8, 0b0000, CC);  // Can't handle skip >= 256 because we use

Can this CC be LO, please.  This is really important because the ARM
carry flag is inverted compared with e.g. Intel.

> +    br(CC, LS);                 // a byte array.
> +    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
> +    br(CS, LS);

No, you may not give a label the same name as one of the condition
codes.  This is an unexploded bomb.

> +  }
> +
> +// Larger pattern and source use the following Boyer Moore alogorithm.
> +//
> +// #define ASIZE 128
> +//
> +//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
> +//       int i, j;
> +//       unsigned c;
> +//       unsigned char bc[ASIZE];
> +//    
> +//       /* Preprocessing */
> +//       for (i = 0; i < ASIZE; ++i)
> +//          bc[i] = 0;
> +//       for (i = 0; i < m - 1; ) {
> +//          c = x[i];
> +//          ++i;
> +//          if (c < ASIZE) bc[c] = i;
> +//       }
> +//    
> +//       /* Searching */
> +//       j = 0;
> +//       while (j <= n - m) {
> +//          c = y[i+j];
> +//          if (x[m-1] == c)
> +//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
> +//          if (i < 0) return j;
> +//          if (c < ASIZE)
> +//            j = j - bc[y[j+m-1]] + m;
> +//          else
> +//            j += 1; // Advance by 1 only if char >= ASIZE
> +//       }
> +//    }
> +
> +  if (icnt1 == -1) {
> +    BIND(BM);
> +
> +    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
> +    Label BMADV, BMMATCH, BMCHECKEND;
> +
> +    Register cnt1end = tmp2;
> +    Register str2end = cnt2;
> +    Register skipch = tmp2;
> +
> +    // Restrict ASIZE to 128 to reduce stack space/initialisation.
> +    // The presence of chars >= ASIZE in the target string does not affect
> +    // performance, but we must be careful not to initialise them in the stack
> +    // array.
> +    // The presence of chars >= ASIZE in the source string may adversely affect
> +    // performance since we can only advance by one when we encounter one.
> +
> +      stp(zr, zr, pre(sp, -128));
> +      stp(zr, zr, Address(sp, 1*16));
> +      stp(zr, zr, Address(sp, 2*16));
> +      stp(zr, zr, Address(sp, 3*16));
> +      stp(zr, zr, Address(sp, 4*16));
> +      stp(zr, zr, Address(sp, 5*16));
> +      stp(zr, zr, Address(sp, 6*16));
> +      stp(zr, zr, Address(sp, 7*16));

Why is this not a loop?

> +
> +      mov(cnt1tmp, 0);
> +      sub(cnt1end, cnt1, 1);
> +    BIND(BCLOOP);
> +      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
> +      cmp(ch1, 128);
> +      add(cnt1tmp, cnt1tmp, 1);
> +      br(HS, BCSKIP);
> +      strb(cnt1tmp, Address(sp, ch1));
> +    BIND(BCSKIP);
> +      cmp(cnt1tmp, cnt1end);
> +      br(LT, BCLOOP);
> +
> +      mov(result_tmp, str2);
> +
> +      sub(cnt2, cnt2, cnt1);
> +      add(str2end, str2, cnt2, LSL, 1);
> +    BIND(BMLOOPSTR2);
> +      sub(cnt1tmp, cnt1, 1);
> +      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
> +      ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
> +      cmp(ch1, skipch);
> +      br(NE, BMSKIP);
> +      subs(cnt1tmp, cnt1tmp, 1);
> +      br(LT, BMMATCH);
> +    BIND(BMLOOPSTR1);
> +      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
> +      ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
> +      cmp(ch1, ch2);
> +      br(NE, BMSKIP);
> +      subs(cnt1tmp, cnt1tmp, 1);
> +      br(GE, BMLOOPSTR1);
> +    BIND(BMMATCH);
> +      sub(result_tmp, str2, result_tmp);
> +      lsr(result, result_tmp, 1);
> +      add(sp, sp, 128);
> +      b(DONE);
> +    BIND(BMADV);
> +      add(str2, str2, 2);
> +      b(BMCHECKEND);
> +    BIND(BMSKIP);
> +      cmp(skipch, 128);
> +      br(HS, BMADV);
> +      ldrb(ch2, Address(sp, skipch));
> +      add(str2, str2, cnt1, LSL, 1);
> +      sub(str2, str2, ch2, LSL, 1);
> +    BIND(BMCHECKEND);
> +      cmp(str2, str2end);
> +      br(LE, BMLOOPSTR2);
> +      add(sp, sp, 128);
> +      b(NOMATCH);
> +  }
> +
> +  BIND(LS);
> +  {
> +    Label DO1, DO2, DO3;
> +
> +    Register str2tmp = tmp2;
> +    Register first = tmp3;
> +
> +    if (icnt1 == -1)
> +    {
> +        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
> +
> +        cmp(cnt1, 4);
> +        br(LT, DOSHORT);
> +
> +        sub(cnt2, cnt2, cnt1);
> +        sub(cnt1, cnt1, 4);
> +        mov(result_tmp, cnt2);
> +
> +        lea(str1, Address(str1, cnt1, Address::uxtw(1)));
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt1_neg, zr, cnt1, LSL, 1);
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +        ldr(first, Address(str1, cnt1_neg));
> +
> +      BIND(FIRST_LOOP);
> +        ldr(ch2, Address(str2, cnt2_neg));
> +        cmp(first, ch2);
> +        br(EQ, STR1_LOOP);
> +      BIND(STR2_NEXT);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LE, FIRST_LOOP);
> +        b(NOMATCH);
> +
> +      BIND(STR1_LOOP);
> +        adds(cnt1tmp, cnt1_neg, 8);
> +        add(cnt2tmp, cnt2_neg, 8);
> +        br(GE, LAST_WORD);
> +
> +      BIND(STR1_NEXT);
> +        ldr(ch1, Address(str1, cnt1tmp));
> +        ldr(ch2, Address(str2, cnt2tmp));
> +        cmp(ch1, ch2);
> +        br(NE, STR2_NEXT);
> +        adds(cnt1tmp, cnt1tmp, 8);
> +        add(cnt2tmp, cnt2tmp, 8);
> +        br(LT, STR1_NEXT);
> +
> +      BIND(LAST_WORD);
> +        ldr(ch1, Address(str1));
> +        sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
> +        ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
> +        cmp(ch1, ch2);
> +        br(NE, STR2_NEXT);
> +        b(MATCH);
> +
> +      BIND(DOSHORT);
> +        cmp(cnt1, 2);
> +        br(LT, DO1);
> +        br(GT, DO3);
> +    }
> +
> +    if (icnt1 == 4) {
> +      Label CH1_LOOP;
> +
> +        ldr(ch1, str1);
> +        sub(cnt2, cnt2, 4);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> +      BIND(CH1_LOOP);
> +        ldr(ch2, Address(str2, cnt2_neg));
> +        cmp(ch1, ch2);
> +        br(EQ, MATCH);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LE, CH1_LOOP);
> +        b(NOMATCH);
> +    }
> +
> +    if (icnt1 == -1 || icnt1 == 2) {
> +      Label CH1_LOOP;
> +
> +      BIND(DO2);
> +        ldrw(ch1, str1);
> +        sub(cnt2, cnt2, 2);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> +      BIND(CH1_LOOP);
> +        ldrw(ch2, Address(str2, cnt2_neg));
> +        cmp(ch1, ch2);
> +        br(EQ, MATCH);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LE, CH1_LOOP);
> +        b(NOMATCH);
> +    }
> +
> +    if (icnt1 == -1 || icnt1 == 3) {
> +      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
> +
> +      BIND(DO3);
> +        ldrw(first, str1);
> +        ldrh(ch1, Address(str1, 4));
> +
> +        sub(cnt2, cnt2, 3);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> +      BIND(FIRST_LOOP);
> +        ldrw(ch2, Address(str2, cnt2_neg));
> +        cmpw(first, ch2);
> +        br(EQ, STR1_LOOP);
> +      BIND(STR2_NEXT);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LE, FIRST_LOOP);
> +        b(NOMATCH);
> +
> +      BIND(STR1_LOOP);
> +        add(cnt2tmp, cnt2_neg, 4);
> +        ldrh(ch2, Address(str2, cnt2tmp));
> +        cmp(ch1, ch2);
> +        br(NE, STR2_NEXT);
> +        add(result, result_tmp, cnt2_neg, ASR, 1);
> +        b(DONE);
> +    }
> +
> +    if (icnt1 == -1 || icnt1 == 1) {
> +      Label CH1_LOOP, HAS_ZERO;
> +      Label DO1_SHORT, DO1_LOOP;
> +
> +      BIND(DO1);
> +        ldrh(ch1, str1);
> +        cmp(cnt2, 4);
> +        br(LT, DO1_SHORT);
> +
> +        orr(ch1, ch1, ch1, LSL, 16);
> +        orr(ch1, ch1, ch1, LSL, 32);
> +
> +        sub(cnt2, cnt2, 4);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> +        mov(tmp3, 0x0001000100010001);
> +      BIND(CH1_LOOP);
> +        ldr(ch2, Address(str2, cnt2_neg));
> +        eor(ch2, ch1, ch2);
> +        sub(tmp1, ch2, tmp3);
> +        orr(tmp2, ch2, 0x7fff7fff7fff7fff);
> +        bics(tmp1, tmp1, tmp2);
> +        br(NE, HAS_ZERO);
> +        adds(cnt2_neg, cnt2_neg, 8);
> +        br(LT, CH1_LOOP);
> +
> +        cmp(cnt2_neg, 8);
> +        mov(cnt2_neg, 0);
> +        br(LT, CH1_LOOP);
> +        b(NOMATCH);
> +
> +      BIND(HAS_ZERO);
> +        rev(tmp1, tmp1);
> +        clz(tmp1, tmp1);
> +        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
> +        add(result, result_tmp, cnt2_neg, ASR, 1);
> +        b(DONE);
> +
> +      BIND(DO1_SHORT);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +      BIND(DO1_LOOP);
> +        ldrh(ch2, Address(str2, cnt2_neg));
> +        cmpw(ch1, ch2);
> +        br(EQ, MATCH);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LT, DO1_LOOP);
> +    }
> +  }
> +  BIND(NOMATCH);
> +    mov(result, -1);
> +    b(DONE);
> +  BIND(MATCH);
> +    add(result, result_tmp, cnt2_neg, ASR, 1);
> +  BIND(DONE);
> +}
> +
>  // Compare strings.
>  void MacroAssembler::string_compare(Register str1, Register str2,
>                                      Register cnt1, Register cnt2, Register result,
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
> --- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Aug 18 17:10:29 2014 +0100
> @@ -1091,6 +1091,11 @@
>                          Register len, Register result,
>                          FloatRegister Vtmp1, FloatRegister Vtmp2,
>                          FloatRegister Vtmp3, FloatRegister Vtmp4);
> +  void string_indexof(Register str1, Register str2,
> +                      Register cnt1, Register cnt2,
> +                      Register tmp1, Register tmp2,
> +                      Register tmp3, Register tmp4,
> +                      int int_cnt1, Register result);
>  };
>  
>  // Used by aarch64.ad to control code generation
> --- CUT HERE ---
> 
> 



More information about the aarch64-port-dev mailing list