RFR(M): 8223051: support loops with long (64b) trip counts

Fri Aug 21 00:12:23 UTC 2020

On Aug 20, 2020, at 5:51 AM, Roland Westrelin <rwestrel at redhat.com> wrote:
> 
> Hi John,
> 
>> I’m going over it one more time (between smoky breaths from
>> the California fires) and I have a question.  What is the exact
>> structure of outer_phi?
> 
> Thanks for taking another look at this.
> 
>> 1. At first it is a clone of phi, its region patched and the other edges the same:
>> 
>>    outer_phi := Phi(outer_head, init, AddL(phi, stride))
> 
> It's only a clone so:
> 
> outer_phi := Phi(outer_head, init, AddI(phi, stride))
> 
> (that is no AddL)

I’m not sure what you mean?  The original incr is an AddL,
since we are transforming a long loop.  The AddI goes
somewhere else in the transformed code.

But, yes, the first step is just to make a cloned phi and patch
it into the outer loop.

> 
>> 2. After long_loop_replace_long_iv, the interior phi links it back to itself:
>> 
>>    outer_phi := Phi(outer_head, init, AddL(AddL(inner_phi, outer_phi), stride))
> 
> I don't think that's right. There are 2 calls to
> long_loop_replace_long_iv(). One to replace phi and the other one to
> replace incr (that is the AddI above).

Right; I missed the fact that the second replace_long_iv step
replaces the AddL completely.  So while phi is replaced by
AddL(I2L(inner_phi), outer_phi), incr is replaced by
AddL(I2L(inner_incr), outer_phi).

phi := Phi(x, init, incr)  =>  AddL(I2L(inner_phi), outer_phi)
incr := AddL(phi, stride)  =>  AddL(I2L(inner_incr), outer_phi)

And the effect of those replacements on outer_phi (the patched
clone of phi) is:

outer_phi := Phi(outer_head, init, <<incr=AddL(phi, longcon(stride))>>)
  =>  Phi(outer_head, init, <<AddL(I2L(inner_incr), outer_phi)>>)
  =>
outer_phi := Phi(outer_head, init, AddL(I2L(AddI(inner_phi, intcon(stride))), outer_phi))

not (as I was said previously):

outer_phi := Phi(outer_head, init, AddL(<<phi>>, longcon(stride)))
  =>  Phi(outer_head, init, AddL(<<AddL(I2L(inner_phi), outer_phi)>>, longcon(stride)))

And, in the corrected transform, there is no worrying extra
addition of stride (using AddL directly).

> 
> outer_phi := Phi(outer_head, init, AddL(I2L(AddI(inner_phi, stride)), outer_phi))
> 
> (actually this is not quite accurate because peeling one iteration
> causes an extra phi to be added to merge the peeled iteration with the
> counted loop in most cases).
> 
> Do you see a problem with the above outer_phi structure?

Not any more.  Let’s just make sure the transform gets exercised, OK?

After the P.S. is an amended chunk of pseudocode showing how it works.
I created it by labeling the various expressions in the example loop
with the names used in is_long_counted_loop, and then I stepped
through is_long_counted_loop and edited the pseudocode to
reflect each step.  If you agree I did it correctly, and that it helps
explain the code, you could place it as a comment at bottom, just
before the final peel.  Otherwise, we can just leave it here FTR.

I do have this specific request:  Please replace the pseudocode
at the top (already in the webrev) with the following corrected
pseudocode.  It uses names more consistent with the actual C++
code and corresponds more accurately to the transformed IR.

```
// range of long values from the initial loop in (at most) max int
// steps. That is:

x: for (long phi = init; phi < limit; phi += stride) {
  // phi := Phi(L, init, incr)
  // incr := AddL(phi, longcon(stride))
  // phi_incr := phi (test happens before increment)
  long incr = phi + stride;
  ... use phi and incr ...
}

OR:

x: for (long phi = init; (phi += stride) < limit; ) {
  // phi := Phi(L, AddL(init, stride), incr)
  // incr := AddL(phi, longcon(stride))
  // phi_incr := NULL (test happens after increment)
  long incr = phi + stride;
  ... use phi and (phi + stride) ...
}

==transform=>

const ulong inner_iters_limit = INT_MAX - stride - 1;  //near 0x7FFFFFF0
assert(stride <= inner_iters_limit);  // else abort transform
assert((extralong)limit + stride <= LONG_MAX);  // else deopt
outer_head: for (long outer_phi = init;;) {
  // outer_phi := Phi(outer_head, init, AddL(outer_phi, I2L(inner_phi)))
  ulong inner_iters_max = (ulong) MAX(0, ((extralong)limit + stride - outer_phi));
  long inner_iters_actual = MIN(inner_iters_limit, inner_iters_max);
  assert(inner_iters_actual == (int)inner_iters_actual);
  int inner_phi, inner_incr;
  x: for (inner_phi = 0;; inner_phi = inner_incr) {
    // inner_phi := Phi(x, intcon(0), inner_incr)
    // inner_incr := AddI(inner_phi, intcon(stride))
    inner_incr = inner_phi + stride;
    if (inner_incr < inner_iters_actual) {
      ... use phi=>(outer_phi+inner_phi) and incr=>(outer_phi+inner_incr) ...
      continue;
    }
    else break;
  }
  if ((outer_phi+inner_phi) < limit)  //OR (outer_phi+inner_incr) < limit
    continue;
  else break;
}
```

Thanks!

— John

P.S. Here are the intermediate steps, annotated with the C++
variable names for the various nodes, and with the steps that
created the transformed loop nodes.

== old IR nodes =>

entry_control: {...}
x:
for (long phi = init;;) {
  // phi := Phi(x, init, incr)
  // incr := AddL(phi, longcon(stride))
  exit_test:
  if (phi < limit)
    back_control: fallthrough;
  else
    exit_branch: break;
  // test happens before increment => phi == phi_incr != NULL
  long incr = phi + stride;
  ... use phi and incr ...
  phi = incr;
}

== new IR nodes (just before final peel) =>

entry_control: {...}
long adjusted_limit = limit + stride;  //because phi_incr != NULL
assert(!limit_check_required || (extralong)limit + stride == adjusted_limit);  // else deopt
ulong inner_iters_limit = max_jint - ABS(stride) - 1;  //near 0x7FFFFFF0
outer_head:
for (long outer_phi = init;;) {
  // outer_phi := phi->clone(), in(0):=outer_head, => Phi(outer_head, init, incr)
  // REPLACE phi  => AddL(outer_phi, I2L(inner_phi))
  // REPLACE incr => AddL(outer_phi, I2L(inner_incr))
  // SO THAT outer_phi := Phi(outer_head, init, AddL(outer_phi, I2L(inner_incr)))
  ulong inner_iters_max = (ulong) MAX(0, ((extralong)adjusted_limit - outer_phi) * SGN(stride));
  int inner_iters_actual_int = (int) MIN(inner_iters_limit, inner_iters_max) * SGN(stride);
  inner_head: x: //in(1) := outer_head
  int inner_phi;
  for (inner_phi = 0;;) {
    // inner_phi := Phi(x, intcon(0), inner_phi + stride)
    int inner_incr = inner_phi + stride;
    bool inner_bol = (inner_incr < inner_iters_actual_int);
    exit_test: //exit_test->in(1) := inner_bol; 
    if (inner_bol) // WAS (phi < limit)
      back_control: fallthrough;
    else
      inner_exit_branch: break;  //exit_branch->clone()
    ... use phi=>(outer_phi+inner_phi) and incr=>(outer_phi+inner_incr) ...
    inner_phi = inner_phi + stride;  // inner_incr
  }
  outer_exit_test:  //exit_test->clone(), in(0):=inner_exit_branch
  if ((outer_phi+inner_phi) < limit)  // WAS (phi < limit)
    outer_back_branch: fallthrough;  //back_control->clone(), in(0):=outer_exit_test
  else
    exit_branch: break;  //in(0) := outer_exit_test
}