RFR(M): 8223051: support loops with long (64b) trip counts
John Rose
john.r.rose at oracle.com
Fri Aug 21 00:12:23 UTC 2020
On Aug 20, 2020, at 5:51 AM, Roland Westrelin <rwestrel at redhat.com> wrote:
>
> Hi John,
>
>> I’m going over it one more time (between smoky breaths from
>> the California fires) and I have a question. What is the exact
>> structure of outer_phi?
>
> Thanks for taking another look at this.
>
>> 1. At first it is a clone of phi, its region patched and the other edges the same:
>>
>> outer_phi := Phi(outer_head, init, AddL(phi, stride))
>
> It's only a clone so:
>
> outer_phi := Phi(outer_head, init, AddI(phi, stride))
>
> (that is no AddL)
I’m not sure what you mean? The original incr is an AddL,
since we are transforming a long loop. The AddI goes
somewhere else in the transformed code.
But, yes, the first step is just to make a cloned phi and patch
it into the outer loop.
>
>> 2. After long_loop_replace_long_iv, the interior phi links it back to itself:
>>
>> outer_phi := Phi(outer_head, init, AddL(AddL(inner_phi, outer_phi), stride))
>
> I don't think that's right. There are 2 calls to
> long_loop_replace_long_iv(). One to replace phi and the other one to
> replace incr (that is the AddI above).
Right; I missed the fact that the second replace_long_iv step
replaces the AddL completely. So while phi is replaced by
AddL(I2L(inner_phi), outer_phi), incr is replaced by
AddL(I2L(inner_incr), outer_phi).
phi := Phi(x, init, incr) => AddL(I2L(inner_phi), outer_phi)
incr := AddL(phi, stride) => AddL(I2L(inner_incr), outer_phi)
And the effect of those replacements on outer_phi (the patched
clone of phi) is:
outer_phi := Phi(outer_head, init, <<incr=AddL(phi, longcon(stride))>>)
=> Phi(outer_head, init, <<AddL(I2L(inner_incr), outer_phi)>>)
=>
outer_phi := Phi(outer_head, init, AddL(I2L(AddI(inner_phi, intcon(stride))), outer_phi))
not (as I was said previously):
outer_phi := Phi(outer_head, init, AddL(<<phi>>, longcon(stride)))
=> Phi(outer_head, init, AddL(<<AddL(I2L(inner_phi), outer_phi)>>, longcon(stride)))
And, in the corrected transform, there is no worrying extra
addition of stride (using AddL directly).
>
> outer_phi := Phi(outer_head, init, AddL(I2L(AddI(inner_phi, stride)), outer_phi))
>
> (actually this is not quite accurate because peeling one iteration
> causes an extra phi to be added to merge the peeled iteration with the
> counted loop in most cases).
>
> Do you see a problem with the above outer_phi structure?
Not any more. Let’s just make sure the transform gets exercised, OK?
After the P.S. is an amended chunk of pseudocode showing how it works.
I created it by labeling the various expressions in the example loop
with the names used in is_long_counted_loop, and then I stepped
through is_long_counted_loop and edited the pseudocode to
reflect each step. If you agree I did it correctly, and that it helps
explain the code, you could place it as a comment at bottom, just
before the final peel. Otherwise, we can just leave it here FTR.
I do have this specific request: Please replace the pseudocode
at the top (already in the webrev) with the following corrected
pseudocode. It uses names more consistent with the actual C++
code and corresponds more accurately to the transformed IR.
```
// range of long values from the initial loop in (at most) max int
// steps. That is:
x: for (long phi = init; phi < limit; phi += stride) {
// phi := Phi(L, init, incr)
// incr := AddL(phi, longcon(stride))
// phi_incr := phi (test happens before increment)
long incr = phi + stride;
... use phi and incr ...
}
OR:
x: for (long phi = init; (phi += stride) < limit; ) {
// phi := Phi(L, AddL(init, stride), incr)
// incr := AddL(phi, longcon(stride))
// phi_incr := NULL (test happens after increment)
long incr = phi + stride;
... use phi and (phi + stride) ...
}
==transform=>
const ulong inner_iters_limit = INT_MAX - stride - 1; //near 0x7FFFFFF0
assert(stride <= inner_iters_limit); // else abort transform
assert((extralong)limit + stride <= LONG_MAX); // else deopt
outer_head: for (long outer_phi = init;;) {
// outer_phi := Phi(outer_head, init, AddL(outer_phi, I2L(inner_phi)))
ulong inner_iters_max = (ulong) MAX(0, ((extralong)limit + stride - outer_phi));
long inner_iters_actual = MIN(inner_iters_limit, inner_iters_max);
assert(inner_iters_actual == (int)inner_iters_actual);
int inner_phi, inner_incr;
x: for (inner_phi = 0;; inner_phi = inner_incr) {
// inner_phi := Phi(x, intcon(0), inner_incr)
// inner_incr := AddI(inner_phi, intcon(stride))
inner_incr = inner_phi + stride;
if (inner_incr < inner_iters_actual) {
... use phi=>(outer_phi+inner_phi) and incr=>(outer_phi+inner_incr) ...
continue;
}
else break;
}
if ((outer_phi+inner_phi) < limit) //OR (outer_phi+inner_incr) < limit
continue;
else break;
}
```
Thanks!
— John
P.S. Here are the intermediate steps, annotated with the C++
variable names for the various nodes, and with the steps that
created the transformed loop nodes.
== old IR nodes =>
entry_control: {...}
x:
for (long phi = init;;) {
// phi := Phi(x, init, incr)
// incr := AddL(phi, longcon(stride))
exit_test:
if (phi < limit)
back_control: fallthrough;
else
exit_branch: break;
// test happens before increment => phi == phi_incr != NULL
long incr = phi + stride;
... use phi and incr ...
phi = incr;
}
== new IR nodes (just before final peel) =>
entry_control: {...}
long adjusted_limit = limit + stride; //because phi_incr != NULL
assert(!limit_check_required || (extralong)limit + stride == adjusted_limit); // else deopt
ulong inner_iters_limit = max_jint - ABS(stride) - 1; //near 0x7FFFFFF0
outer_head:
for (long outer_phi = init;;) {
// outer_phi := phi->clone(), in(0):=outer_head, => Phi(outer_head, init, incr)
// REPLACE phi => AddL(outer_phi, I2L(inner_phi))
// REPLACE incr => AddL(outer_phi, I2L(inner_incr))
// SO THAT outer_phi := Phi(outer_head, init, AddL(outer_phi, I2L(inner_incr)))
ulong inner_iters_max = (ulong) MAX(0, ((extralong)adjusted_limit - outer_phi) * SGN(stride));
int inner_iters_actual_int = (int) MIN(inner_iters_limit, inner_iters_max) * SGN(stride);
inner_head: x: //in(1) := outer_head
int inner_phi;
for (inner_phi = 0;;) {
// inner_phi := Phi(x, intcon(0), inner_phi + stride)
int inner_incr = inner_phi + stride;
bool inner_bol = (inner_incr < inner_iters_actual_int);
exit_test: //exit_test->in(1) := inner_bol;
if (inner_bol) // WAS (phi < limit)
back_control: fallthrough;
else
inner_exit_branch: break; //exit_branch->clone()
... use phi=>(outer_phi+inner_phi) and incr=>(outer_phi+inner_incr) ...
inner_phi = inner_phi + stride; // inner_incr
}
outer_exit_test: //exit_test->clone(), in(0):=inner_exit_branch
if ((outer_phi+inner_phi) < limit) // WAS (phi < limit)
outer_back_branch: fallthrough; //back_control->clone(), in(0):=outer_exit_test
else
exit_branch: break; //in(0) := outer_exit_test
}
More information about the hotspot-compiler-dev
mailing list