RFR(M): 8145322: Code generated from unsafe loops can be slightly improved

Mon Jan 11 15:36:52 UTC 2016

Thanks for the review, Vladimir and Tobias.

Roland.

> On Dec 16, 2015, at 2:38 AM, Vladimir Kozlov <vladimir.kozlov at oracle.com> wrote:
> 
> Very nice!
> 
> You may need to change code in castnode.cpp according new changes 8145096 if they pushed first (not yet).
> And also 32-bit as Tobias pointed.
> 
> Thanks,
> Vladimir
> 
> On 12/15/15 12:55 AM, Roland Westrelin wrote:
>> Hi Vladimir,
>> 
>> Thanks for looking at this.
>> 
>>> Second assembler output still have intermediate increments and also new movslq instructions. Why it should be better.
>> 
>> I thinks there is some confusion here. There are 2 problems I’d like to fix. One is when using checkIndex. In that case, the code should be as good as regular array accesses. The first assembly dump shows it’s not. The second problem is when not using checkIndex but we know the loop bounds, should be able to do better. That’s the second assembly dump. In my email I only showed assembly without my change. With my change:
>> 
>> first test case:
>> 
>> 0c2   B11: #    B37 B12 <- B8 B10       Loop: B11-B10 inner main of N142 Freq: 975.841
>> 0c2     movq    RAX, [RSI + #16 + RDI << #3]    # long
>> 0c7     movq    RBX, [R9 + #16 + RDI << #3]     # long
>> 0cc     cmpq    RBX, RAX
>> 0cf     jne     B37  P=0.000000 C=7836.000000
>> 0cf
>> 0d5   B12: #    B38 B13 <- B11  Freq: 975.84
>> 0d5     movq    RAX, [RSI + #24 + RDI << #3]    # long
>> 0da     movq    RBX, [R9 + #24 + RDI << #3]     # long
>> 0df     cmpq    RBX, RAX
>> 0e2     jne     B38  P=0.000000 C=7836.000000
>> 0e2
>> 0e8   B13: #    B40 B14 <- B12  Freq: 975.84
>> 0e8     movq    RAX, [RSI + #32 + RDI << #3]    # long
>> 0ed     movq    RBX, [R9 + #32 + RDI << #3]     # long
>> 0f2     cmpq    RBX, RAX
>> 0f5     jne     B40  P=0.000000 C=7836.000000
>> 0f5
>> 0fb   B14: #    B42 B15 <- B13  Freq: 975.84
>> 0fb     movq    RAX, [RSI + #40 + RDI << #3]    # long
>> 100     movq    RBX, [R9 + #40 + RDI << #3]     # long
>> 105     cmpq    RBX, RAX
>> 108     jne     B42  P=0.000000 C=7836.000000
>> 108
>> 10e   B15: #    B44 B16 <- B14  Freq: 975.839
>> 10e     movq    RAX, [RSI + #48 + RDI << #3]    # long
>> 113     movq    RBX, [R9 + #48 + RDI << #3]     # long
>> 118     movl    RDX, RDI        # spill
>> 11a     addl    RDX, #4 # int
>> 11d     cmpq    RBX, RAX
>> 120     jne     B44  P=0.000000 C=7836.000000
>> 120
>> 126   B16: #    B39 B17 <- B15  Freq: 975.839
>> 126     movq    RAX, [RSI + #56 + RDI << #3]    # long
>> 12b     movq    RBX, [R9 + #56 + RDI << #3]     # long
>> 130     cmpq    RBX, RAX
>> 133     jne     B39  P=0.000000 C=7836.000000
>> 133
>> 139   B17: #    B41 B18 <- B16  Freq: 975.838
>> 139     movq    RAX, [RSI + #64 + RDI << #3]    # long
>> 13e     movq    RBX, [R9 + #64 + RDI << #3]     # long
>> 143     cmpq    RBX, RAX
>> 146     jne     B41  P=0.000000 C=7836.000000
>> 146
>> 14c   B18: #    B43 B19 <- B17  Freq: 975.838
>> 14c     movq    RAX, [RSI + #72 + RDI << #3]    # long
>> 151     movq    RBX, [R9 + #72 + RDI << #3]     # long
>> 156     cmpq    RBX, RAX
>> 159     jne     B43  P=0.000000 C=7836.000000
>> 159
>> 15f   B19: #    B10 B20 <- B18  Freq: 975.837
>> 15f     movl    RDX, RDI        # spill
>> 161     addl    RDX, #8 # int
>> 164     cmpl    RDX, RBP
>> 166     jl     B10      # loop end  P=0.998980 C=7836.000000
>> 
>> 
>> 
>> second test case:
>> 
>> 0a3   B7: #     B32 B8 <- B6 B15        Loop: B7-B15 inner main of N123 Freq: 975.843
>> 0a3     movq    RDI, [RBP + #16 + RSI << #3]    # long
>> 0a8     movq    RAX, [RDX + #16 + RSI << #3]    # long
>> 0ad     cmpq    RAX, RDI
>> 0b0     jne     B32  P=0.000000 C=7836.000000
>> 0b0
>> 0b6   B8: #     B33 B9 <- B7  Freq: 975.842
>> 0b6     movq    RDI, [RBP + #24 + RSI << #3]    # long
>> 0bb     movq    RAX, [RDX + #24 + RSI << #3]    # long
>> 0c0     cmpq    RAX, RDI
>> 0c3     jne     B33  P=0.000000 C=7836.000000
>> 0c3
>> 0c9   B9: #     B35 B10 <- B8  Freq: 975.842
>> 0c9     movq    RDI, [RBP + #32 + RSI << #3]    # long
>> 0ce     movq    RAX, [RDX + #32 + RSI << #3]    # long
>> 0d3     cmpq    RAX, RDI
>> 0d6     jne     B35  P=0.000000 C=7836.000000
>> 0d6
>> 0dc   B10: #    B39 B11 <- B9  Freq: 975.842
>> 0dc     movq    RDI, [RBP + #40 + RSI << #3]    # long
>> 0e1     movq    RAX, [RDX + #40 + RSI << #3]    # long
>> 0e6     cmpq    RAX, RDI
>> 0e9     jne     B39  P=0.000000 C=7836.000000
>> 0e9
>> 0ef   B11: #    B38 B12 <- B10  Freq: 975.841
>> 0ef     movq    RDI, [RBP + #48 + RSI << #3]    # long
>> 0f4     movq    RAX, [RDX + #48 + RSI << #3]    # long
>> 0f9     movl    R8, RSI # spill
>> 0fc     addl    R8, #4  # int
>> 100     cmpq    RAX, RDI
>> 103     jne     B38  P=0.000000 C=7836.000000
>> 103
>> 109   B12: #    B34 B13 <- B11  Freq: 975.841
>> 109     movq    RDI, [RBP + #56 + RSI << #3]    # long
>> 10e     movq    RAX, [RDX + #56 + RSI << #3]    # long
>> 113     cmpq    RAX, RDI
>> 116     jne     B34  P=0.000000 C=7836.000000
>> 116
>> 11c   B13: #    B36 B14 <- B12  Freq: 975.84
>> 11c     movq    RDI, [RBP + #64 + RSI << #3]    # long
>> 121     movq    RAX, [RDX + #64 + RSI << #3]    # long
>> 126     cmpq    RAX, RDI
>> 129     jne     B36  P=0.000000 C=7836.000000
>> 129
>> 12f   B14: #    B38 B15 <- B13  Freq: 975.84
>> 12f     movq    RDI, [RBP + #72 + RSI << #3]    # long
>> 134     movq    RAX, [RDX + #72 + RSI << #3]    # long
>> 139     movl    R8, RSI # spill
>> 13c     addl    R8, #7  # int
>> 140     cmpq    RAX, RDI
>> 143     jne     B38  P=0.000000 C=7836.000000
>> 143
>> 149   B15: #    B7 B16 <- B14  Freq: 975.839
>> 149     addl    RSI, #8 # int
>> 14c     cmpl    RSI, R11
>> 14f     jl     B7       # loop end  P=0.998980 C=7836.000000
>> 
>> Roland.
>> 
>>> 
>>> Thanks,
>>> Vladimir
>>> 
>>> On 12/14/15 8:42 AM, Roland Westrelin wrote:
>>>> http://cr.openjdk.java.net/~roland/8145322/webrev.00/
>>>> 
>>>> Paul spotted the following small inefficiencies:
>>>> 
>>>>         for (; wi < l; wi++) {
>>>>             long bi = ((long) Objects.checkIndex(wi, l, null)) << LOG2_ARRAY_LONG_INDEX_SCALE;
>>>>             long av = U.getLongUnaligned(a, aOffset + bi);
>>>>             long bv = U.getLongUnaligned(b, bOffset + bi);
>>>>             if (av != bv) {
>>>> 
>>>> is compiled to:
>>>> 
>>>> 0b0 B9: # B28 B10 <- B8 B13 Loop: B9-B13 inner main of N130 Freq: 977.661
>>>> 0b0 movl RDX, RDI # spill
>>>> 0b2 # castII of RDX
>>>> 0b2 movq RBX, [R9 + #16 + RDX << #3] # long
>>>> 0b7 movq RAX, [RSI + #16 + RDX << #3] # long
>>>> 0bc cmpq RBX, RAX
>>>> 0bf jne B28 P=0.000000 C=7836.000000
>>>> 0bf
>>>> 0c5 B10: # B28 B11 <- B9 Freq: 977.66
>>>> 0c5 movl RDX, RDI # spill
>>>> 0c7 incl RDX # int
>>>> 0c9 # castII of RDX
>>>> 0c9 movq RBX, [R9 + #16 + RDX << #3] # long
>>>> 0ce movq RAX, [RSI + #16 + RDX << #3] # long
>>>> 0d3 cmpq RBX, RAX
>>>> 0d6 jne B28 P=0.000000 C=7836.000000
>>>> 0d6
>>>> 0dc B11: # B28 B12 <- B10 Freq: 977.66
>>>> 0dc movl RDX, RDI # spill
>>>> 0de addl RDX, #2 # int
>>>> 0e1 # castII of RDX
>>>> 0e1 movq RBX, [R9 + #16 + RDX << #3] # long
>>>> 0e6 movq RAX, [RSI + #16 + RDX << #3] # long
>>>> 0eb cmpq RBX, RAX
>>>> 0ee jne B28 P=0.000000 C=7836.000000
>>>> 0ee
>>>> 0f4 B12: # B28 B13 <- B11 Freq: 977.659
>>>> 0f4 movl RDX, RDI # spill
>>>> 0f6 addl RDX, #3 # int
>>>> 0f9 # castII of RDX
>>>> 0f9 movq RBX, [R9 + #16 + RDX << #3] # long
>>>> 0fe movq RAX, [RSI + #16 + RDX << #3] # long
>>>> 103 cmpq RBX, RAX
>>>> 106 jne B28 P=0.000000 C=7836.000000
>>>> 106
>>>> 10c B13: # B9 B14 <- B12 Freq: 977.659
>>>> 10c addl RDI, #4 # int
>>>> 10f cmpl RDI, RBP
>>>> 111 jl,s B9 # loop end P=0.998980 C=7836.000000
>>>> 
>>>> But the intermediate increment of the induction variable:
>>>> 0c7 incl RDX # int
>>>> 0de addl RDX, #2 # int
>>>> 0f6 addl RDX, #3 # int
>>>> 
>>>> should be folded in the address computation of the memory accesses: ConvI2L(AddI(x, y)) should be converted to AddL(ConvI2L(x), ConvI2L(y)) but there’s a CastII from the checkIndex between the AddI and the ConvI2L so we first need to push the CastII through the AddI. That’s the first CastIINode::Ideal transformation. If we apply that transformation we then have several CastII that only differ by their type so we need the second transformation of CastIINode::Ideal so all of them fold after loop opts.
>>>> 
>>>>         for (; wi < length >> valuesPerWidth; wi++) {
>>>>             long bi = ((long) wi) << LOG2_ARRAY_LONG_INDEX_SCALE;
>>>>             long av = U.getLongUnaligned(a, aOffset + bi);
>>>>             long bv = U.getLongUnaligned(b, bOffset + bi);
>>>>             if (av != bv) {
>>>> 
>>>> 0b0 B7: # B32 B8 <- B6 B15 Loop: B7-B15 inner main of N123 Freq: 975.843
>>>> 0b0 movslq R8, RSI # i2l
>>>> 0b3 movq RAX, [RDX + #16 + R8 << #3] # long
>>>> 0b8 movq RDI, [RBP + #16 + R8 << #3] # long
>>>> 0bd cmpq RAX, RDI
>>>> 0c0 jne B32 P=0.000000 C=7836.000000
>>>> 0c0
>>>> 0c6 B8: # B33 B9 <- B7 Freq: 975.842
>>>> 0c6 movl R8, RSI # spill
>>>> 0c9 incl R8 # int
>>>> 0cc movslq RDI, R8 # i2l
>>>> 0cf movq RAX, [RDX + #16 + RDI << #3] # long
>>>> 0d4 movq RDI, [RBP + #16 + RDI << #3] # long
>>>> 0d9 cmpq RAX, RDI
>>>> 0dc jne B33 P=0.000000 C=7836.000000
>>>> 0dc
>>>> 0e2 B9: # B33 B10 <- B8 Freq: 975.842
>>>> 0e2 movl R8, RSI # spill
>>>> 0e5 addl R8, #2 # int
>>>> 0e9 movslq RDI, R8 # i2l
>>>> 0ec movq RAX, [RDX + #16 + RDI << #3] # long
>>>> 0f1 movq RDI, [RBP + #16 + RDI << #3] # long
>>>> 0f6 cmpq RAX, RDI
>>>> 0f9 jne B33 P=0.000000 C=7836.000000
>>>> 0f9
>>>> 0ff B10: # B33 B11 <- B9 Freq: 975.842
>>>> 0ff movl R8, RSI # spill
>>>> 102 addl R8, #3 # int
>>>> 106 movslq RDI, R8 # i2l
>>>> 109 movq RAX, [RDX + #16 + RDI << #3] # long
>>>> 10e movq RDI, [RBP + #16 + RDI << #3] # long
>>>> 113 cmpq RAX, RDI
>>>> 116 jne B33 P=0.000000 C=7836.000000
>>>> 116
>>>> 11c B11: # B33 B12 <- B10 Freq: 975.841
>>>> 11c movl R8, RSI # spill
>>>> 11f addl R8, #4 # int
>>>> 123 movslq RDI, R8 # i2l
>>>> 126 movq RAX, [RDX + #16 + RDI << #3] # long
>>>> 12b movq RDI, [RBP + #16 + RDI << #3] # long
>>>> 130 cmpq RAX, RDI
>>>> 133 jne B33 P=0.000000 C=7836.000000
>>>> 133
>>>> 139 B12: # B33 B13 <- B11 Freq: 975.841
>>>> 139 movl R8, RSI # spill
>>>> 13c addl R8, #5 # int
>>>> 140 movslq RDI, R8 # i2l
>>>> 143 movq RAX, [RDX + #16 + RDI << #3] # long
>>>> 148 movq RDI, [RBP + #16 + RDI << #3] # long
>>>> 14d cmpq RAX, RDI
>>>> 150 jne B33 P=0.000000 C=7836.000000
>>>> 150
>>>> 156 B13: # B33 B14 <- B12 Freq: 975.84
>>>> 156 movl R8, RSI # spill
>>>> 159 addl R8, #6 # int
>>>> 15d movslq RDI, R8 # i2l
>>>> 160 movq RAX, [RDX + #16 + RDI << #3] # long
>>>> 165 movq RDI, [RBP + #16 + RDI << #3] # long
>>>> 16a cmpq RAX, RDI
>>>> 16d jne B33 P=0.000000 C=7836.000000
>>>> 16d
>>>> 173 B14: # B33 B15 <- B13 Freq: 975.84
>>>> 173 movl R8, RSI # spill
>>>> 176 addl R8, #7 # int
>>>> 17a movslq RDI, R8 # i2l
>>>> 17d movq RAX, [RDX + #16 + RDI << #3] # long
>>>> 182 movq RDI, [RBP + #16 + RDI << #3] # long
>>>> 187 cmpq RAX, RDI
>>>> 18a jne B33 P=0.000000 C=7836.000000
>>>> 18a
>>>> 190 B15: # B7 B16 <- B14 Freq: 975.839
>>>> 190 addl RSI, #8 # int
>>>> 193 cmpl RSI, R11
>>>> 196 jl B7 # loop end P=0.998980 C=7836.000000
>>>> 
>>>> Same as above the intermediate increment of the induction variable should fold into the address computation but ConvI2L(AddI(x, y)) -> AddL(ConvI2L(x), ConvI2L(y)) is not applied because the compiler loses track of the bounds of the induction variable. The i2l conversions should also fold into the address computations but they don’t for the same reason. The change in loopnode.cpp tries to work around the problem by capturing the bounds of the loop as soon the CountedLoop is created and before other transformations applied to the loop makes it much harder for the compiler to figure the bounds out. I also relaxed the Phi type computation in PhiNode::Value().
>>>> 
>>>> I hit a couple unrelated bugs during testing: the fix in x86_64.ad is obvious. The change to superword is because we sometimes end up there with an AddL while, as I understand, we only expect integer nodes. Using the AddL leads to broken graphs.
>>>> 
>>>> Roland.
>>>> 
>>