LRB and 32-bit compressed oops

Mon Mar 25 20:35:44 UTC 2019

I believe this might just be a consequence of how DecodeN is expanded in
the .ad file:

instruct decodeHeapOop(rRegP dst, rRegN src, rFlagsReg cr) %{
  predicate(n->bottom_type()->is_ptr()->ptr() != TypePtr::NotNull &&
            n->bottom_type()->is_ptr()->ptr() != TypePtr::Constant);
  match(Set dst (DecodeN src));
  effect(KILL cr);
  format %{ "decode_heap_oop $dst,$src" %}
  ins_encode %{
    Register s = $src$$Register;
    Register d = $dst$$Register;
    if (s != d) {
      __ movq(d, s);
    }
    __ decode_heap_oop(d);
  %}
  ins_pipe(ialu_reg_long);
%}

You see it takes two registers, dst and src (which may or may not be the
same, up to regalloc), and then:

    if (s != d) {
      __ movq(d, s);
    }

Well.

I know that C2 can handle instructions like:

  match(Set src (DecodeN src));

I have done this in my nofwdptr patch. However, I suspect this may not
be possible here because of different reg types.

Roland may know more? (Looping in...)

Roman

> Hi again,
> 
> I was following up on experiments with LRB vs non-LRB, and spotted the thing about 32-bit compressed
> oops.
> 
> Run the gc-bench test that writes a single int:
>  https://icedtea.classpath.org/hg/gc-bench/
> 
> $ ~/trunks/shenandoah-jdk/build/linux-x86_64-server-release/images/jdk/bin/java -jar
> target/benchmarks.jar -jvmArgs "-XX:+UnlockExperimentalVMOptions -XX:+UseShenandoahGC"
> writes.Plain.test_int -prof perfasm:printMargin=30 2>&1 | tee lrb.perfasm
> 
> Run with -Xmx20g, thus enabling compressed oops, you shall see this:
> 
>               [Verified Entry Point]
>   6.94%         0x00007f60c0497050: mov    %eax,-0x14000(%rsp)
>   5.80%         0x00007f60c0497057: push   %rbp
>   0.30%         0x00007f60c0497058: sub    $0x10,%rsp
>  11.81%         0x00007f60c049705c: mov    0xc(%rsi),%r11d
>   0.82%         0x00007f60c0497060: mov    %r11,%r9
>   0.48%         0x00007f60c0497063: shl    $0x3,%r9
> .......................... LRB fastpath check ..........................
>   5.29%         0x00007f60c0497067: testb  $0x1,0x20(%r15)
>   5.49%  ╭      0x00007f60c049706c: jne    0x00007f60c0497086
> .........│......... LRB fastpath ends, store to %r9 follows ............
>   0.87%  │↗ ↗↗  0x00007f60c049706e: movl   $0x2a,0xc(%r9)
>   7.59%  ││ ││  0x00007f60c0497076: add    $0x10,%rsp
>   6.12%  ││ ││  0x00007f60c049707a: pop    %rbp
>   1.01%  ││ ││  0x00007f60c049707b: mov    0x108(%r15),%r10
>   0.63%  ││ ││  0x00007f60c0497082: test   %eax,(%r10)
>   6.73%  ││ ││  0x00007f60c0497085: retq
> ---------││-││----------- LRB midpath starts --------------------------
> .........│|.|│............ checking in-cset ...........................
>          ↘│ ││  0x00007f60c0497086: mov    %r9,%r10
>           │ ││  0x00007f60c0497089: shr    $0x17,%r10
>           │ ││  0x00007f60c049708d: movabs $0x7f60d00919f0,%r8
>           │ ││  0x00007f60c0497097: cmpb   $0x0,(%r8,%r10,1)
>           ╰ ││  0x00007f60c049709c: je     0x00007f60c049706e
> ............││............ checking is-forwarded ......................
>             ││  0x00007f60c049709e: mov    -0x8(%r12,%r11,8),%r9
>             ││  0x00007f60c04970a3: lea    (%r12,%r11,8),%r10
>             ││  0x00007f60c04970a7: cmp    %r10,%r9
>             ╰│  0x00007f60c04970aa: jne    0x00007f60c049706e
> .............│............... slow path call ..........................
>              │  0x00007f60c04970ac: mov    %r9,%rdi
>              │  0x00007f60c04970af: movabs $0x7f60d7775030,%r10
>              │  0x00007f60c04970b9: callq  *%r10
>              │  0x00007f60c04970bc: mov    %rax,%r9
>              ╰  0x00007f60c04970bf: jmp    0x00007f60c049706e
> 
> This is actually good code. But if you add -Xmx1g, thus enabling 32-bit compressed oops, you would
> expect decode to go away in favor of just using the (extended) 32-bit value. Shifts are indeed gone,
> but register moves are still there. And that, I think, wastes registers, see:
> 
>               [Verified Entry Point]
>   6.85%         0x00007fb1284982d0: mov    %eax,-0x14000(%rsp)
>   5.71%         0x00007fb1284982d7: push   %rbp
>   3.14%         0x00007fb1284982d8: sub    $0x10,%rsp
>   7.32%         0x00007fb1284982dc: mov    0xc(%rsi),%r11d
>   2.46%         0x00007fb1284982e0: mov    %r11,%r9           <---- !!!!
> .......................... LRB fastpath check ..........................
>   2.97%         0x00007fb1284982e3: testb  $0x1,0x20(%r15)
>   3.45%  ╭      0x00007fb1284982e8: jne    0x00007fb128498302
> .........│......... LRB fastpath ends, store to %r9 follows ............
>   3.51%  │↗ ↗↗  0x00007fb1284982ea: movl   $0x2a,0xc(%r9)
>   7.30%  ││ ││  0x00007fb1284982f2: add    $0x10,%rsp
>   3.12%  ││ ││  0x00007fb1284982f6: pop    %rbp
>   2.91%  ││ ││  0x00007fb1284982f7: mov    0x108(%r15),%r10
>   3.23%  ││ ││  0x00007fb1284982fe: test   %eax,(%r10)
>   4.63%  ││ ││  0x00007fb128498301: retq
> ---------││-││----------- LRB midpath starts --------------------------
> .........│|.|│............ checking in-cset ...........................
>          ↘│ ││  0x00007fb128498302: mov    %r9,%r10
>           │ ││  0x00007fb128498305: shr    $0x13,%r10
>           │ ││  0x00007fb128498309: movabs $0x7fb13808d770,%r8
>           │ ││  0x00007fb128498313: cmpb   $0x0,(%r8,%r10,1)
>           ╰ ││  0x00007fb128498318: je     0x00007fb1284982ea
> ............││............ checking is-forwarded ......................
>             ││  0x00007fb12849831a: mov    -0x8(%r11),%r9
>             ││  0x00007fb12849831e: mov    %r11,%r10         <---- !!!!
>             ││  0x00007fb128498321: cmp    %r10,%r9
>             ╰│  0x00007fb128498324: jne    0x00007fb1284982ea
> .............│............... slow path call ..........................
>              │  0x00007fb128498326: mov    %r9,%rdi
>              │  0x00007fb128498329: movabs $0x7fb13f963030,%r10
>              │  0x00007fb128498333: callq  *%r10
>              │  0x00007fb128498336: mov    %rax,%r9
>              ╰  0x00007fb128498339: jmp    0x00007fb1284982ea
> 
> 32-bit compressed oops mode is interesting, because it is the microservice range. Not sure whether
> it is an LRB problem or a generic C2 one.
> 
> Thanks,
> -Aleksey
> 
>