LRB and 32-bit compressed oops
Roman Kennke
rkennke at redhat.com
Mon Mar 25 20:35:44 UTC 2019
I believe this might just be a consequence of how DecodeN is expanded in
.ad:
instruct decodeHeapOop(rRegP dst, rRegN src, rFlagsReg cr) %{
predicate(n->bottom_type()->is_ptr()->ptr() != TypePtr::NotNull &&
n->bottom_type()->is_ptr()->ptr() != TypePtr::Constant);
match(Set dst (DecodeN src));
effect(KILL cr);
format %{ "decode_heap_oop $dst,$src" %}
ins_encode %{
Register s = $src$$Register;
Register d = $dst$$Register;
if (s != d) {
__ movq(d, s);
}
__ decode_heap_oop(d);
%}
ins_pipe(ialu_reg_long);
%}
You see it takes two registers, dst and src (which may or may not be the
same, up to regalloc), and then:
if (s != d) {
__ movq(d, s);
}
Well.
I know that C2 can handle instructions like:
match(Set src (DecodeN src));
I have done this in my nofwdptr patch. However, I suspect this may not
be possible here because of different reg types.
Roland may know more? (Looping in...)
Roman
> Hi again,
>
> I was following up on experiments with LRB vs non-LRB, and spotted the thing about 32-bit compressed
> oops.
>
> Run the gc-bench test that writes a single int:
> https://icedtea.classpath.org/hg/gc-bench/
>
> $ ~/trunks/shenandoah-jdk/build/linux-x86_64-server-release/images/jdk/bin/java -jar
> target/benchmarks.jar -jvmArgs "-XX:+UnlockExperimentalVMOptions -XX:+UseShenandoahGC"
> writes.Plain.test_int -prof perfasm:printMargin=30 2>&1 | tee lrb.perfasm
>
> Run with -Xmx20g, thus enabling compressed oops, and you will see this:
>
> [Verified Entry Point]
> 6.94% 0x00007f60c0497050: mov %eax,-0x14000(%rsp)
> 5.80% 0x00007f60c0497057: push %rbp
> 0.30% 0x00007f60c0497058: sub $0x10,%rsp
> 11.81% 0x00007f60c049705c: mov 0xc(%rsi),%r11d
> 0.82% 0x00007f60c0497060: mov %r11,%r9
> 0.48% 0x00007f60c0497063: shl $0x3,%r9
> .......................... LRB fastpath check ..........................
> 5.29% 0x00007f60c0497067: testb $0x1,0x20(%r15)
> 5.49% ╭ 0x00007f60c049706c: jne 0x00007f60c0497086
> .........│......... LRB fastpath ends, store to %r9 follows ............
> 0.87% │↗ ↗↗ 0x00007f60c049706e: movl $0x2a,0xc(%r9)
> 7.59% ││ ││ 0x00007f60c0497076: add $0x10,%rsp
> 6.12% ││ ││ 0x00007f60c049707a: pop %rbp
> 1.01% ││ ││ 0x00007f60c049707b: mov 0x108(%r15),%r10
> 0.63% ││ ││ 0x00007f60c0497082: test %eax,(%r10)
> 6.73% ││ ││ 0x00007f60c0497085: retq
> ---------││-││----------- LRB midpath starts --------------------------
> .........│|.|│............ checking in-cset ...........................
> ↘│ ││ 0x00007f60c0497086: mov %r9,%r10
> │ ││ 0x00007f60c0497089: shr $0x17,%r10
> │ ││ 0x00007f60c049708d: movabs $0x7f60d00919f0,%r8
> │ ││ 0x00007f60c0497097: cmpb $0x0,(%r8,%r10,1)
> ╰ ││ 0x00007f60c049709c: je 0x00007f60c049706e
> ............││............ checking is-forwarded ......................
> ││ 0x00007f60c049709e: mov -0x8(%r12,%r11,8),%r9
> ││ 0x00007f60c04970a3: lea (%r12,%r11,8),%r10
> ││ 0x00007f60c04970a7: cmp %r10,%r9
> ╰│ 0x00007f60c04970aa: jne 0x00007f60c049706e
> .............│............... slow path call ..........................
> │ 0x00007f60c04970ac: mov %r9,%rdi
> │ 0x00007f60c04970af: movabs $0x7f60d7775030,%r10
> │ 0x00007f60c04970b9: callq *%r10
> │ 0x00007f60c04970bc: mov %rax,%r9
> ╰ 0x00007f60c04970bf: jmp 0x00007f60c049706e
>
> This is actually good code. But if you add -Xmx1g, thus enabling 32-bit compressed oops, you would
> expect decode to go away in favor of just using the (extended) 32-bit value. Shifts are indeed gone,
> but register moves are still there. And that, I think, wastes registers, see:
>
> [Verified Entry Point]
> 6.85% 0x00007fb1284982d0: mov %eax,-0x14000(%rsp)
> 5.71% 0x00007fb1284982d7: push %rbp
> 3.14% 0x00007fb1284982d8: sub $0x10,%rsp
> 7.32% 0x00007fb1284982dc: mov 0xc(%rsi),%r11d
> 2.46% 0x00007fb1284982e0: mov %r11,%r9 <---- !!!!
> .......................... LRB fastpath check ..........................
> 2.97% 0x00007fb1284982e3: testb $0x1,0x20(%r15)
> 3.45% ╭ 0x00007fb1284982e8: jne 0x00007fb128498302
> .........│......... LRB fastpath ends, store to %r9 follows ............
> 3.51% │↗ ↗↗ 0x00007fb1284982ea: movl $0x2a,0xc(%r9)
> 7.30% ││ ││ 0x00007fb1284982f2: add $0x10,%rsp
> 3.12% ││ ││ 0x00007fb1284982f6: pop %rbp
> 2.91% ││ ││ 0x00007fb1284982f7: mov 0x108(%r15),%r10
> 3.23% ││ ││ 0x00007fb1284982fe: test %eax,(%r10)
> 4.63% ││ ││ 0x00007fb128498301: retq
> ---------││-││----------- LRB midpath starts --------------------------
> .........│|.|│............ checking in-cset ...........................
> ↘│ ││ 0x00007fb128498302: mov %r9,%r10
> │ ││ 0x00007fb128498305: shr $0x13,%r10
> │ ││ 0x00007fb128498309: movabs $0x7fb13808d770,%r8
> │ ││ 0x00007fb128498313: cmpb $0x0,(%r8,%r10,1)
> ╰ ││ 0x00007fb128498318: je 0x00007fb1284982ea
> ............││............ checking is-forwarded ......................
> ││ 0x00007fb12849831a: mov -0x8(%r11),%r9
> ││ 0x00007fb12849831e: mov %r11,%r10 <---- !!!!
> ││ 0x00007fb128498321: cmp %r10,%r9
> ╰│ 0x00007fb128498324: jne 0x00007fb1284982ea
> .............│............... slow path call ..........................
> │ 0x00007fb128498326: mov %r9,%rdi
> │ 0x00007fb128498329: movabs $0x7fb13f963030,%r10
> │ 0x00007fb128498333: callq *%r10
> │ 0x00007fb128498336: mov %rax,%r9
> ╰ 0x00007fb128498339: jmp 0x00007fb1284982ea
>
> 32-bit compressed oops mode is interesting, because it is the microservice range. Not sure whether
> it is an LRB problem or a generic C2 one.
>
> Thanks,
> -Aleksey
>
>
More information about the shenandoah-dev
mailing list