A hotspot patch for stack profiling (frame pointer)

Vladimir Kozlov vladimir.kozlov at oracle.com
Wed Jan 14 02:06:53 UTC 2015


Filed RFE:

https://bugs.openjdk.java.net/browse/JDK-8068945

Regards,
Vladimir

On 12/9/14 2:14 AM, Erik Helin wrote:
> I should also add that I don't have enough knowledge of the compiler
> internals to review this patch, sorry.
>
> Thanks,
> Erik
>
> On 2014-12-09 10:53, Erik Helin wrote:
>> I applied the patch on top of jdk9/hs-comp and created a webrev:
>> http://cr.openjdk.java.net/~ehelin/brendan/frame-pointer/webrev/
>>
>> I also successfully ran the patch through JPRT.
>>
>> Thanks,
>> Erik
>>
>> On 2014-12-05 20:57, Brendan Gregg wrote:
>>>
>>>
>>> On Thu, Dec 4, 2014 at 2:55 PM, Brendan Gregg <brendan.d.gregg at gmail.com
>>> <mailto:brendan.d.gregg at gmail.com>> wrote:
>>>
>>>     G'Day,
>>>
>>>     I've hacked hotspot to return the frame pointer, in part to see what
>>>     this involves, and also to have a working prototype for analysis.
>>>     Along with an agent to resolve symbols, this has allowed full stack
>>>     profiling using Linux perf_events. The following flame graphs show
>>>     the resulting profiles.
>>>
>>>     A mixed mode CPU flame graph of a vert.x benchmark (click to zoom):
>>>
>>>     http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-vertx.svg
>>>
>>>     Same thing, but this time disabling inlining, to show more frames:
>>>
>>>     http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-flamegraph.svg
>>>
>>>     As expected, performance is worse without inlining. You can compare
>>>     the flame graphs side by side to see why. Less time spent doing work
>>>     / I/O!
>>>
>>>
>>> https://github.com/brendangregg/Misc/blob/master/java/openjdk8_b132-fp.diff
>>>
>>>
>>>     is my patch,
>>>
>>>     [...]
>>>
>>>
>>> In case there are problems with the patch URL, the patch is:
>>>
>>> --- openjdk8clean/hotspot/src/cpu/x86/vm/x86_64.ad   2014-03-04 02:52:11.000000000 +0000
>>> +++ openjdk8/hotspot/src/cpu/x86/vm/x86_64.ad   2014-11-08 01:10:49.686044933 +0000
>>> @@ -166,10 +166,9 @@
>>>   // 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ )
>>>   //
>>>
>>> -// Class for all pointer registers (including RSP)
>>> +// Class for all pointer registers (including RSP, excluding RBP)
>>>   reg_class any_reg(RAX, RAX_H,
>>>                     RDX, RDX_H,
>>> -                  RBP, RBP_H,
>>>                     RDI, RDI_H,
>>>                     RSI, RSI_H,
>>>                     RCX, RCX_H,
>>> @@ -184,10 +183,9 @@
>>>                     R14, R14_H,
>>>                     R15, R15_H);
>>>
>>> -// Class for all pointer registers except RSP
>>> +// Class for all pointer registers except RSP and RBP
>>>   reg_class ptr_reg(RAX, RAX_H,
>>>                     RDX, RDX_H,
>>> -                  RBP, RBP_H,
>>>                     RDI, RDI_H,
>>>                     RSI, RSI_H,
>>>                     RCX, RCX_H,
>>> @@ -199,9 +197,8 @@
>>>                     R13, R13_H,
>>>                     R14, R14_H);
>>>
>>> -// Class for all pointer registers except RAX and RSP
>>> +// Class for all pointer registers except RAX, RSP and RBP
>>>   reg_class ptr_no_rax_reg(RDX, RDX_H,
>>> -                         RBP, RBP_H,
>>>                            RDI, RDI_H,
>>>                            RSI, RSI_H,
>>>                            RCX, RCX_H,
>>> @@ -226,9 +223,8 @@
>>>                            R13, R13_H,
>>>                            R14, R14_H);
>>>
>>> -// Class for all pointer registers except RAX, RBX and RSP
>>> +// Class for all pointer registers except RAX, RBX, RSP and RBP
>>>   reg_class ptr_no_rax_rbx_reg(RDX, RDX_H,
>>> -                             RBP, RBP_H,
>>>                                RDI, RDI_H,
>>>                                RSI, RSI_H,
>>>                                RCX, RCX_H,
>>> @@ -260,10 +256,9 @@
>>>   // Singleton class for TLS pointer
>>>   reg_class ptr_r15_reg(R15, R15_H);
>>>
>>> -// Class for all long registers (except RSP)
>>> +// Class for all long registers (except RSP and RBP)
>>>   reg_class long_reg(RAX, RAX_H,
>>>                      RDX, RDX_H,
>>> -                   RBP, RBP_H,
>>>                      RDI, RDI_H,
>>>                      RSI, RSI_H,
>>>                      RCX, RCX_H,
>>> @@ -275,9 +270,8 @@
>>>                      R13, R13_H,
>>>                      R14, R14_H);
>>>
>>> -// Class for all long registers except RAX, RDX (and RSP)
>>> -reg_class long_no_rax_rdx_reg(RBP, RBP_H,
>>> -                              RDI, RDI_H,
>>> +// Class for all long registers except RAX, RDX (and RSP, RBP)
>>> +reg_class long_no_rax_rdx_reg(RDI, RDI_H,
>>>                                 RSI, RSI_H,
>>>                                 RCX, RCX_H,
>>>                                 RBX, RBX_H,
>>> @@ -288,9 +282,8 @@
>>>                                 R13, R13_H,
>>>                                 R14, R14_H);
>>>
>>> -// Class for all long registers except RCX (and RSP)
>>> -reg_class long_no_rcx_reg(RBP, RBP_H,
>>> -                          RDI, RDI_H,
>>> +// Class for all long registers except RCX (and RSP, RBP)
>>> +reg_class long_no_rcx_reg(RDI, RDI_H,
>>>                             RSI, RSI_H,
>>>                             RAX, RAX_H,
>>>                             RDX, RDX_H,
>>> @@ -302,9 +295,8 @@
>>>                             R13, R13_H,
>>>                             R14, R14_H);
>>>
>>> -// Class for all long registers except RAX (and RSP)
>>> -reg_class long_no_rax_reg(RBP, RBP_H,
>>> -                          RDX, RDX_H,
>>> +// Class for all long registers except RAX (and RSP, RBP)
>>> +reg_class long_no_rax_reg(RDX, RDX_H,
>>>                             RDI, RDI_H,
>>>                             RSI, RSI_H,
>>>                             RCX, RCX_H,
>>> @@ -325,10 +317,9 @@
>>>   // Singleton class for RDX long register
>>>   reg_class long_rdx_reg(RDX, RDX_H);
>>>
>>> -// Class for all int registers (except RSP)
>>> +// Class for all int registers (except RSP and RBP)
>>>   reg_class int_reg(RAX,
>>>                     RDX,
>>> -                  RBP,
>>>                     RDI,
>>>                     RSI,
>>>                     RCX,
>>> @@ -340,10 +331,9 @@
>>>                     R13,
>>>                     R14);
>>>
>>> -// Class for all int registers except RCX (and RSP)
>>> +// Class for all int registers except RCX (and RSP, RBP)
>>>   reg_class int_no_rcx_reg(RAX,
>>>                            RDX,
>>> -                         RBP,
>>>                            RDI,
>>>                            RSI,
>>>                            RBX,
>>> @@ -355,8 +345,7 @@
>>>                            R14);
>>>
>>>   // Class for all int registers except RAX, RDX (and RSP)
>>> -reg_class int_no_rax_rdx_reg(RBP,
>>> -                             RDI,
>>> +reg_class int_no_rax_rdx_reg(RDI,
>>>                                RSI,
>>>                                RCX,
>>>                                RBX,
>>> @@ -718,6 +707,7 @@
>>>       st->print("# stack bang");
>>>       st->print("\n\t");
>>>       st->print("pushq   rbp\t# Save rbp");
>>> +    // BDG consider: st->print("movq    rbp, rsp\t# ");
>>>       if (framesize) {
>>>         st->print("\n\t");
>>>         st->print("subq    rsp, #%d\t# Create frame",framesize);
>>> --- openjdk8clean/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp   2014-03-04 02:52:11.000000000 +0000
>>> +++ openjdk8/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp   2014-11-07 23:57:11.589593723 +0000
>>> @@ -5236,6 +5236,7 @@
>>>       // We always push rbp, so that on return to interpreter rbp, will be
>>>       // restored correctly and we can correct the stack.
>>>       push(rbp);
>>> +    mov(rbp, rsp);
>>>       // Remove word for ebp
>>>       framesize -= wordSize;
>>>
>>> --- openjdk8clean/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp   2014-03-04 02:52:10.000000000 +0000
>>> +++ openjdk8/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp   2014-11-07 23:57:21.933257882 +0000
>>> @@ -358,6 +358,7 @@
>>>     generate_stack_overflow_check(frame_size_in_bytes);
>>>
>>>     push(rbp);
>>> +  mov(rbp, rsp);
>>>   #ifdef TIERED
>>>     // c2 leaves fpu stack dirty. Clean it on entry
>>>     if (UseSSE < 2 ) {
>>>
>>>
>>> Brendan
>


More information about the hotspot-compiler-dev mailing list