A hotspot patch for stack profiling (frame pointer)
Vladimir Kozlov
vladimir.kozlov at oracle.com
Wed Jan 14 02:06:53 UTC 2015
Filed RFE:
https://bugs.openjdk.java.net/browse/JDK-8068945
Regards,
Vladimir
On 12/9/14 2:14 AM, Erik Helin wrote:
> I should also add that I don't have enough knowledge of the compiler
> internals to review this patch, sorry.
>
> Thanks,
> Erik
>
> On 2014-12-09 10:53, Erik Helin wrote:
>> I applied the patch on top of jdk9/hs-comp and created a webrev:
>> http://cr.openjdk.java.net/~ehelin/brendan/frame-pointer/webrev/
>>
>> I also successfully ran the patch through JPRT.
>>
>> Thanks,
>> Erik
>>
>> On 2014-12-05 20:57, Brendan Gregg wrote:
>>>
>>>
>>> On Thu, Dec 4, 2014 at 2:55 PM, Brendan Gregg <brendan.d.gregg at gmail.com
>>> <mailto:brendan.d.gregg at gmail.com>> wrote:
>>>
>>> G'Day,
>>>
>>> I've hacked hotspot to return the frame pointer, in part to see what
>>> this involves, and also to have a working prototype for analysis.
>>> Along with an agent to resolve symbols, this has allowed full stack
>>> profiling using Linux perf_events. The following flame graphs show
>>> the resulting profiles.
>>>
>>> A mixed mode CPU flame graph of a vert.x benchmark (click to zoom):
>>>
>>> http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-vertx.svg
>>>
>>> Same thing, but this time disabling inlining, to show more frames:
>>>
>>> http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-flamegraph.svg
>>>
>>> As expected, performance is worse without inlining. You can compare
>>> the flame graphs side by side to see why. Less time spent doing work
>>> / I/O!
>>>
>>>
>>> https://github.com/brendangregg/Misc/blob/master/java/openjdk8_b132-fp.diff
>>>
>>>
>>> is my patch,
>>>
>>> [...]
>>>
>>>
>>> In case there are problems with the patch URL, the patch is:
>>>
>>> --- openjdk8clean/hotspot/src/cpu/x86/vm/x86_64.ad 2014-03-04 02:52:11.000000000 +0000
>>> +++ openjdk8/hotspot/src/cpu/x86/vm/x86_64.ad 2014-11-08 01:10:49.686044933 +0000
>>> @@ -166,10 +166,9 @@
>>> // 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ )
>>> //
>>>
>>> -// Class for all pointer registers (including RSP)
>>> +// Class for all pointer registers (including RSP, excluding RBP)
>>> reg_class any_reg(RAX, RAX_H,
>>> RDX, RDX_H,
>>> - RBP, RBP_H,
>>> RDI, RDI_H,
>>> RSI, RSI_H,
>>> RCX, RCX_H,
>>> @@ -184,10 +183,9 @@
>>> R14, R14_H,
>>> R15, R15_H);
>>>
>>> -// Class for all pointer registers except RSP
>>> +// Class for all pointer registers except RSP and RBP
>>> reg_class ptr_reg(RAX, RAX_H,
>>> RDX, RDX_H,
>>> - RBP, RBP_H,
>>> RDI, RDI_H,
>>> RSI, RSI_H,
>>> RCX, RCX_H,
>>> @@ -199,9 +197,8 @@
>>> R13, R13_H,
>>> R14, R14_H);
>>>
>>> -// Class for all pointer registers except RAX and RSP
>>> +// Class for all pointer registers except RAX, RSP and RBP
>>> reg_class ptr_no_rax_reg(RDX, RDX_H,
>>> - RBP, RBP_H,
>>> RDI, RDI_H,
>>> RSI, RSI_H,
>>> RCX, RCX_H,
>>> @@ -226,9 +223,8 @@
>>> R13, R13_H,
>>> R14, R14_H);
>>>
>>> -// Class for all pointer registers except RAX, RBX and RSP
>>> +// Class for all pointer registers except RAX, RBX, RSP and RBP
>>> reg_class ptr_no_rax_rbx_reg(RDX, RDX_H,
>>> - RBP, RBP_H,
>>> RDI, RDI_H,
>>> RSI, RSI_H,
>>> RCX, RCX_H,
>>> @@ -260,10 +256,9 @@
>>> // Singleton class for TLS pointer
>>> reg_class ptr_r15_reg(R15, R15_H);
>>>
>>> -// Class for all long registers (except RSP)
>>> +// Class for all long registers (except RSP and RBP)
>>> reg_class long_reg(RAX, RAX_H,
>>> RDX, RDX_H,
>>> - RBP, RBP_H,
>>> RDI, RDI_H,
>>> RSI, RSI_H,
>>> RCX, RCX_H,
>>> @@ -275,9 +270,8 @@
>>> R13, R13_H,
>>> R14, R14_H);
>>>
>>> -// Class for all long registers except RAX, RDX (and RSP)
>>> -reg_class long_no_rax_rdx_reg(RBP, RBP_H,
>>> - RDI, RDI_H,
>>> +// Class for all long registers except RAX, RDX (and RSP, RBP)
>>> +reg_class long_no_rax_rdx_reg(RDI, RDI_H,
>>> RSI, RSI_H,
>>> RCX, RCX_H,
>>> RBX, RBX_H,
>>> @@ -288,9 +282,8 @@
>>> R13, R13_H,
>>> R14, R14_H);
>>>
>>> -// Class for all long registers except RCX (and RSP)
>>> -reg_class long_no_rcx_reg(RBP, RBP_H,
>>> - RDI, RDI_H,
>>> +// Class for all long registers except RCX (and RSP, RBP)
>>> +reg_class long_no_rcx_reg(RDI, RDI_H,
>>> RSI, RSI_H,
>>> RAX, RAX_H,
>>> RDX, RDX_H,
>>> @@ -302,9 +295,8 @@
>>> R13, R13_H,
>>> R14, R14_H);
>>>
>>> -// Class for all long registers except RAX (and RSP)
>>> -reg_class long_no_rax_reg(RBP, RBP_H,
>>> - RDX, RDX_H,
>>> +// Class for all long registers except RAX (and RSP, RBP)
>>> +reg_class long_no_rax_reg(RDX, RDX_H,
>>> RDI, RDI_H,
>>> RSI, RSI_H,
>>> RCX, RCX_H,
>>> @@ -325,10 +317,9 @@
>>> // Singleton class for RDX long register
>>> reg_class long_rdx_reg(RDX, RDX_H);
>>>
>>> -// Class for all int registers (except RSP)
>>> +// Class for all int registers (except RSP and RBP)
>>> reg_class int_reg(RAX,
>>> RDX,
>>> - RBP,
>>> RDI,
>>> RSI,
>>> RCX,
>>> @@ -340,10 +331,9 @@
>>> R13,
>>> R14);
>>>
>>> -// Class for all int registers except RCX (and RSP)
>>> +// Class for all int registers except RCX (and RSP, RBP)
>>> reg_class int_no_rcx_reg(RAX,
>>> RDX,
>>> - RBP,
>>> RDI,
>>> RSI,
>>> RBX,
>>> @@ -355,8 +345,7 @@
>>> R14);
>>>
>>> // Class for all int registers except RAX, RDX (and RSP)
>>> -reg_class int_no_rax_rdx_reg(RBP,
>>> - RDI,
>>> +reg_class int_no_rax_rdx_reg(RDI,
>>> RSI,
>>> RCX,
>>> RBX,
>>> @@ -718,6 +707,7 @@
>>> st->print("# stack bang");
>>> st->print("\n\t");
>>> st->print("pushq rbp\t# Save rbp");
>>> + // BDG consider: st->print("movq rbp, rsp\t# ");
>>> if (framesize) {
>>> st->print("\n\t");
>>> st->print("subq rsp, #%d\t# Create frame",framesize);
>>> --- openjdk8clean/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-03-04 02:52:11.000000000 +0000
>>> +++ openjdk8/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-11-07 23:57:11.589593723 +0000
>>> @@ -5236,6 +5236,7 @@
>>> // We always push rbp, so that on return to interpreter rbp, will be
>>> // restored correctly and we can correct the stack.
>>> push(rbp);
>>> + mov(rbp, rsp);
>>> // Remove word for ebp
>>> framesize -= wordSize;
>>>
>>> --- openjdk8clean/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2014-03-04 02:52:10.000000000 +0000
>>> +++ openjdk8/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2014-11-07 23:57:21.933257882 +0000
>>> @@ -358,6 +358,7 @@
>>> generate_stack_overflow_check(frame_size_in_bytes);
>>>
>>> push(rbp);
>>> + mov(rbp, rsp);
>>> #ifdef TIERED
>>> // c2 leaves fpu stack dirty. Clean it on entry
>>> if (UseSSE < 2 ) {
>>>
>>>
>>> Brendan
>
More information about the hotspot-compiler-dev
mailing list