A hotspot patch for stack profiling (frame pointer)
Erik Helin
erik.helin at oracle.com
Tue Dec 9 10:14:54 UTC 2014
I should also add that I don't have enough knowledge of the compiler
internals to review this patch, sorry.
Thanks,
Erik
On 2014-12-09 10:53, Erik Helin wrote:
> I applied the patch on top of jdk9/hs-comp and created a webrev:
> http://cr.openjdk.java.net/~ehelin/brendan/frame-pointer/webrev/
>
> I also successfully ran the patch through JPRT.
>
> Thanks,
> Erik
>
> On 2014-12-05 20:57, Brendan Gregg wrote:
>>
>>
>> On Thu, Dec 4, 2014 at 2:55 PM, Brendan Gregg <brendan.d.gregg at gmail.com
>> <mailto:brendan.d.gregg at gmail.com>> wrote:
>>
>> G'Day,
>>
>> I've hacked hotspot to return the frame pointer, in part to see what
>> this involves, and also to have a working prototype for analysis.
>> Along with an agent to resolve symbols, this has allowed full stack
>> profiling using Linux perf_events. The following flame graphs show
>> the resulting profiles.
>>
>> A mixed mode CPU flame graph of a vert.x benchmark (click to zoom):
>>
>> http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-vertx.svg
>>
>> Same thing, but this time disabling inlining, to show more frames:
>>
>> http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-flamegraph.svg
>>
>> As expected, performance is worse without inlining. You can compare
>> the flame graphs side by side to see why. Less time spent doing work
>> / I/O!
>>
>>
>> https://github.com/brendangregg/Misc/blob/master/java/openjdk8_b132-fp.diff
>>
>> is my patch,
>>
>> [...]
>>
>>
>> In case there are problems with the patch URL, the patch is:
>>
>> --- openjdk8clean/hotspot/src/cpu/x86/vm/x86_64.ad 2014-03-04 02:52:11.000000000 +0000
>> +++ openjdk8/hotspot/src/cpu/x86/vm/x86_64.ad 2014-11-08 01:10:49.686044933 +0000
>> @@ -166,10 +166,9 @@
>> // 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ )
>> //
>>
>> -// Class for all pointer registers (including RSP)
>> +// Class for all pointer registers (including RSP, excluding RBP)
>> reg_class any_reg(RAX, RAX_H,
>> RDX, RDX_H,
>> - RBP, RBP_H,
>> RDI, RDI_H,
>> RSI, RSI_H,
>> RCX, RCX_H,
>> @@ -184,10 +183,9 @@
>> R14, R14_H,
>> R15, R15_H);
>>
>> -// Class for all pointer registers except RSP
>> +// Class for all pointer registers except RSP and RBP
>> reg_class ptr_reg(RAX, RAX_H,
>> RDX, RDX_H,
>> - RBP, RBP_H,
>> RDI, RDI_H,
>> RSI, RSI_H,
>> RCX, RCX_H,
>> @@ -199,9 +197,8 @@
>> R13, R13_H,
>> R14, R14_H);
>>
>> -// Class for all pointer registers except RAX and RSP
>> +// Class for all pointer registers except RAX, RSP and RBP
>> reg_class ptr_no_rax_reg(RDX, RDX_H,
>> - RBP, RBP_H,
>> RDI, RDI_H,
>> RSI, RSI_H,
>> RCX, RCX_H,
>> @@ -226,9 +223,8 @@
>> R13, R13_H,
>> R14, R14_H);
>>
>> -// Class for all pointer registers except RAX, RBX and RSP
>> +// Class for all pointer registers except RAX, RBX, RSP and RBP
>> reg_class ptr_no_rax_rbx_reg(RDX, RDX_H,
>> - RBP, RBP_H,
>> RDI, RDI_H,
>> RSI, RSI_H,
>> RCX, RCX_H,
>> @@ -260,10 +256,9 @@
>> // Singleton class for TLS pointer
>> reg_class ptr_r15_reg(R15, R15_H);
>>
>> -// Class for all long registers (except RSP)
>> +// Class for all long registers (except RSP and RBP)
>> reg_class long_reg(RAX, RAX_H,
>> RDX, RDX_H,
>> - RBP, RBP_H,
>> RDI, RDI_H,
>> RSI, RSI_H,
>> RCX, RCX_H,
>> @@ -275,9 +270,8 @@
>> R13, R13_H,
>> R14, R14_H);
>>
>> -// Class for all long registers except RAX, RDX (and RSP)
>> -reg_class long_no_rax_rdx_reg(RBP, RBP_H,
>> - RDI, RDI_H,
>> +// Class for all long registers except RAX, RDX (and RSP, RBP)
>> +reg_class long_no_rax_rdx_reg(RDI, RDI_H,
>> RSI, RSI_H,
>> RCX, RCX_H,
>> RBX, RBX_H,
>> @@ -288,9 +282,8 @@
>> R13, R13_H,
>> R14, R14_H);
>>
>> -// Class for all long registers except RCX (and RSP)
>> -reg_class long_no_rcx_reg(RBP, RBP_H,
>> - RDI, RDI_H,
>> +// Class for all long registers except RCX (and RSP, RBP)
>> +reg_class long_no_rcx_reg(RDI, RDI_H,
>> RSI, RSI_H,
>> RAX, RAX_H,
>> RDX, RDX_H,
>> @@ -302,9 +295,8 @@
>> R13, R13_H,
>> R14, R14_H);
>>
>> -// Class for all long registers except RAX (and RSP)
>> -reg_class long_no_rax_reg(RBP, RBP_H,
>> - RDX, RDX_H,
>> +// Class for all long registers except RAX (and RSP, RBP)
>> +reg_class long_no_rax_reg(RDX, RDX_H,
>> RDI, RDI_H,
>> RSI, RSI_H,
>> RCX, RCX_H,
>> @@ -325,10 +317,9 @@
>> // Singleton class for RDX long register
>> reg_class long_rdx_reg(RDX, RDX_H);
>>
>> -// Class for all int registers (except RSP)
>> +// Class for all int registers (except RSP and RBP)
>> reg_class int_reg(RAX,
>> RDX,
>> - RBP,
>> RDI,
>> RSI,
>> RCX,
>> @@ -340,10 +331,9 @@
>> R13,
>> R14);
>>
>> -// Class for all int registers except RCX (and RSP)
>> +// Class for all int registers except RCX (and RSP, RBP)
>> reg_class int_no_rcx_reg(RAX,
>> RDX,
>> - RBP,
>> RDI,
>> RSI,
>> RBX,
>> @@ -355,8 +345,7 @@
>> R14);
>>
>> // Class for all int registers except RAX, RDX (and RSP)
>> -reg_class int_no_rax_rdx_reg(RBP,
>> - RDI,
>> +reg_class int_no_rax_rdx_reg(RDI,
>> RSI,
>> RCX,
>> RBX,
>> @@ -718,6 +707,7 @@
>> st->print("# stack bang");
>> st->print("\n\t");
>> st->print("pushq rbp\t# Save rbp");
>> + // BDG consider: st->print("movq rbp, rsp\t# ");
>> if (framesize) {
>> st->print("\n\t");
>> st->print("subq rsp, #%d\t# Create frame",framesize);
>> --- openjdk8clean/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-03-04 02:52:11.000000000 +0000
>> +++ openjdk8/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-11-07 23:57:11.589593723 +0000
>> @@ -5236,6 +5236,7 @@
>> // We always push rbp, so that on return to interpreter rbp, will be
>> // restored correctly and we can correct the stack.
>> push(rbp);
>> + mov(rbp, rsp);
>> // Remove word for ebp
>> framesize -= wordSize;
>>
>> --- openjdk8clean/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2014-03-04 02:52:10.000000000 +0000
>> +++ openjdk8/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2014-11-07 23:57:21.933257882 +0000
>> @@ -358,6 +358,7 @@
>> generate_stack_overflow_check(frame_size_in_bytes);
>>
>> push(rbp);
>> + mov(rbp, rsp);
>> #ifdef TIERED
>> // c2 leaves fpu stack dirty. Clean it on entry
>> if (UseSSE < 2 ) {
>>
>>
>> Brendan
More information about the serviceability-dev
mailing list