RFR: 8359965: Enable paired pushp and popp instruction usage for APX enabled CPUs

Jatin Bhateja jbhateja at openjdk.org
Wed Jul 2 17:47:41 UTC 2025


On Wed, 2 Jul 2025 17:27:41 GMT, Volodymyr Paprotski <vpaprotski at openjdk.org> wrote:

>> For a cleaner interface, I think we can also maintain a RAII style APXPushPopPairTracker in the stub snippets using push/pop instruction sequence and wrap the actual assembler call underneath. The idea here is to catch the balancing error upfront as PPX is purely a performance hint. Instructions with this hint have the same functional semantics as those without. PPX hints set by the compiler that violate the balancing rule may turn off the PPX
>> optimization, but they will not affect program semantics..
>> 
>> 
>> class APXPushPopPairTracker {
>>     private:
>>         int _counter;
>>  
>>     public:
>>         APXPushPopPairTracker() _counter(0) {
>>         }
>> 
>>        ~APXPushPopPairTracker() {
>>            assert(_counter == 0, "Push/pop pair mismatch");
>>         }
>>      
>>         void push(Register reg, bool has_matching_pop) {
>>             if (has_matching_pop && VM_Version::supports_apx_f()) {
>>                Assembler::pushp(reg);
>>                incrementCounter();
>>             } else {
>>                Assembler::push(reg);
>>             }
>>         }
>>         void pop(Register reg, bool has_matching_push) {
>>             if (has_matching_push && VM_Version::supports_apx_f()) {
>>                Assembler::popp(reg);
>>                decrementCounter();
>>             } else {
>>                Assembler::pop(reg);
>>             }
>>         }     
>>         void incrementCounter() {
>>           _counter++;
>>         }
>>         void decrementCounter() {
>>            _counter--;
>>         }
>> }
>
>> For a cleaner interface, I think we can also maintain a RAII style APXPushPopPairTracker ...
> 
> Using the suggested code as a base, Vamsi and I tinkered with the idea some more! Here is what we came up with. This also tracks the correct order of registers being pushed/poped.. (haven't compiled it, so might have some syntax bugs).
> 
> @dholmes-ora would you mind sharing your opinion? We seem to be making things more complicated, but hopefully in a good way?
> 
> Also included a sample usage in a stub.
> 
> 
> #define __ _masm->
> 
> class PushPopTracker {
>    private:
>       int _counter;
>       MacroAssembler *_masm;
>       const int REGS = 32; // Increase as needed
>       int regs[REGS];
>    public:
>       PushPopTracker(MacroAssembler *_masm) : _counter(0), _masm(_masm) {}
>       ~PushPopTracker() {
>          assert(_counter == 0, "Push/pop pair mismatch");
>       }
> 
>       void push(Register reg) {
>          assert(_counter<REGS, "Push/pop overflow");
>          regs[_counter++] = reg.encoding();
>          if (VM_Version::supports_apx_f()) {
>             __ pushp(reg);
>          } else {
>             __ push(reg);
>          }
>       }
>       void pop(Register reg) {
>          assert(_counter>0, "Push/pop underflow");
>          assert(regs[_counter] == reg.encoding(), "Push/pop pair mismatch: %d != %d", regs[_counter], reg.encoding());
>          _counter--;
>          if (VM_Version::supports_apx_f()) {
>             __ popp(reg);
>          } else {
>             __ pop(reg);
>          }
>       }
> }
> 
> address StubGenerator::generate_intpoly_montgomeryMult_P256() {
>   __ align(CodeEntryAlignment);
>   /*...*/
>   address start = __ pc();
>   __ enter();
>   PushPopTracker s(_masm);
>   s.push(r12); //1
>   s.push(r13); //2
>   s.push(r14); //3
>   #ifdef _WIN64
>   s.push(rsi); //4
>   s.push(rdi); //5
>   #endif
>   s.push(rbp); //6
>   __ movq(rbp, rsp);
>   __ andq(rsp, -32);
>   __ subptr(rsp, 32);
>   // Register Map
>   const Register aLimbs  = c_rarg0; // c_rarg0: rdi | rcx
>   const Register bLimbs  = rsi;     // c_rarg1: rsi | rdx
>   const Register rLimbs  = r8;      // c_rarg2: rdx | r8
>   const Register tmp1    = r9;
>   const Register tmp2    = r10;
>   /*...*/
>   __ movq(rsp, rbp);
>   s.pop(rbp); //5
>   #ifdef _WIN64
>   s.pop(rdi); //4
>   s.pop(rsi); //3
>   #endif
>   s.pop(r14); //2
>   s.pop(r13); //1
>   s.pop(r12); //0
>   __ leave();
>   __ ret(0);
>   return start;
> }

@vamsi-parasa, It's better to make this as a subclass of MacroAssembler in src/hotspot/cpu/x86/macroAssembler_x86.hpp and pass Tracker as an argument to push / pop for a cleaner interface.

-------------

PR Review Comment: https://git.openjdk.org/jdk/pull/25889#discussion_r2180636365


More information about the hotspot-compiler-dev mailing list