RFC: 8201193 C2 Object Initialization - Using XMM/YMM registers
Vladimir Kozlov
vladimir.kozlov at oracle.com
Mon Jun 11 20:41:04 UTC 2018
Okay, I will send an official RFR.
Thanks,
Vladimir
On 6/9/18 10:00 PM, Rohit Arul Raj wrote:
> Hello Vladimir,
>
> Thank you very much for the review.
>
> Please find my comments in-lined below.
>
>> I hit an assert in Assembler::vpxor() when run on a machine which does not have AVX:
>> assert(UseAVX > 0) failed: requires some form of AVX
>> I think you can use pxor() instead of vpxor() in the else case:
>> if (UseAVX >= 2) {
>> vpxor(xtmp, xtmp, xtmp, AVX_256bit);
>> } else {
>> pxor(xtmp, xtmp);
>> }
>
> Yes, this change should be fine. My patch would have worked only on
> processors with AVX support.
>
>> EAX zeroing - it is required for the UseFastStosb case too.
>>
>> BTW, your code is executed only when UseFastStosb is false. Is that intentional? Maybe add a UseFastStosb
>> check in the vm_version_x86.cpp code too.
>
> Our AMD processors don't have ERMS (Enhanced REP MOVSB/STOSB) support,
> so UseFastStosb would always have been false.
> Your patch has a much better generic check for this.
>
>> You can also do the UseXMMForObjInit setting in vm_version_x86.cpp, similar to the UseBMI1Instructions
>> flag setting, to avoid checking UseUnalignedLoadStores and other flags in all places.
>
> Yes, right.
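>
> To sketch what that could look like (a hypothetical reading of the
> UseBMI1Instructions pattern; this is not the actual webrev code, and the
> exact placement and conditions are my assumptions):
>
>   // vm_version_x86.cpp (sketch): turn the flag off when the CPU cannot
>   // take the XMM/YMM path, so later code only tests UseXMMForObjInit.
>   if (!supports_sse2() || !UseUnalignedLoadStores) {
>     if (UseXMMForObjInit && !FLAG_IS_DEFAULT(UseXMMForObjInit)) {
>       warning("UseXMMForObjInit requires SSE2 and unaligned load/stores. Feature is switched off.");
>     }
>     FLAG_SET_DEFAULT(UseXMMForObjInit, false);
>   }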
>
>> Here is code with my changes:
>> http://cr.openjdk.java.net/~kvn/8201193/webrev.02/
>
> The patch looks good. We are OK with the updated changes.
>
> Thanks,
> Rohit
>
>>> Thank you, Rohit
>>>
>>> This change looks reasonable. Let me test it.
>>>
>>> Thanks,
>>> Vladimir
>>>
>>> On 5/30/18 9:55 PM, Rohit Arul Raj wrote:
>>>> Thanks Vladimir,
>>>>
>>>> I made the changes as you suggested and it works now.
>>>> Please find attached the updated patch, the relevant test case, as
>>>> well as the micro-benchmark performance data.
>>>> Sorry for the delay.
>>>>
>>>> **************** P A T C H **************
>>>>
>>>> diff --git a/src/hotspot/cpu/x86/globals_x86.hpp b/src/hotspot/cpu/x86/globals_x86.hpp
>>>> --- a/src/hotspot/cpu/x86/globals_x86.hpp
>>>> +++ b/src/hotspot/cpu/x86/globals_x86.hpp
>>>> @@ -150,6 +150,9 @@
>>>>    product(bool, UseUnalignedLoadStores, false,                         \
>>>>            "Use SSE2 MOVDQU instruction for Arraycopy")                 \
>>>>                                                                         \
>>>> +  product(bool, UseXMMForObjInit, false,                               \
>>>> +          "Use XMM/YMM MOVDQU instruction for Object Initialization")  \
>>>> +                                                                       \
>>>>    product(bool, UseFastStosb, false,                                   \
>>>>            "Use fast-string operation for zeroing: rep stosb")          \
>>>>                                                                         \
>>>> diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> @@ -6775,7 +6775,58 @@
>>>>
>>>> }
>>>>
>>>> -void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) {
>>>> +// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
>>>> +void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
>>>> + // cnt - number of qwords (8-byte words).
>>>> + // base - start address, qword aligned.
>>>> + Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
>>>> + if (UseAVX >= 2)
>>>> + vpxor(xtmp, xtmp, xtmp, AVX_256bit);
>>>> + else
>>>> + vpxor(xtmp, xtmp, xtmp, AVX_128bit);
>>>> + jmp(L_zero_64_bytes);
>>>> +
>>>> + BIND(L_loop);
>>>> + if (UseAVX >= 2) {
>>>> + vmovdqu(Address(base, 0), xtmp);
>>>> + vmovdqu(Address(base, 32), xtmp);
>>>> + } else {
>>>> + movdqu(Address(base, 0), xtmp);
>>>> + movdqu(Address(base, 16), xtmp);
>>>> + movdqu(Address(base, 32), xtmp);
>>>> + movdqu(Address(base, 48), xtmp);
>>>> + }
>>>> + addptr(base, 64);
>>>> +
>>>> + BIND(L_zero_64_bytes);
>>>> + subptr(cnt, 8);
>>>> + jccb(Assembler::greaterEqual, L_loop);
>>>> + addptr(cnt, 4);
>>>> + jccb(Assembler::less, L_tail);
>>>> + // Copy trailing 32 bytes
>>>> + if (UseAVX >= 2) {
>>>> + vmovdqu(Address(base, 0), xtmp);
>>>> + } else {
>>>> + movdqu(Address(base, 0), xtmp);
>>>> + movdqu(Address(base, 16), xtmp);
>>>> + }
>>>> + addptr(base, 32);
>>>> + subptr(cnt, 4);
>>>> +
>>>> + BIND(L_tail);
>>>> + addptr(cnt, 4);
>>>> + jccb(Assembler::lessEqual, L_end);
>>>> + decrement(cnt);
>>>> +
>>>> + BIND(L_sloop);
>>>> + movq(Address(base, 0), xtmp);
>>>> + addptr(base, 8);
>>>> + decrement(cnt);
>>>> + jccb(Assembler::greaterEqual, L_sloop);
>>>> + BIND(L_end);
>>>> +}
>>>> +
>>>> +void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
>>>> // cnt - number of qwords (8-byte words).
>>>> // base - start address, qword aligned.
>>>> // is_large - if optimizers know cnt is larger than InitArrayShortSize
>>>> @@ -6787,7 +6838,9 @@
>>>>
>>>> Label DONE;
>>>>
>>>> - xorptr(tmp, tmp);
>>>> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
>>>> + xorptr(tmp, tmp);
>>>> + }
>>>>
>>>> if (!is_large) {
>>>> Label LOOP, LONG;
>>>> @@ -6813,6 +6866,9 @@
>>>> if (UseFastStosb) {
>>>> shlptr(cnt, 3); // convert to number of bytes
>>>> rep_stosb();
>>>> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
>>>> + movptr(tmp, base);
>>>> + xmm_clear_mem(tmp, cnt, xtmp);
>>>> } else {
>>>>    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
>>>> rep_stos();
>>>> diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
>>>> --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
>>>> +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
>>>> @@ -1578,7 +1578,10 @@
>>>>
>>>> // clear memory of size 'cnt' qwords, starting at 'base';
>>>> // if 'is_large' is set, do not try to produce short loop
>>>> - void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
>>>> + void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large);
>>>> +
>>>> + // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
>>>> + void xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp);
>>>>
>>>> #ifdef COMPILER2
>>>> void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
>>>> diff --git a/src/hotspot/cpu/x86/x86_32.ad b/src/hotspot/cpu/x86/x86_32.ad
>>>> --- a/src/hotspot/cpu/x86/x86_32.ad
>>>> +++ b/src/hotspot/cpu/x86/x86_32.ad
>>>> @@ -11482,13 +11482,15 @@
>>>>
>>>> // =======================================================================
>>>> // fast clearing of an array
>>>> -instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
>>>> +instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
>>>> predicate(!((ClearArrayNode*)n)->is_large());
>>>> match(Set dummy (ClearArray cnt base));
>>>> - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
>>>> + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
>>>>
>>>> format %{ $$template
>>>> - $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
>>>> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
>>>> + $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
>>>> + }
>>>> $$emit$$"CMP InitArrayShortSize,rcx\n\t"
>>>> $$emit$$"JG LARGE\n\t"
>>>> $$emit$$"SHL ECX, 1\n\t"
>>>> @@ -11502,6 +11504,32 @@
>>>> if (UseFastStosb) {
>>>> $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
>>>> $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
>>>> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
>>>> + $$emit$$"MOV RDI,RAX\n\t"
>>>> + $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t"
>>>> + $$emit$$"JMPQ L_zero_64_bytes\n\t"
>>>> + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
>>>> + $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
>>>> + $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
>>>> + $$emit$$"ADD 0x40,RAX\n\t"
>>>> + $$emit$$"# L_zero_64_bytes:\n\t"
>>>> + $$emit$$"SUB 0x8,RCX\n\t"
>>>> + $$emit$$"JGE L_loop\n\t"
>>>> + $$emit$$"ADD 0x4,RCX\n\t"
>>>> + $$emit$$"JL L_tail\n\t"
>>>> + $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
>>>> + $$emit$$"ADD 0x20,RAX\n\t"
>>>> + $$emit$$"SUB 0x4,RCX\n\t"
>>>> + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
>>>> + $$emit$$"ADD 0x4,RCX\n\t"
>>>> + $$emit$$"JLE L_end\n\t"
>>>> + $$emit$$"DEC RCX\n\t"
>>>> + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
>>>> + $$emit$$"VMOVQ XMM0,(RAX)\n\t"
>>>> + $$emit$$"ADD 0x8,RAX\n\t"
>>>> + $$emit$$"DEC RCX\n\t"
>>>> + $$emit$$"JGE L_sloop\n\t"
>>>> + $$emit$$"# L_end:\n\t"
>>>> } else {
>>>> $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
>>>> $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
>>>> @@ -11509,20 +11537,49 @@
>>>> $$emit$$"# DONE"
>>>> %}
>>>> ins_encode %{
>>>> - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
>>>> - %}
>>>> - ins_pipe( pipe_slow );
>>>> -%}
>>>> -
>>>> -instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
>>>> + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
>>>> + $tmp$$XMMRegister, false);
>>>> + %}
>>>> + ins_pipe( pipe_slow );
>>>> +%}
>>>> +
>>>> +instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
>>>> predicate(((ClearArrayNode*)n)->is_large());
>>>> match(Set dummy (ClearArray cnt base));
>>>> - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
>>>> + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
>>>> format %{ $$template
>>>> - $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
>>>> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
>>>> + $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
>>>> + }
>>>> if (UseFastStosb) {
>>>> $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
>>>> $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
>>>> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
>>>> + $$emit$$"MOV RDI,RAX\n\t"
>>>> + $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t"
>>>> + $$emit$$"JMPQ L_zero_64_bytes\n\t"
>>>> + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
>>>> + $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
>>>> + $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
>>>> + $$emit$$"ADD 0x40,RAX\n\t"
>>>> + $$emit$$"# L_zero_64_bytes:\n\t"
>>>> + $$emit$$"SUB 0x8,RCX\n\t"
>>>> + $$emit$$"JGE L_loop\n\t"
>>>> + $$emit$$"ADD 0x4,RCX\n\t"
>>>> + $$emit$$"JL L_tail\n\t"
>>>> + $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
>>>> + $$emit$$"ADD 0x20,RAX\n\t"
>>>> + $$emit$$"SUB 0x4,RCX\n\t"
>>>> + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
>>>> + $$emit$$"ADD 0x4,RCX\n\t"
>>>> + $$emit$$"JLE L_end\n\t"
>>>> + $$emit$$"DEC RCX\n\t"
>>>> + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
>>>> + $$emit$$"VMOVQ XMM0,(RAX)\n\t"
>>>> + $$emit$$"ADD 0x8,RAX\n\t"
>>>> + $$emit$$"DEC RCX\n\t"
>>>> + $$emit$$"JGE L_sloop\n\t"
>>>> + $$emit$$"# L_end:\n\t"
>>>> } else {
>>>> $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
>>>> $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
>>>> @@ -11530,7 +11587,8 @@
>>>> $$emit$$"# DONE"
>>>> %}
>>>> ins_encode %{
>>>> - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
>>>> + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
>>>> + $tmp$$XMMRegister, true);
>>>> %}
>>>> ins_pipe( pipe_slow );
>>>> %}
>>>> diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
>>>> --- a/src/hotspot/cpu/x86/x86_64.ad
>>>> +++ b/src/hotspot/cpu/x86/x86_64.ad
>>>> @@ -10625,15 +10625,17 @@
>>>>
>>>> // =======================================================================
>>>> // fast clearing of an array
>>>> -instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
>>>> - rFlagsReg cr)
>>>> +instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
>>>> + Universe dummy, rFlagsReg cr)
>>>> %{
>>>> predicate(!((ClearArrayNode*)n)->is_large());
>>>> match(Set dummy (ClearArray cnt base));
>>>> - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
>>>> + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
>>>>
>>>> format %{ $$template
>>>> - $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
>>>> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
>>>> + $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
>>>> + }
>>>> $$emit$$"cmp InitArrayShortSize,rcx\n\t"
>>>> $$emit$$"jg LARGE\n\t"
>>>> $$emit$$"dec rcx\n\t"
>>>> @@ -10646,35 +10648,91 @@
>>>> if (UseFastStosb) {
>>>> $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
>>>> $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
>>>> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
>>>> + $$emit$$"mov rdi,rax\n\t"
>>>> + $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
>>>> + $$emit$$"jmpq L_zero_64_bytes\n\t"
>>>> + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
>>>> + $$emit$$"vmovdqu ymm0,(rax)\n\t"
>>>> + $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
>>>> + $$emit$$"add 0x40,rax\n\t"
>>>> + $$emit$$"# L_zero_64_bytes:\n\t"
>>>> + $$emit$$"sub 0x8,rcx\n\t"
>>>> + $$emit$$"jge L_loop\n\t"
>>>> + $$emit$$"add 0x4,rcx\n\t"
>>>> + $$emit$$"jl L_tail\n\t"
>>>> + $$emit$$"vmovdqu ymm0,(rax)\n\t"
>>>> + $$emit$$"add 0x20,rax\n\t"
>>>> + $$emit$$"sub 0x4,rcx\n\t"
>>>> + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
>>>> + $$emit$$"add 0x4,rcx\n\t"
>>>> + $$emit$$"jle L_end\n\t"
>>>> + $$emit$$"dec rcx\n\t"
>>>> + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
>>>> + $$emit$$"vmovq xmm0,(rax)\n\t"
>>>> + $$emit$$"add 0x8,rax\n\t"
>>>> + $$emit$$"dec rcx\n\t"
>>>> + $$emit$$"jge L_sloop\n\t"
>>>> + $$emit$$"# L_end:\n\t"
>>>> } else {
>>>> $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
>>>> }
>>>> $$emit$$"# DONE"
>>>> %}
>>>> ins_encode %{
>>>> - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
>>>> + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
>>>> + $tmp$$XMMRegister, false);
>>>> %}
>>>> ins_pipe(pipe_slow);
>>>> %}
>>>>
>>>> -instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
>>>> - rFlagsReg cr)
>>>> +instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
>>>> + Universe dummy, rFlagsReg cr)
>>>> %{
>>>> predicate(((ClearArrayNode*)n)->is_large());
>>>> match(Set dummy (ClearArray cnt base));
>>>> - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
>>>> + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
>>>>
>>>> format %{ $$template
>>>> - $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
>>>> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
>>>> + $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
>>>> + }
>>>> if (UseFastStosb) {
>>>> $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
>>>> $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
>>>> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
>>>> + $$emit$$"mov rdi,rax\n\t"
>>>> + $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
>>>> + $$emit$$"jmpq L_zero_64_bytes\n\t"
>>>> + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
>>>> + $$emit$$"vmovdqu ymm0,(rax)\n\t"
>>>> + $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
>>>> + $$emit$$"add 0x40,rax\n\t"
>>>> + $$emit$$"# L_zero_64_bytes:\n\t"
>>>> + $$emit$$"sub 0x8,rcx\n\t"
>>>> + $$emit$$"jge L_loop\n\t"
>>>> + $$emit$$"add 0x4,rcx\n\t"
>>>> + $$emit$$"jl L_tail\n\t"
>>>> + $$emit$$"vmovdqu ymm0,(rax)\n\t"
>>>> + $$emit$$"add 0x20,rax\n\t"
>>>> + $$emit$$"sub 0x4,rcx\n\t"
>>>> + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
>>>> + $$emit$$"add 0x4,rcx\n\t"
>>>> + $$emit$$"jle L_end\n\t"
>>>> + $$emit$$"dec rcx\n\t"
>>>> + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
>>>> + $$emit$$"vmovq xmm0,(rax)\n\t"
>>>> + $$emit$$"add 0x8,rax\n\t"
>>>> + $$emit$$"dec rcx\n\t"
>>>> + $$emit$$"jge L_sloop\n\t"
>>>> + $$emit$$"# L_end:\n\t"
>>>> } else {
>>>> $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
>>>> }
>>>> %}
>>>> ins_encode %{
>>>> - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
>>>> + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
>>>> + $tmp$$XMMRegister, true);
>>>> %}
>>>> ins_pipe(pipe_slow);
>>>> %}
>>>>
>>>>
>>>> *********************** END of P A T C H *******************
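>>>>
>>>> For readers who want the control flow of xmm_clear_mem without the
>>>> assembler details, here is an equivalent plain C++ sketch (illustration
>>>> only, not JVM code; as in the patch, cnt counts 8-byte qwords and base
>>>> is qword aligned):
>>>>
>>>> #include <cstdint>
>>>>
>>>> // Mirrors the branch structure above: a 64-byte main loop (the two
>>>> // 32-byte vmovdqu stores), one optional 32-byte chunk, then an
>>>> // 8-byte short loop for the 0..3 remaining qwords.
>>>> static void clear_qwords_sketch(uint64_t* base, long cnt) {
>>>>   while (cnt >= 8) {          // L_loop / L_zero_64_bytes
>>>>     for (int i = 0; i < 8; i++) base[i] = 0;
>>>>     base += 8;
>>>>     cnt  -= 8;
>>>>   }
>>>>   if (cnt >= 4) {             // trailing 32 bytes
>>>>     for (int i = 0; i < 4; i++) base[i] = 0;
>>>>     base += 4;
>>>>     cnt  -= 4;
>>>>   }
>>>>   while (cnt > 0) {           // L_sloop: one movq per qword
>>>>     *base++ = 0;
>>>>     cnt--;
>>>>   }
>>>> }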
>>>>
>>>>
>>>> Generated assembly code after change:
>>>> ------------------------------------------------------
>>>> 0x00002b771c0016e4: mov %rdx,%rdi
>>>> 0x00002b771c0016e7: add $0x10,%rdi
>>>> 0x00002b771c0016eb: mov $0x14,%ecx
>>>> 0x00002b771c0016f0: mov %rdi,%rax
>>>> 0x00002b771c0016f3: vpxor %ymm0,%ymm0,%ymm0
>>>> 0x00002b771c0016f7: jmpq 0x00002b771c001709
>>>> 0x00002b771c0016fc: vmovdqu %ymm0,(%rax)
>>>> 0x00002b771c001700: vmovdqu %ymm0,0x20(%rax)
>>>> 0x00002b771c001705: add $0x40,%rax
>>>> 0x00002b771c001709: sub $0x8,%rcx
>>>> 0x00002b771c00170d: jge 0x00002b771c0016fc
>>>> 0x00002b771c00170f: add $0x4,%rcx
>>>> 0x00002b771c001713: jl 0x00002b771c001721
>>>> 0x00002b771c001715: vmovdqu %ymm0,(%rax)
>>>> 0x00002b771c001719: add $0x20,%rax
>>>> 0x00002b771c00171d: sub $0x4,%rcx
>>>> 0x00002b771c001721: add $0x4,%rcx
>>>> 0x00002b771c001725: jle 0x00002b771c001737
>>>> 0x00002b771c001727: dec %rcx
>>>> 0x00002b771c00172a: vmovq %xmm0,(%rax)
>>>> 0x00002b771c00172e: add $0x8,%rax
>>>> 0x00002b771c001732: dec %rcx
>>>> 0x00002b771c001735: jge 0x00002b771c00172a
>>>> 0x00002b771c001737:
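>>>>
>>>> (For anyone reproducing this: the disassembly above was captured with
>>>> the new flag enabled. An invocation along these lines should work,
>>>> assuming the hsdis plugin is installed for PrintAssembly; the jar name
>>>> is a placeholder:
>>>>
>>>>   java -XX:+UseXMMForObjInit -XX:+UnlockDiagnosticVMOptions \
>>>>        -XX:+PrintAssembly -jar benchmarks.jar
>>>> )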
>>>>
>>>>
>>>> I have done regression testing (changeset:
>>>> 50250:04f9bb270ab8/24May2018) on 32-bit as well as 64-bit builds and
>>>> didn't find any regressions.
>>>> $make run-test TEST="tier1 tier2" JTREG="JOBS=1"
>>>> CONF=linux-x86_64-normal-server-release
>>>>
>>>> Please let me know your comments.
>>>>
>>>> Regards,
>>>> Rohit
>>>>
>>>>
>>>>
>>>> On Tue, Apr 24, 2018 at 12:33 AM, Vladimir Kozlov
>>>> <vladimir.kozlov at oracle.com> wrote:
>>>>> Sorry for the delay.
>>>>>
>>>>> In general you can't use arbitrary registers without letting the JIT
>>>>> compilers know that you are using them. It will definitely cause problems.
>>>>> You need to pass the register as an additional XMMRegister argument and
>>>>> describe it as TEMP in the .ad files.
>>>>>
>>>>> See byte_array_inflate() as an example.
>>>>>
>>>>>
>>>>> On 4/11/18 7:25 PM, Rohit Arul Raj wrote:
>>>>>>>>
>>>>>>>> When I use XMM0 as a temporary register, the micro-benchmark crashes.
>>>>>>>> Saving and restoring the XMM0 register before and after use works
>>>>>>>> fine.
>>>>>>>>
>>>>>>>> Looking at the "hotspot/src/cpu/x86/vm/x86.ad" file, XMM0 as with
>>>>>>>> other XMM registers has been mentioned as Save-On-Call registers and
>>>>>>>> on Linux ABI, no register is preserved across function calls though
>>>>>>>> XMM0-XMM7 might hold parameters. So I assumed using XMM0 without
>>>>>>>> saving/restoring should be fine.
>>>>>>>>
>>>>>>>> Is it incorrect use XMM* registers without saving/restoring them?
>>>>>>>> Using XMM10 register as temporary register works fine without having
>>>>>>>> to save and restore it.
>>>>>>
>>>>>>
>>>>>> Any comments/suggestions on the usage of XMM* registers?
>>>>>>
>>>>>> Thanks,
>>>>>> Rohit
>>>>>>
>>>>>> On Thu, Apr 5, 2018 at 11:38 PM, Vladimir Kozlov
>>>>>> <vladimir.kozlov at oracle.com> wrote:
>>>>>>>
>>>>>>> Good suggestion, Rohit
>>>>>>>
>>>>>>> I created a new RFE. Please add your suggestion and performance data there:
>>>>>>>
>>>>>>> https://bugs.openjdk.java.net/browse/JDK-8201193
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Vladimir
>>>>>>>
>>>>>>>
>>>>>>> On 4/5/18 12:19 AM, Rohit Arul Raj wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> Hi All,
>>>>>>>>
>>>>>>>> I was going through the C2 object initialization (zeroing) code based
>>>>>>>> on the below bug entry:
>>>>>>>> https://bugs.openjdk.java.net/browse/JDK-8146801
>>>>>>>>
>>>>>>>> Right now, for longer lengths we use "rep stos" instructions on x86. I
>>>>>>>> was experimenting with using XMM/YMM registers (on an AMD EPYC processor)
>>>>>>>> and found that they do improve performance for certain lengths:
>>>>>>>>
>>>>>>>> For lengths > 64 bytes up to 512 bytes: the improvement is in the
>>>>>>>> range of 8% to 44%.
>>>>>>>> For lengths > 512 bytes: some lengths show a slight improvement in
>>>>>>>> the range of 2% to 7%; others are almost the same as the "rep stos"
>>>>>>>> numbers.
>>>>>>>>
>>>>>>>> I have attached the complete performance data (data.txt) for reference.
>>>>>>>> Can we add this as a user option similar to UseXMMForArrayCopy?
>>>>>>>>
>>>>>>>> I have used the same test case as in
>>>>>>>> (http://cr.openjdk.java.net/~shade/8146801/benchmarks.jar) with
>>>>>>>> additional sizes.
>>>>>>>>
>>>>>>>> Initial Patch:
>>>>>>>> I haven't added the check for 32-bit mode as I need some help with the
>>>>>>>> code (description given below the patch).
>>>>>>>> The code is similar to the one used in array copy stubs
>>>>>>>> (copy_bytes_forward).
>>>>>>>>
>>>>>>>> diff --git a/src/hotspot/cpu/x86/globals_x86.hpp b/src/hotspot/cpu/x86/globals_x86.hpp
>>>>>>>> --- a/src/hotspot/cpu/x86/globals_x86.hpp
>>>>>>>> +++ b/src/hotspot/cpu/x86/globals_x86.hpp
>>>>>>>> @@ -150,6 +150,9 @@
>>>>>>>>    product(bool, UseUnalignedLoadStores, false,                         \
>>>>>>>>            "Use SSE2 MOVDQU instruction for Arraycopy")                 \
>>>>>>>>                                                                         \
>>>>>>>> +  product(bool, UseXMMForObjInit, false,                               \
>>>>>>>> +          "Use XMM/YMM MOVDQU instruction for Object Initialization")  \
>>>>>>>> +                                                                       \
>>>>>>>>    product(bool, UseFastStosb, false,                                   \
>>>>>>>>            "Use fast-string operation for zeroing: rep stosb")          \
>>>>>>>>                                                                         \
>>>>>>>> diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>>>>>> --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>>>>>> +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>>>>>> @@ -7106,6 +7106,56 @@
>>>>>>>> if (UseFastStosb) {
>>>>>>>> shlptr(cnt, 3); // convert to number of bytes
>>>>>>>> rep_stosb();
>>>>>>>> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
>>>>>>>> + Label L_loop, L_sloop, L_check, L_tail, L_end;
>>>>>>>> + push(base);
>>>>>>>> + if (UseAVX >= 2)
>>>>>>>> + vpxor(xmm10, xmm10, xmm10, AVX_256bit);
>>>>>>>> + else
>>>>>>>> + vpxor(xmm10, xmm10, xmm10, AVX_128bit);
>>>>>>>> +
>>>>>>>> + jmp(L_check);
>>>>>>>> +
>>>>>>>> + BIND(L_loop);
>>>>>>>> + if (UseAVX >= 2) {
>>>>>>>> + vmovdqu(Address(base, 0), xmm10);
>>>>>>>> + vmovdqu(Address(base, 32), xmm10);
>>>>>>>> + } else {
>>>>>>>> + movdqu(Address(base, 0), xmm10);
>>>>>>>> + movdqu(Address(base, 16), xmm10);
>>>>>>>> + movdqu(Address(base, 32), xmm10);
>>>>>>>> + movdqu(Address(base, 48), xmm10);
>>>>>>>> + }
>>>>>>>> + addptr(base, 64);
>>>>>>>> +
>>>>>>>> + BIND(L_check);
>>>>>>>> + subptr(cnt, 8);
>>>>>>>> + jccb(Assembler::greaterEqual, L_loop);
>>>>>>>> + addptr(cnt, 4);
>>>>>>>> + jccb(Assembler::less, L_tail);
>>>>>>>> + // Copy trailing 32 bytes
>>>>>>>> + if (UseAVX >= 2) {
>>>>>>>> + vmovdqu(Address(base, 0), xmm10);
>>>>>>>> + } else {
>>>>>>>> + movdqu(Address(base, 0), xmm10);
>>>>>>>> + movdqu(Address(base, 16), xmm10);
>>>>>>>> + }
>>>>>>>> + addptr(base, 32);
>>>>>>>> + subptr(cnt, 4);
>>>>>>>> +
>>>>>>>> + BIND(L_tail);
>>>>>>>> + addptr(cnt, 4);
>>>>>>>> + jccb(Assembler::lessEqual, L_end);
>>>>>>>> + decrement(cnt);
>>>>>>>> +
>>>>>>>> + BIND(L_sloop);
>>>>>>>> + movptr(Address(base, 0), tmp);
>>>>>>>> + addptr(base, 8);
>>>>>>>> + decrement(cnt);
>>>>>>>> + jccb(Assembler::greaterEqual, L_sloop);
>>>>>>>> +
>>>>>>>> + BIND(L_end);
>>>>>>>> + pop(base);
>>>>>>>> } else {
>>>>>>>>    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
>>>>>>>> rep_stos();
>>>>>>>>
>>>>>>>>
>>>>>>>> When I use XMM0 as a temporary register, the micro-benchmark crashes.
>>>>>>>> Saving and restoring the XMM0 register before and after use works
>>>>>>>> fine.
>>>>>>>>
>>>>>>>> Looking at the "hotspot/src/cpu/x86/vm/x86.ad" file, XMM0 as with
>>>>>>>> other XMM registers has been mentioned as Save-On-Call registers and
>>>>>>>> on Linux ABI, no register is preserved across function calls though
>>>>>>>> XMM0-XMM7 might hold parameters. So I assumed using XMM0 without
>>>>>>>> saving/restoring should be fine.
>>>>>>>>
>>>>>>>> Is it incorrect use XMM* registers without saving/restoring them?
>>>>>>>> Using XMM10 register as temporary register works fine without having
>>>>>>>> to save and restore it.
>>>>>>>>
>>>>>>>> Please let me know your comments.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Rohit
>>>>>>>>
>>>>>>>
>>>>>