RFC: C2 Object Initialization - Using XMM/YMM registers
Vladimir Kozlov
vladimir.kozlov at oracle.com
Wed Jun 6 18:04:15 UTC 2018
Thank you, Rohit
This change looks reasonable. Let me test it.
Thanks,
Vladimir
On 5/30/18 9:55 PM, Rohit Arul Raj wrote:
> Thanks Vladimir,
>
> I made the changes as you had suggested and it works now.
> Please find attached the updated patch, relevant test case as well as
> the micro-benchmark performance data.
> Sorry for the delay.
>
> **************** P A T C H **************
>
> diff --git a/src/hotspot/cpu/x86/globals_x86.hpp
> b/src/hotspot/cpu/x86/globals_x86.hpp
> --- a/src/hotspot/cpu/x86/globals_x86.hpp
> +++ b/src/hotspot/cpu/x86/globals_x86.hpp
> @@ -150,6 +150,9 @@
> product(bool, UseUnalignedLoadStores, false, \
> "Use SSE2 MOVDQU instruction for Arraycopy") \
> \
> + product(bool, UseXMMForObjInit, false, \
> + "Use XMM/YMM MOVDQU instruction for Object Initialization") \
> + \
> product(bool, UseFastStosb, false, \
> "Use fast-string operation for zeroing: rep stosb") \
> \
> diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
> b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
> --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
> +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
> @@ -6775,7 +6775,58 @@
>
> }
>
> -void MacroAssembler::clear_mem(Register base, Register cnt, Register
> tmp, bool is_large) {
> +// clear memory of size 'cnt' qwords, starting at 'base' using
> XMM/YMM registers
> +void MacroAssembler::xmm_clear_mem(Register base, Register cnt,
> XMMRegister xtmp) {
> + // cnt - number of qwords (8-byte words).
> + // base - start address, qword aligned.
> + Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
> + if (UseAVX >= 2)
> + vpxor(xtmp, xtmp, xtmp, AVX_256bit);
> + else
> + vpxor(xtmp, xtmp, xtmp, AVX_128bit);
> + jmp(L_zero_64_bytes);
> +
> + BIND(L_loop);
> + if (UseAVX >= 2) {
> + vmovdqu(Address(base, 0), xtmp);
> + vmovdqu(Address(base, 32), xtmp);
> + } else {
> + movdqu(Address(base, 0), xtmp);
> + movdqu(Address(base, 16), xtmp);
> + movdqu(Address(base, 32), xtmp);
> + movdqu(Address(base, 48), xtmp);
> + }
> + addptr(base, 64);
> +
> + BIND(L_zero_64_bytes);
> + subptr(cnt, 8);
> + jccb(Assembler::greaterEqual, L_loop);
> + addptr(cnt, 4);
> + jccb(Assembler::less, L_tail);
> + // Copy trailing 32 bytes
> + if (UseAVX >= 2) {
> + vmovdqu(Address(base, 0), xtmp);
> + } else {
> + movdqu(Address(base, 0), xtmp);
> + movdqu(Address(base, 16), xtmp);
> + }
> + addptr(base, 32);
> + subptr(cnt, 4);
> +
> + BIND(L_tail);
> + addptr(cnt, 4);
> + jccb(Assembler::lessEqual, L_end);
> + decrement(cnt);
> +
> + BIND(L_sloop);
> + movq(Address(base, 0), xtmp);
> + addptr(base, 8);
> + decrement(cnt);
> + jccb(Assembler::greaterEqual, L_sloop);
> + BIND(L_end);
> +}
> +
> +void MacroAssembler::clear_mem(Register base, Register cnt, Register
> tmp, XMMRegister xtmp, bool is_large) {
> // cnt - number of qwords (8-byte words).
> // base - start address, qword aligned.
> // is_large - if optimizers know cnt is larger than InitArrayShortSize
> @@ -6787,7 +6838,9 @@
>
> Label DONE;
>
> - xorptr(tmp, tmp);
> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
> + xorptr(tmp, tmp);
> + }
>
> if (!is_large) {
> Label LOOP, LONG;
> @@ -6813,6 +6866,9 @@
> if (UseFastStosb) {
> shlptr(cnt, 3); // convert to number of bytes
> rep_stosb();
> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
> + movptr(tmp, base);
> + xmm_clear_mem(tmp, cnt, xtmp);
> } else {
> NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words
> for 32-bit VM
> rep_stos();
> diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
> b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
> --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
> +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
> @@ -1578,7 +1578,10 @@
>
> // clear memory of size 'cnt' qwords, starting at 'base';
> // if 'is_large' is set, do not try to produce short loop
> - void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
> + void clear_mem(Register base, Register cnt, Register rtmp,
> XMMRegister xtmp, bool is_large);
> +
> + // clear memory of size 'cnt' qwords, starting at 'base' using
> XMM/YMM registers
> + void xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp);
>
> #ifdef COMPILER2
> void string_indexof_char(Register str1, Register cnt1, Register ch,
> Register result,
> diff --git a/src/hotspot/cpu/x86/x86_32.ad b/src/hotspot/cpu/x86/x86_32.ad
> --- a/src/hotspot/cpu/x86/x86_32.ad
> +++ b/src/hotspot/cpu/x86/x86_32.ad
> @@ -11482,13 +11482,15 @@
>
> // =======================================================================
> // fast clearing of an array
> -instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe
> dummy, eFlagsReg cr) %{
> +instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero,
> Universe dummy, eFlagsReg cr) %{
> predicate(!((ClearArrayNode*)n)->is_large());
> match(Set dummy (ClearArray cnt base));
> - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
> + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
>
> format %{ $$template
> - $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
> + $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
> + }
> $$emit$$"CMP InitArrayShortSize,rcx\n\t"
> $$emit$$"JG LARGE\n\t"
> $$emit$$"SHL ECX, 1\n\t"
> @@ -11502,6 +11504,32 @@
> if (UseFastStosb) {
> $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
> $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
> + $$emit$$"MOV RDI,RAX\n\t"
> + $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t"
> + $$emit$$"JMPQ L_zero_64_bytes\n\t"
> + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
> + $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
> + $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
> + $$emit$$"ADD 0x40,RAX\n\t"
> + $$emit$$"# L_zero_64_bytes:\n\t"
> + $$emit$$"SUB 0x8,RCX\n\t"
> + $$emit$$"JGE L_loop\n\t"
> + $$emit$$"ADD 0x4,RCX\n\t"
> + $$emit$$"JL L_tail\n\t"
> + $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
> + $$emit$$"ADD 0x20,RAX\n\t"
> + $$emit$$"SUB 0x4,RCX\n\t"
> + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
> + $$emit$$"ADD 0x4,RCX\n\t"
> + $$emit$$"JLE L_end\n\t"
> + $$emit$$"DEC RCX\n\t"
> + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
> + $$emit$$"VMOVQ XMM0,(RAX)\n\t"
> + $$emit$$"ADD 0x8,RAX\n\t"
> + $$emit$$"DEC RCX\n\t"
> + $$emit$$"JGE L_sloop\n\t"
> + $$emit$$"# L_end:\n\t"
> } else {
> $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
> $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
> @@ -11509,20 +11537,49 @@
> $$emit$$"# DONE"
> %}
> ins_encode %{
> - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
> - %}
> - ins_pipe( pipe_slow );
> -%}
> -
> -instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero,
> Universe dummy, eFlagsReg cr) %{
> + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
> + $tmp$$XMMRegister, false);
> + %}
> + ins_pipe( pipe_slow );
> +%}
> +
> +instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI
> zero, Universe dummy, eFlagsReg cr) %{
> predicate(((ClearArrayNode*)n)->is_large());
> match(Set dummy (ClearArray cnt base));
> - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
> + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
> format %{ $$template
> - $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
> + $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
> + }
> if (UseFastStosb) {
> $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
> $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
> + $$emit$$"MOV RDI,RAX\n\t"
> + $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t"
> + $$emit$$"JMPQ L_zero_64_bytes\n\t"
> + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
> + $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
> + $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
> + $$emit$$"ADD 0x40,RAX\n\t"
> + $$emit$$"# L_zero_64_bytes:\n\t"
> + $$emit$$"SUB 0x8,RCX\n\t"
> + $$emit$$"JGE L_loop\n\t"
> + $$emit$$"ADD 0x4,RCX\n\t"
> + $$emit$$"JL L_tail\n\t"
> + $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
> + $$emit$$"ADD 0x20,RAX\n\t"
> + $$emit$$"SUB 0x4,RCX\n\t"
> + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
> + $$emit$$"ADD 0x4,RCX\n\t"
> + $$emit$$"JLE L_end\n\t"
> + $$emit$$"DEC RCX\n\t"
> + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
> + $$emit$$"VMOVQ XMM0,(RAX)\n\t"
> + $$emit$$"ADD 0x8,RAX\n\t"
> + $$emit$$"DEC RCX\n\t"
> + $$emit$$"JGE L_sloop\n\t"
> + $$emit$$"# L_end:\n\t"
> } else {
> $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
> $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
> @@ -11530,7 +11587,8 @@
> $$emit$$"# DONE"
> %}
> ins_encode %{
> - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
> + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
> + $tmp$$XMMRegister, true);
> %}
> ins_pipe( pipe_slow );
> %}
> diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
> --- a/src/hotspot/cpu/x86/x86_64.ad
> +++ b/src/hotspot/cpu/x86/x86_64.ad
> @@ -10625,15 +10625,17 @@
>
> // =======================================================================
> // fast clearing of an array
> -instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
> - rFlagsReg cr)
> +instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
> + Universe dummy, rFlagsReg cr)
> %{
> predicate(!((ClearArrayNode*)n)->is_large());
> match(Set dummy (ClearArray cnt base));
> - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
> + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
>
> format %{ $$template
> - $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
> + $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
> + }
> $$emit$$"cmp InitArrayShortSize,rcx\n\t"
> $$emit$$"jg LARGE\n\t"
> $$emit$$"dec rcx\n\t"
> @@ -10646,35 +10648,91 @@
> if (UseFastStosb) {
> $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
> $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
> + $$emit$$"mov rdi,rax\n\t"
> + $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
> + $$emit$$"jmpq L_zero_64_bytes\n\t"
> + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
> + $$emit$$"vmovdqu ymm0,(rax)\n\t"
> + $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
> + $$emit$$"add 0x40,rax\n\t"
> + $$emit$$"# L_zero_64_bytes:\n\t"
> + $$emit$$"sub 0x8,rcx\n\t"
> + $$emit$$"jge L_loop\n\t"
> + $$emit$$"add 0x4,rcx\n\t"
> + $$emit$$"jl L_tail\n\t"
> + $$emit$$"vmovdqu ymm0,(rax)\n\t"
> + $$emit$$"add 0x20,rax\n\t"
> + $$emit$$"sub 0x4,rcx\n\t"
> + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
> + $$emit$$"add 0x4,rcx\n\t"
> + $$emit$$"jle L_end\n\t"
> + $$emit$$"dec rcx\n\t"
> + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
> + $$emit$$"vmovq xmm0,(rax)\n\t"
> + $$emit$$"add 0x8,rax\n\t"
> + $$emit$$"dec rcx\n\t"
> + $$emit$$"jge L_sloop\n\t"
> + $$emit$$"# L_end:\n\t"
> } else {
> $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
> }
> $$emit$$"# DONE"
> %}
> ins_encode %{
> - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
> + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
> + $tmp$$XMMRegister, false);
> %}
> ins_pipe(pipe_slow);
> %}
>
> -instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero,
> Universe dummy,
> - rFlagsReg cr)
> +instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
> + Universe dummy, rFlagsReg cr)
> %{
> predicate(((ClearArrayNode*)n)->is_large());
> match(Set dummy (ClearArray cnt base));
> - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
> + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
>
> format %{ $$template
> - $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
> + if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
> + $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
> + }
> if (UseFastStosb) {
> $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
> $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
> + $$emit$$"mov rdi,rax\n\t"
> + $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
> + $$emit$$"jmpq L_zero_64_bytes\n\t"
> + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
> + $$emit$$"vmovdqu ymm0,(rax)\n\t"
> + $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
> + $$emit$$"add 0x40,rax\n\t"
> + $$emit$$"# L_zero_64_bytes:\n\t"
> + $$emit$$"sub 0x8,rcx\n\t"
> + $$emit$$"jge L_loop\n\t"
> + $$emit$$"add 0x4,rcx\n\t"
> + $$emit$$"jl L_tail\n\t"
> + $$emit$$"vmovdqu ymm0,(rax)\n\t"
> + $$emit$$"add 0x20,rax\n\t"
> + $$emit$$"sub 0x4,rcx\n\t"
> + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
> + $$emit$$"add 0x4,rcx\n\t"
> + $$emit$$"jle L_end\n\t"
> + $$emit$$"dec rcx\n\t"
> + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
> + $$emit$$"vmovq xmm0,(rax)\n\t"
> + $$emit$$"add 0x8,rax\n\t"
> + $$emit$$"dec rcx\n\t"
> + $$emit$$"jge L_sloop\n\t"
> + $$emit$$"# L_end:\n\t"
> } else {
> $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
> }
> %}
> ins_encode %{
> - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
> + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
> + $tmp$$XMMRegister, true);
> %}
> ins_pipe(pipe_slow);
> %}
>
>
> *********************** END of P A T C H *******************
>
>
> Generated assembly code after change:
> ------------------------------------------------------
> 0x00002b771c0016e4: mov %rdx,%rdi
> 0x00002b771c0016e7: add $0x10,%rdi
> 0x00002b771c0016eb: mov $0x14,%ecx
> 0x00002b771c0016f0: mov %rdi,%rax
> 0x00002b771c0016f3: vpxor %ymm0,%ymm0,%ymm0
> 0x00002b771c0016f7: jmpq 0x00002b771c001709
> 0x00002b771c0016fc: vmovdqu %ymm0,(%rax)
> 0x00002b771c001700: vmovdqu %ymm0,0x20(%rax)
> 0x00002b771c001705: add $0x40,%rax
> 0x00002b771c001709: sub $0x8,%rcx
> 0x00002b771c00170d: jge 0x00002b771c0016fc
> 0x00002b771c00170f: add $0x4,%rcx
> 0x00002b771c001713: jl 0x00002b771c001721
> 0x00002b771c001715: vmovdqu %ymm0,(%rax)
> 0x00002b771c001719: add $0x20,%rax
> 0x00002b771c00171d: sub $0x4,%rcx
> 0x00002b771c001721: add $0x4,%rcx
> 0x00002b771c001725: jle 0x00002b771c001737
> 0x00002b771c001727: dec %rcx
> 0x00002b771c00172a: vmovq %xmm0,(%rax)
> 0x00002b771c00172e: add $0x8,%rax
> 0x00002b771c001732: dec %rcx
> 0x00002b771c001735: jge 0x00002b771c00172a
> 0x00002b771c001737:
>
>
> I have done regression testing (changeset:
> 50250:04f9bb270ab8/24May2018) on 32-bit as well as 64-bit builds and
> didn't find any regressions.
> $make run-test TEST="tier1 tier2" JTREG="JOBS=1"
> CONF=linux-x86_64-normal-server-release
>
> Please let me know your comments.
>
> Regards,
> Rohit
>
>
>
> On Tue, Apr 24, 2018 at 12:33 AM, Vladimir Kozlov
> <vladimir.kozlov at oracle.com> wrote:
>> Sorry for the delay.
>>
>> In general you can't use arbitrary registers without letting the JIT
>> compilers know that you use them. It will definitely cause problems.
>> You need to pass it as an additional XMMRegister argument and describe it as
>> TEMP in the .ad files.
>>
>> See byte_array_inflate() as example.
>>
>>
>> On 4/11/18 7:25 PM, Rohit Arul Raj wrote:
>>>>>
>>>>> When I use XMM0 as a temporary register, the micro-benchmark crashes.
>>>>> Saving and Restoring the XMM0 register before and after use works
>>>>> fine.
>>>>>
>>>>> Looking at the "hotspot/src/cpu/x86/vm/x86.ad" file, XMM0, like the
>>>>> other XMM registers, is listed as a Save-On-Call register, and
>>>>> under the Linux ABI no register is preserved across function calls, though
>>>>> XMM0-XMM7 might hold parameters. So I assumed using XMM0 without
>>>>> saving/restoring should be fine.
>>>>>
>>>>> Is it incorrect to use XMM* registers without saving/restoring them?
>>>>> Using XMM10 register as temporary register works fine without having
>>>>> to save and restore it.
>>>
>>>
>>> Any comments/suggestions on the usage of XMM* registers?
>>>
>>> Thanks,
>>> Rohit
>>>
>>> On Thu, Apr 5, 2018 at 11:38 PM, Vladimir Kozlov
>>> <vladimir.kozlov at oracle.com> wrote:
>>>>
>>>> Good suggestion, Rohit
>>>>
>>>> I created a new RFE. Please add your suggestion and performance data there:
>>>>
>>>> https://bugs.openjdk.java.net/browse/JDK-8201193
>>>>
>>>> Thanks,
>>>> Vladimir
>>>>
>>>>
>>>> On 4/5/18 12:19 AM, Rohit Arul Raj wrote:
>>>>>
>>>>>
>>>>> Hi All,
>>>>>
>>>>> I was going through the C2 object initialization (zeroing) code based
>>>>> on the below bug entry:
>>>>> https://bugs.openjdk.java.net/browse/JDK-8146801
>>>>>
>>>>> Right now, for longer lengths we use "rep stos" instructions on x86. I
>>>>> was experimenting with using XMM/YMM registers (on AMD EPYC processor)
>>>>> and found that they do improve performance for certain lengths:
>>>>>
>>>>> For lengths > 64 bytes and <= 512 bytes: improvement is in the range
>>>>> of 8% to 44%.
>>>>> For lengths > 512 bytes: some lengths show a slight improvement in the
>>>>> range of 2% to 7%; others are almost the same as the "rep stos"
>>>>> numbers.
>>>>>
>>>>> I have attached the complete performance data (data.txt) for reference.
>>>>> Can we add this as a user option similar to UseXMMForArrayCopy?
>>>>>
>>>>> I have used the same test case as in
>>>>> (http://cr.openjdk.java.net/~shade/8146801/benchmarks.jar) with
>>>>> additional sizes.
>>>>>
>>>>> Initial Patch:
>>>>> I haven't added the check for 32-bit mode as I need some help with the
>>>>> code (description given below the patch).
>>>>> The code is similar to the one used in array copy stubs
>>>>> (copy_bytes_forward).
>>>>>
>>>>> diff --git a/src/hotspot/cpu/x86/globals_x86.hpp
>>>>> b/src/hotspot/cpu/x86/globals_x86.hpp
>>>>> --- a/src/hotspot/cpu/x86/globals_x86.hpp
>>>>> +++ b/src/hotspot/cpu/x86/globals_x86.hpp
>>>>> @@ -150,6 +150,9 @@
>>>>> product(bool, UseUnalignedLoadStores, false,
>>>>> \
>>>>> "Use SSE2 MOVDQU instruction for Arraycopy")
>>>>> \
>>>>>
>>>>> \
>>>>> + product(bool, UseXMMForObjInit, false,
>>>>> \
>>>>> + "Use XMM/YMM MOVDQU instruction for Object Initialization")
>>>>> \
>>>>> +
>>>>> \
>>>>> product(bool, UseFastStosb, false,
>>>>> \
>>>>> "Use fast-string operation for zeroing: rep stosb")
>>>>> \
>>>>>
>>>>> \
>>>>> diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>>> b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>>> --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>>> +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>>> @@ -7106,6 +7106,56 @@
>>>>> if (UseFastStosb) {
>>>>> shlptr(cnt, 3); // convert to number of bytes
>>>>> rep_stosb();
>>>>> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
>>>>> + Label L_loop, L_sloop, L_check, L_tail, L_end;
>>>>> + push(base);
>>>>> + if (UseAVX >= 2)
>>>>> + vpxor(xmm10, xmm10, xmm10, AVX_256bit);
>>>>> + else
>>>>> + vpxor(xmm10, xmm10, xmm10, AVX_128bit);
>>>>> +
>>>>> + jmp(L_check);
>>>>> +
>>>>> + BIND(L_loop);
>>>>> + if (UseAVX >= 2) {
>>>>> + vmovdqu(Address(base, 0), xmm10);
>>>>> + vmovdqu(Address(base, 32), xmm10);
>>>>> + } else {
>>>>> + movdqu(Address(base, 0), xmm10);
>>>>> + movdqu(Address(base, 16), xmm10);
>>>>> + movdqu(Address(base, 32), xmm10);
>>>>> + movdqu(Address(base, 48), xmm10);
>>>>> + }
>>>>> + addptr(base, 64);
>>>>> +
>>>>> + BIND(L_check);
>>>>> + subptr(cnt, 8);
>>>>> + jccb(Assembler::greaterEqual, L_loop);
>>>>> + addptr(cnt, 4);
>>>>> + jccb(Assembler::less, L_tail);
>>>>> + // Copy trailing 32 bytes
>>>>> + if (UseAVX >= 2) {
>>>>> + vmovdqu(Address(base, 0), xmm10);
>>>>> + } else {
>>>>> + movdqu(Address(base, 0), xmm10);
>>>>> + movdqu(Address(base, 16), xmm10);
>>>>> + }
>>>>> + addptr(base, 32);
>>>>> + subptr(cnt, 4);
>>>>> +
>>>>> + BIND(L_tail);
>>>>> + addptr(cnt, 4);
>>>>> + jccb(Assembler::lessEqual, L_end);
>>>>> + decrement(cnt);
>>>>> +
>>>>> + BIND(L_sloop);
>>>>> + movptr(Address(base, 0), tmp);
>>>>> + addptr(base, 8);
>>>>> + decrement(cnt);
>>>>> + jccb(Assembler::greaterEqual, L_sloop);
>>>>> +
>>>>> + BIND(L_end);
>>>>> + pop(base);
>>>>> } else {
>>>>> NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words
>>>>> for 32-bit VM
>>>>> rep_stos();
>>>>>
>>>>>
>>>>> When I use XMM0 as a temporary register, the micro-benchmark crashes.
>>>>> Saving and Restoring the XMM0 register before and after use works
>>>>> fine.
>>>>>
>>>>> Looking at the "hotspot/src/cpu/x86/vm/x86.ad" file, XMM0, like the
>>>>> other XMM registers, is listed as a Save-On-Call register, and
>>>>> under the Linux ABI no register is preserved across function calls, though
>>>>> XMM0-XMM7 might hold parameters. So I assumed using XMM0 without
>>>>> saving/restoring should be fine.
>>>>>
>>>>> Is it incorrect to use XMM* registers without saving/restoring them?
>>>>> Using XMM10 register as temporary register works fine without having
>>>>> to save and restore it.
>>>>>
>>>>> Please let me know your comments.
>>>>>
>>>>> Regards,
>>>>> Rohit
>>>>>
>>>>
>>
More information about the hotspot-dev
mailing list