RFC: C2 Object Initialization - Using XMM/YMM registers
Rohit Arul Raj
rohitarulraj at gmail.com
Thu May 31 04:55:51 UTC 2018
Thanks Vladimir,
I made the changes as you had suggested and it works now.
Please find attached the updated patch, relevant test case as well as
the micro-benchmark performance data.
Sorry for the delay.
**************** P A T C H **************
diff --git a/src/hotspot/cpu/x86/globals_x86.hpp
b/src/hotspot/cpu/x86/globals_x86.hpp
--- a/src/hotspot/cpu/x86/globals_x86.hpp
+++ b/src/hotspot/cpu/x86/globals_x86.hpp
@@ -150,6 +150,9 @@
product(bool, UseUnalignedLoadStores, false, \
"Use SSE2 MOVDQU instruction for Arraycopy") \
\
+ product(bool, UseXMMForObjInit, false, \
+ "Use XMM/YMM MOVDQU instruction for Object Initialization") \
+ \
product(bool, UseFastStosb, false, \
"Use fast-string operation for zeroing: rep stosb") \
\
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@@ -6775,7 +6775,58 @@
}
-void MacroAssembler::clear_mem(Register base, Register cnt, Register
tmp, bool is_large) {
+// clear memory of size 'cnt' qwords, starting at 'base' using
XMM/YMM registers
+void MacroAssembler::xmm_clear_mem(Register base, Register cnt,
XMMRegister xtmp) {
+ // cnt - number of qwords (8-byte words).
+ // base - start address, qword aligned.
+ Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
+ if (UseAVX >= 2)
+ vpxor(xtmp, xtmp, xtmp, AVX_256bit);
+ else
+ vpxor(xtmp, xtmp, xtmp, AVX_128bit);
+ jmp(L_zero_64_bytes);
+
+ BIND(L_loop);
+ if (UseAVX >= 2) {
+ vmovdqu(Address(base, 0), xtmp);
+ vmovdqu(Address(base, 32), xtmp);
+ } else {
+ movdqu(Address(base, 0), xtmp);
+ movdqu(Address(base, 16), xtmp);
+ movdqu(Address(base, 32), xtmp);
+ movdqu(Address(base, 48), xtmp);
+ }
+ addptr(base, 64);
+
+ BIND(L_zero_64_bytes);
+ subptr(cnt, 8);
+ jccb(Assembler::greaterEqual, L_loop);
+ addptr(cnt, 4);
+ jccb(Assembler::less, L_tail);
+ // Copy trailing 32 bytes
+ if (UseAVX >= 2) {
+ vmovdqu(Address(base, 0), xtmp);
+ } else {
+ movdqu(Address(base, 0), xtmp);
+ movdqu(Address(base, 16), xtmp);
+ }
+ addptr(base, 32);
+ subptr(cnt, 4);
+
+ BIND(L_tail);
+ addptr(cnt, 4);
+ jccb(Assembler::lessEqual, L_end);
+ decrement(cnt);
+
+ BIND(L_sloop);
+ movq(Address(base, 0), xtmp);
+ addptr(base, 8);
+ decrement(cnt);
+ jccb(Assembler::greaterEqual, L_sloop);
+ BIND(L_end);
+}
+
+void MacroAssembler::clear_mem(Register base, Register cnt, Register
tmp, XMMRegister xtmp, bool is_large) {
// cnt - number of qwords (8-byte words).
// base - start address, qword aligned.
// is_large - if optimizers know cnt is larger than InitArrayShortSize
@@ -6787,7 +6838,9 @@
Label DONE;
- xorptr(tmp, tmp);
+ if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+ xorptr(tmp, tmp);
+ }
if (!is_large) {
Label LOOP, LONG;
@@ -6813,6 +6866,9 @@
if (UseFastStosb) {
shlptr(cnt, 3); // convert to number of bytes
rep_stosb();
+ } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+ movptr(tmp, base);
+ xmm_clear_mem(tmp, cnt, xtmp);
} else {
NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words
for 32-bit VM
rep_stos();
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -1578,7 +1578,10 @@
// clear memory of size 'cnt' qwords, starting at 'base';
// if 'is_large' is set, do not try to produce short loop
- void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
+ void clear_mem(Register base, Register cnt, Register rtmp,
XMMRegister xtmp, bool is_large);
+
+ // clear memory of size 'cnt' qwords, starting at 'base' using
XMM/YMM registers
+ void xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp);
#ifdef COMPILER2
void string_indexof_char(Register str1, Register cnt1, Register ch,
Register result,
diff --git a/src/hotspot/cpu/x86/x86_32.ad b/src/hotspot/cpu/x86/x86_32.ad
--- a/src/hotspot/cpu/x86/x86_32.ad
+++ b/src/hotspot/cpu/x86/x86_32.ad
@@ -11482,13 +11482,15 @@
// =======================================================================
// fast clearing of an array
-instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe
dummy, eFlagsReg cr) %{
+instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero,
Universe dummy, eFlagsReg cr) %{
predicate(!((ClearArrayNode*)n)->is_large());
match(Set dummy (ClearArray cnt base));
- effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
format %{ $$template
- $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
+ if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+ $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
+ }
$$emit$$"CMP InitArrayShortSize,rcx\n\t"
$$emit$$"JG LARGE\n\t"
$$emit$$"SHL ECX, 1\n\t"
@@ -11502,6 +11504,32 @@
if (UseFastStosb) {
$$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
$$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+ } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+ $$emit$$"MOV RDI,RAX\n\t"
+ $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t"
+ $$emit$$"JMPQ L_zero_64_bytes\n\t"
+ $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+ $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+ $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
+ $$emit$$"ADD 0x40,RAX\n\t"
+ $$emit$$"# L_zero_64_bytes:\n\t"
+ $$emit$$"SUB 0x8,RCX\n\t"
+ $$emit$$"JGE L_loop\n\t"
+ $$emit$$"ADD 0x4,RCX\n\t"
+ $$emit$$"JL L_tail\n\t"
+ $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+ $$emit$$"ADD 0x20,RAX\n\t"
+ $$emit$$"SUB 0x4,RCX\n\t"
+ $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+ $$emit$$"ADD 0x4,RCX\n\t"
+ $$emit$$"JLE L_end\n\t"
+ $$emit$$"DEC RCX\n\t"
+ $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+ $$emit$$"VMOVQ XMM0,(RAX)\n\t"
+ $$emit$$"ADD 0x8,RAX\n\t"
+ $$emit$$"DEC RCX\n\t"
+ $$emit$$"JGE L_sloop\n\t"
+ $$emit$$"# L_end:\n\t"
} else {
$$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
$$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
@@ -11509,20 +11537,49 @@
$$emit$$"# DONE"
%}
ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero,
Universe dummy, eFlagsReg cr) %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+ $tmp$$XMMRegister, false);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI
zero, Universe dummy, eFlagsReg cr) %{
predicate(((ClearArrayNode*)n)->is_large());
match(Set dummy (ClearArray cnt base));
- effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
format %{ $$template
- $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
+ if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+ $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
+ }
if (UseFastStosb) {
$$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
$$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+ } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+ $$emit$$"MOV RDI,RAX\n\t"
+ $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t"
+ $$emit$$"JMPQ L_zero_64_bytes\n\t"
+ $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+ $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+ $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
+ $$emit$$"ADD 0x40,RAX\n\t"
+ $$emit$$"# L_zero_64_bytes:\n\t"
+ $$emit$$"SUB 0x8,RCX\n\t"
+ $$emit$$"JGE L_loop\n\t"
+ $$emit$$"ADD 0x4,RCX\n\t"
+ $$emit$$"JL L_tail\n\t"
+ $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+ $$emit$$"ADD 0x20,RAX\n\t"
+ $$emit$$"SUB 0x4,RCX\n\t"
+ $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+ $$emit$$"ADD 0x4,RCX\n\t"
+ $$emit$$"JLE L_end\n\t"
+ $$emit$$"DEC RCX\n\t"
+ $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+ $$emit$$"VMOVQ XMM0,(RAX)\n\t"
+ $$emit$$"ADD 0x8,RAX\n\t"
+ $$emit$$"DEC RCX\n\t"
+ $$emit$$"JGE L_sloop\n\t"
+ $$emit$$"# L_end:\n\t"
} else {
$$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
$$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
@@ -11530,7 +11587,8 @@
$$emit$$"# DONE"
%}
ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+ $tmp$$XMMRegister, true);
%}
ins_pipe( pipe_slow );
%}
diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -10625,15 +10625,17 @@
// =======================================================================
// fast clearing of an array
-instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
- rFlagsReg cr)
+instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
+ Universe dummy, rFlagsReg cr)
%{
predicate(!((ClearArrayNode*)n)->is_large());
match(Set dummy (ClearArray cnt base));
- effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
format %{ $$template
- $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
+ if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+ $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
+ }
$$emit$$"cmp InitArrayShortSize,rcx\n\t"
$$emit$$"jg LARGE\n\t"
$$emit$$"dec rcx\n\t"
@@ -10646,35 +10648,91 @@
if (UseFastStosb) {
$$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
$$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
+ } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+ $$emit$$"mov rdi,rax\n\t"
+ $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
+ $$emit$$"jmpq L_zero_64_bytes\n\t"
+ $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+ $$emit$$"vmovdqu ymm0,(rax)\n\t"
+ $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
+ $$emit$$"add 0x40,rax\n\t"
+ $$emit$$"# L_zero_64_bytes:\n\t"
+ $$emit$$"sub 0x8,rcx\n\t"
+ $$emit$$"jge L_loop\n\t"
+ $$emit$$"add 0x4,rcx\n\t"
+ $$emit$$"jl L_tail\n\t"
+ $$emit$$"vmovdqu ymm0,(rax)\n\t"
+ $$emit$$"add 0x20,rax\n\t"
+ $$emit$$"sub 0x4,rcx\n\t"
+ $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+ $$emit$$"add 0x4,rcx\n\t"
+ $$emit$$"jle L_end\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+ $$emit$$"vmovq xmm0,(rax)\n\t"
+ $$emit$$"add 0x8,rax\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"jge L_sloop\n\t"
+ $$emit$$"# L_end:\n\t"
} else {
$$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
}
$$emit$$"# DONE"
%}
ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+ $tmp$$XMMRegister, false);
%}
ins_pipe(pipe_slow);
%}
-instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero,
Universe dummy,
- rFlagsReg cr)
+instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
+ Universe dummy, rFlagsReg cr)
%{
predicate(((ClearArrayNode*)n)->is_large());
match(Set dummy (ClearArray cnt base));
- effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
format %{ $$template
- $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
+ if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+ $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
+ }
if (UseFastStosb) {
$$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
$$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
+ } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+ $$emit$$"mov rdi,rax\n\t"
+ $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
+ $$emit$$"jmpq L_zero_64_bytes\n\t"
+ $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+ $$emit$$"vmovdqu ymm0,(rax)\n\t"
+ $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
+ $$emit$$"add 0x40,rax\n\t"
+ $$emit$$"# L_zero_64_bytes:\n\t"
+ $$emit$$"sub 0x8,rcx\n\t"
+ $$emit$$"jge L_loop\n\t"
+ $$emit$$"add 0x4,rcx\n\t"
+ $$emit$$"jl L_tail\n\t"
+ $$emit$$"vmovdqu ymm0,(rax)\n\t"
+ $$emit$$"add 0x20,rax\n\t"
+ $$emit$$"sub 0x4,rcx\n\t"
+ $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+ $$emit$$"add 0x4,rcx\n\t"
+ $$emit$$"jle L_end\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+ $$emit$$"vmovq xmm0,(rax)\n\t"
+ $$emit$$"add 0x8,rax\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"jge L_sloop\n\t"
+ $$emit$$"# L_end:\n\t"
} else {
$$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
}
%}
ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+ $tmp$$XMMRegister, true);
%}
ins_pipe(pipe_slow);
%}
*********************** END of P A T C H *******************
Generated assembly code after change:
------------------------------------------------------
0x00002b771c0016e4: mov %rdx,%rdi
0x00002b771c0016e7: add $0x10,%rdi
0x00002b771c0016eb: mov $0x14,%ecx
0x00002b771c0016f0: mov %rdi,%rax
0x00002b771c0016f3: vpxor %ymm0,%ymm0,%ymm0
0x00002b771c0016f7: jmpq 0x00002b771c001709
0x00002b771c0016fc: vmovdqu %ymm0,(%rax)
0x00002b771c001700: vmovdqu %ymm0,0x20(%rax)
0x00002b771c001705: add $0x40,%rax
0x00002b771c001709: sub $0x8,%rcx
0x00002b771c00170d: jge 0x00002b771c0016fc
0x00002b771c00170f: add $0x4,%rcx
0x00002b771c001713: jl 0x00002b771c001721
0x00002b771c001715: vmovdqu %ymm0,(%rax)
0x00002b771c001719: add $0x20,%rax
0x00002b771c00171d: sub $0x4,%rcx
0x00002b771c001721: add $0x4,%rcx
0x00002b771c001725: jle 0x00002b771c001737
0x00002b771c001727: dec %rcx
0x00002b771c00172a: vmovq %xmm0,(%rax)
0x00002b771c00172e: add $0x8,%rax
0x00002b771c001732: dec %rcx
0x00002b771c001735: jge 0x00002b771c00172a
0x00002b771c001737:
I have done regression testing (changeset:
50250:04f9bb270ab8/24May2018) on 32-bit as well as 64-bit builds and
didn't find any regressions.
$make run-test TEST="tier1 tier2" JTREG="JOBS=1"
CONF=linux-x86_64-normal-server-release
Please let me know your comments.
Regards,
Rohit
On Tue, Apr 24, 2018 at 12:33 AM, Vladimir Kozlov
<vladimir.kozlov at oracle.com> wrote:
> Sorry for the delay.
>
> In general you can't use arbitrary registers without letting the JIT
> compilers know that you use them. It will definitely cause problems.
> You need to pass it as an additional XMMRegister argument and describe it as
> TEMP in the .ad files.
>
> See byte_array_inflate() as example.
>
>
> On 4/11/18 7:25 PM, Rohit Arul Raj wrote:
>>>>
>>>> When I use XMM0 as a temporary register, the micro-benchmark crashes.
>>>> Saving and Restoring the XMM0 register before and after use works
>>>> fine.
>>>>
>>>> Looking at the "hotspot/src/cpu/x86/vm/x86.ad" file, XMM0 as with
>>>> other XMM registers has been mentioned as Save-On-Call registers and
>>>> on Linux ABI, no register is preserved across function calls though
>>>> XMM0-XMM7 might hold parameters. So I assumed using XMM0 without
>>>> saving/restoring should be fine.
>>>>
>>>> Is it incorrect to use XMM* registers without saving/restoring them?
>>>> Using XMM10 register as temporary register works fine without having
>>>> to save and restore it.
>>
>>
>> Any comments/suggestions on the usage of XMM* registers?
>>
>> Thanks,
>> Rohit
>>
>> On Thu, Apr 5, 2018 at 11:38 PM, Vladimir Kozlov
>> <vladimir.kozlov at oracle.com> wrote:
>>>
>>> Good suggestion, Rohit
>>>
>>> I created a new RFE. Please add your suggestion and performance data there:
>>>
>>> https://bugs.openjdk.java.net/browse/JDK-8201193
>>>
>>> Thanks,
>>> Vladimir
>>>
>>>
>>> On 4/5/18 12:19 AM, Rohit Arul Raj wrote:
>>>>
>>>>
>>>> Hi All,
>>>>
>>>> I was going through the C2 object initialization (zeroing) code based
>>>> on the below bug entry:
>>>> https://bugs.openjdk.java.net/browse/JDK-8146801
>>>>
>>>> Right now, for longer lengths we use "rep stos" instructions on x86. I
>>>> was experimenting with using XMM/YMM registers (on AMD EPYC processor)
>>>> and found that they do improve performance for certain lengths:
>>>>
>>>> For lengths > 64 bytes and up to 512 bytes: the improvement is in the
>>>> range of 8% to 44%.
>>>> For lengths > 512 bytes: some lengths show a slight improvement in the
>>>> range of 2% to 7%; others are almost the same as the "rep stos"
>>>> numbers.
>>>>
>>>> I have attached the complete performance data (data.txt) for reference.
>>>> Can we add this as a user option similar to UseXMMForArrayCopy?
>>>>
>>>> I have used the same test case as in
>>>> (http://cr.openjdk.java.net/~shade/8146801/benchmarks.jar) with
>>>> additional sizes.
>>>>
>>>> Initial Patch:
>>>> I haven't added the check for 32-bit mode as I need some help with the
>>>> code (description given below the patch).
>>>> The code is similar to the one used in array copy stubs
>>>> (copy_bytes_forward).
>>>>
>>>> diff --git a/src/hotspot/cpu/x86/globals_x86.hpp
>>>> b/src/hotspot/cpu/x86/globals_x86.hpp
>>>> --- a/src/hotspot/cpu/x86/globals_x86.hpp
>>>> +++ b/src/hotspot/cpu/x86/globals_x86.hpp
>>>> @@ -150,6 +150,9 @@
>>>> product(bool, UseUnalignedLoadStores, false,
>>>> \
>>>> "Use SSE2 MOVDQU instruction for Arraycopy")
>>>> \
>>>>
>>>> \
>>>> + product(bool, UseXMMForObjInit, false,
>>>> \
>>>> + "Use XMM/YMM MOVDQU instruction for Object Initialization")
>>>> \
>>>> +
>>>> \
>>>> product(bool, UseFastStosb, false,
>>>> \
>>>> "Use fast-string operation for zeroing: rep stosb")
>>>> \
>>>>
>>>> \
>>>> diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> @@ -7106,6 +7106,56 @@
>>>> if (UseFastStosb) {
>>>> shlptr(cnt, 3); // convert to number of bytes
>>>> rep_stosb();
>>>> + } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
>>>> + Label L_loop, L_sloop, L_check, L_tail, L_end;
>>>> + push(base);
>>>> + if (UseAVX >= 2)
>>>> + vpxor(xmm10, xmm10, xmm10, AVX_256bit);
>>>> + else
>>>> + vpxor(xmm10, xmm10, xmm10, AVX_128bit);
>>>> +
>>>> + jmp(L_check);
>>>> +
>>>> + BIND(L_loop);
>>>> + if (UseAVX >= 2) {
>>>> + vmovdqu(Address(base, 0), xmm10);
>>>> + vmovdqu(Address(base, 32), xmm10);
>>>> + } else {
>>>> + movdqu(Address(base, 0), xmm10);
>>>> + movdqu(Address(base, 16), xmm10);
>>>> + movdqu(Address(base, 32), xmm10);
>>>> + movdqu(Address(base, 48), xmm10);
>>>> + }
>>>> + addptr(base, 64);
>>>> +
>>>> + BIND(L_check);
>>>> + subptr(cnt, 8);
>>>> + jccb(Assembler::greaterEqual, L_loop);
>>>> + addptr(cnt, 4);
>>>> + jccb(Assembler::less, L_tail);
>>>> + // Copy trailing 32 bytes
>>>> + if (UseAVX >= 2) {
>>>> + vmovdqu(Address(base, 0), xmm10);
>>>> + } else {
>>>> + movdqu(Address(base, 0), xmm10);
>>>> + movdqu(Address(base, 16), xmm10);
>>>> + }
>>>> + addptr(base, 32);
>>>> + subptr(cnt, 4);
>>>> +
>>>> + BIND(L_tail);
>>>> + addptr(cnt, 4);
>>>> + jccb(Assembler::lessEqual, L_end);
>>>> + decrement(cnt);
>>>> +
>>>> + BIND(L_sloop);
>>>> + movptr(Address(base, 0), tmp);
>>>> + addptr(base, 8);
>>>> + decrement(cnt);
>>>> + jccb(Assembler::greaterEqual, L_sloop);
>>>> +
>>>> + BIND(L_end);
>>>> + pop(base);
>>>> } else {
>>>> NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words
>>>> for 32-bit VM
>>>> rep_stos();
>>>>
>>>>
>>>> When I use XMM0 as a temporary register, the micro-benchmark crashes.
>>>> Saving and Restoring the XMM0 register before and after use works
>>>> fine.
>>>>
>>>> Looking at the "hotspot/src/cpu/x86/vm/x86.ad" file, XMM0 as with
>>>> other XMM registers has been mentioned as Save-On-Call registers and
>>>> on Linux ABI, no register is preserved across function calls though
>>>> XMM0-XMM7 might hold parameters. So I assumed using XMM0 without
>>>> saving/restoring should be fine.
>>>>
>>>> Is it incorrect to use XMM* registers without saving/restoring them?
>>>> Using XMM10 register as temporary register works fine without having
>>>> to save and restore it.
>>>>
>>>> Please let me know your comments.
>>>>
>>>> Regards,
>>>> Rohit
>>>>
>>>
>
-------------- next part --------------
+------+-----------+------------------------------------------------------+--------------------------------------------------------------------------+
| | Array | Total | JDK11 trunk code ns/op | JDK11 trunk - YMM 64b loop ns/op (UseAVX=2) |
| S.No | Size | Size +-----------+----------+-----------+----------+--------------------------------------------------------------------------+
| | |in bytes| Const | variance | Variable | variance | Const | variance | Variable | variance | %diff Const | %diff Var |
+------+-----------+--------+-----------+----------+-----------+----------+--------------+----------+-----------+----------+-------------+-----------+
| 1 | 0 | 0 | 8.43 | 0.28 | 8.97 | 0.01 | 8.37 | 0.33 | 8.96 | 0.00 | 0.70% | 0.09% |
| 2 | 1 | 8 | 8.97 | 0.00 | 9.47 | 0.01 | 8.97 | 0.00 | 9.46 | 0.01 | 0.01% | 0.10% |
| 3 | 2 | 8 | 8.97 | 0.00 | 9.46 | 0.01 | 8.97 | 0.00 | 9.47 | 0.03 | 0.00% | -0.19% |
| 4 | 4 | 16 | 9.36 | 0.00 | 9.96 | 0.09 | 9.36 | 0.00 | 9.92 | 0.05 | -0.02% | 0.41% |
| 5 | 8 | 32 | 10.26 | 0.01 | 11.02 | 0.02 | 10.26 | 0.00 | 10.82 | 0.41 | -0.01% | 1.86% |
| 6 | 16 | 64 | 12.17 | 0.01 | 13.29 | 0.18 | 12.16 | 0.01 | 13.15 | 0.08 | 0.05% | 1.02% |
| 7 | 24 | 96 | 17.47 | 0.28 | 20.54 | 0.15 | 12.20 | 0.23 | 12.62 | 0.06 | 30.15% | 38.56% |<==
| 8 | 32 | 128 | 19.94 | 0.28 | 24.14 | 0.11 | 15.27 | 0.05 | 15.51 | 0.06 | 23.41% | 35.75% |
| 9 | 40 | 160 | 22.68 | 0.11 | 27.66 | 1.23 | 17.30 | 0.03 | 17.47 | 0.03 | 23.73% | 36.83% |
| 10 | 56 | 224 | 31.28 | 0.20 | 36.04 | 0.32 | 26.35 | 0.28 | 26.82 | 0.20 | 15.75% | 25.57% |
| 11 | 64 | 256 | 38.62 | 0.42 | 62.32 | 0.15 | 34.48 | 4.25 | 34.42 | 1.24 | 10.71% | 44.77% |
| 12 | 96 | 384 | 70.78 | 0.16 | 70.89 | 0.29 | 57.70 | 0.06 | 59.16 | 0.06 | 18.48% | 16.55% |
| 13 | 128 | 512 | 77.92 | 0.44 | 78.54 | 0.45 | 77.71 | 0.10 | 76.35 | 0.14 | 0.28% | 2.78% |<==
| 14 | 136 | 544 | 80.49 | 0.14 | 82.03 | 0.17 | 76.95 | 4.72 | 79.51 | 5.65 | 4.40% | 3.07% |
| 15 | 256 | 1 KB | 131.03 | 0.23 | 132.21 | 0.40 | 128.17 | 3.01 | 129.65 | 0.65 | 2.18% | 1.93% |
| 16 | 512 | 2 KB | 249.43 | 1.91 | 252.00 | 2.45 | 247.95 | 1.26 | 249.26 | 3.46 | 0.60% | 1.09% |
| 17 | 808 | 3 KB | 411.56 | 4.11 | 412.80 | 1.05 | 403.64 | 1.06 | 399.41 | 1.55 | 1.92% | 3.24% |
| 18 | 1024 | 4 KB | 493.10 | 5.21 | 496.55 | 0.45 | 471.13 | 0.72 | 486.47 | 0.46 | 4.46% | 2.03% |
| 19 | 2048 | 8 KB | 932.67 | 2.80 | 927.03 | 1.30 | 916.10 | 0.89 | 925.54 | 1.29 | 1.78% | 0.16% |
| 20 | 4096 | 16 KB | 1788.73 | 7.96 | 1798.35 | 2.64 | 1785.60 | 5.71 | 1805.24 | 1.74 | 0.18% | -0.38% |
| 21 | 8192 | 32 KB | 3492.33 | 3.17 | 3503.26 | 4.34 | 3500.11 | 3.48 | 3491.26 | 2.26 | -0.22% | 0.34% |
| 22 | 16384 | 64 KB | 7033.47 | 13.10 | 7016.00 | 4.89 | 6997.37 | 4.47 | 6980.20 | 8.71 | 0.51% | 0.51% |
| 23 | 32768 | 128KB | 14001.69 | 21.46 | 13995.99 | 23.87 | 13953.19 | 25.24 | 13985.93 | 158.40 | 0.35% | 0.07% |
| 24 | 65536 | 256KB | 28077.64 | 31.21 | 27824.47 | 99.05 | 27928.072 | 17.93 | 27946.04 | 29.16 | 0.53% | -0.44% |
| 25 | 131072 | 512KB | 50649.21 | 266.61 | 51081.06 | 664.10 | 50758.384 | 182.51 | 51237.59 | 359.25 | -0.22% | -0.31% |
| 26 | 262144 | 1 MB | 101764.54 | 3618.86 | 103088.84 | 3324.95 | 102189.665 | 3482.84 | 101844.83 | 3659.51 | -0.42% | 1.21% |
| 27 | 524288 | 2 MB | 227482.10 | 475.35 | 226808.13 | 611.68 | 227001.467 | 400.84 | 224605.92 | 492.21 | 0.21% | 0.97% |
| 28 | 1048576 | 4 MB | 447029.14 | 691.55 | 448340.14 | 806.06 | 448135.881 | 615.82 | 450134.22 | 687.74 | -0.25% | -0.40% |
| 29 | 2097152 | 8 MB | 883839.80 | 1305.72 | 887155.13 | 422.39 | 887501.62 | 1045.55 | 888845.74 | 1289.69 | -0.41% | -0.19% |
+------+-----------+--------+-----------+----------+-----------+----------+--------------+----------+-----------+----------+-------------+-----------+
More information about the hotspot-dev
mailing list