RFC: C2 Object Initialization - Using XMM/YMM registers

Thu May 31 04:55:51 UTC 2018

Thanks Vladimir,

I made the changes as you had suggested and it works now.
Please find attached the updated patch, relevant test case as well as
the micro-benchmark performance data.
Sorry for the delay.

**************** P A T C H **************

diff --git a/src/hotspot/cpu/x86/globals_x86.hpp
b/src/hotspot/cpu/x86/globals_x86.hpp
--- a/src/hotspot/cpu/x86/globals_x86.hpp
+++ b/src/hotspot/cpu/x86/globals_x86.hpp
@@ -150,6 +150,9 @@
   product(bool, UseUnalignedLoadStores, false,                              \
           "Use SSE2 MOVDQU instruction for Arraycopy")                      \
                                                                             \
+  product(bool, UseXMMForObjInit, false,                                    \
+          "Use XMM/YMM MOVDQU instruction for Object Initialization")       \
+                                                                            \
   product(bool, UseFastStosb, false,                                        \
           "Use fast-string operation for zeroing: rep stosb")               \
                                                                             \
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@@ -6775,7 +6775,58 @@

 }

-void MacroAssembler::clear_mem(Register base, Register cnt, Register
tmp, bool is_large) {
+// clear memory of size 'cnt' qwords, starting at 'base' using
XMM/YMM registers
+void MacroAssembler::xmm_clear_mem(Register base, Register cnt,
XMMRegister xtmp) {
+  // cnt - number of qwords (8-byte words).
+  // base - start address, qword aligned.
+  Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
+  if (UseAVX >= 2)
+    vpxor(xtmp, xtmp, xtmp, AVX_256bit);
+  else
+    vpxor(xtmp, xtmp, xtmp, AVX_128bit);
+  jmp(L_zero_64_bytes);
+
+  BIND(L_loop);
+  if (UseAVX >= 2) {
+    vmovdqu(Address(base,  0), xtmp);
+    vmovdqu(Address(base, 32), xtmp);
+  } else {
+    movdqu(Address(base,  0), xtmp);
+    movdqu(Address(base, 16), xtmp);
+    movdqu(Address(base, 32), xtmp);
+    movdqu(Address(base, 48), xtmp);
+  }
+  addptr(base, 64);
+
+  BIND(L_zero_64_bytes);
+  subptr(cnt, 8);
+  jccb(Assembler::greaterEqual, L_loop);
+  addptr(cnt, 4);
+  jccb(Assembler::less, L_tail);
+  // Copy trailing 32 bytes
+  if (UseAVX >= 2) {
+    vmovdqu(Address(base, 0), xtmp);
+  } else {
+    movdqu(Address(base,  0), xtmp);
+    movdqu(Address(base, 16), xtmp);
+  }
+  addptr(base, 32);
+  subptr(cnt, 4);
+
+  BIND(L_tail);
+  addptr(cnt, 4);
+  jccb(Assembler::lessEqual, L_end);
+  decrement(cnt);
+
+  BIND(L_sloop);
+  movq(Address(base, 0), xtmp);
+  addptr(base, 8);
+  decrement(cnt);
+  jccb(Assembler::greaterEqual, L_sloop);
+  BIND(L_end);
+}
+
+void MacroAssembler::clear_mem(Register base, Register cnt, Register
tmp, XMMRegister xtmp, bool is_large) {
   // cnt - number of qwords (8-byte words).
   // base - start address, qword aligned.
   // is_large - if optimizers know cnt is larger than InitArrayShortSize
@@ -6787,7 +6838,9 @@

   Label DONE;

-  xorptr(tmp, tmp);
+  if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+    xorptr(tmp, tmp);
+  }

   if (!is_large) {
     Label LOOP, LONG;
@@ -6813,6 +6866,9 @@
   if (UseFastStosb) {
     shlptr(cnt, 3); // convert to number of bytes
     rep_stosb();
+  } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+    movptr(tmp, base);
+    xmm_clear_mem(tmp, cnt, xtmp);
   } else {
     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words
for 32-bit VM
     rep_stos();
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -1578,7 +1578,10 @@

   // clear memory of size 'cnt' qwords, starting at 'base';
   // if 'is_large' is set, do not try to produce short loop
-  void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
+  void clear_mem(Register base, Register cnt, Register rtmp,
XMMRegister xtmp, bool is_large);
+
+  // clear memory of size 'cnt' qwords, starting at 'base' using
XMM/YMM registers
+  void xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp);

 #ifdef COMPILER2
   void string_indexof_char(Register str1, Register cnt1, Register ch,
Register result,
diff --git a/src/hotspot/cpu/x86/x86_32.ad b/src/hotspot/cpu/x86/x86_32.ad
--- a/src/hotspot/cpu/x86/x86_32.ad
+++ b/src/hotspot/cpu/x86/x86_32.ad
@@ -11482,13 +11482,15 @@

 // =======================================================================
 // fast clearing of an array
-instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe
dummy, eFlagsReg cr) %{
+instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero,
Universe dummy, eFlagsReg cr) %{
   predicate(!((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
-  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);

   format %{ $$template
-    $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
+    if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+      $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
+    }
     $$emit$$"CMP    InitArrayShortSize,rcx\n\t"
     $$emit$$"JG     LARGE\n\t"
     $$emit$$"SHL    ECX, 1\n\t"
@@ -11502,6 +11504,32 @@
     if (UseFastStosb) {
        $$emit$$"SHL    ECX,3\t# Convert doublewords to bytes\n\t"
        $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+    } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+       $$emit$$"MOV     RDI,RAX\n\t"
+       $$emit$$"VPXOR   YMM0,YMM0,YMM0\n\t"
+       $$emit$$"JMPQ    L_zero_64_bytes\n\t"
+       $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+       $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+       $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
+       $$emit$$"ADD     0x40,RAX\n\t"
+       $$emit$$"# L_zero_64_bytes:\n\t"
+       $$emit$$"SUB     0x8,RCX\n\t"
+       $$emit$$"JGE     L_loop\n\t"
+       $$emit$$"ADD     0x4,RCX\n\t"
+       $$emit$$"JL      L_tail\n\t"
+       $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+       $$emit$$"ADD     0x20,RAX\n\t"
+       $$emit$$"SUB     0x4,RCX\n\t"
+       $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+       $$emit$$"ADD     0x4,RCX\n\t"
+       $$emit$$"JLE     L_end\n\t"
+       $$emit$$"DEC     RCX\n\t"
+       $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+       $$emit$$"VMOVQ   XMM0,(RAX)\n\t"
+       $$emit$$"ADD     0x8,RAX\n\t"
+       $$emit$$"DEC     RCX\n\t"
+       $$emit$$"JGE     L_sloop\n\t"
+       $$emit$$"# L_end:\n\t"
     } else {
        $$emit$$"SHL    ECX,1\t# Convert doublewords to words\n\t"
        $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
@@ -11509,20 +11537,49 @@
     $$emit$$"# DONE"
   %}
   ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero,
Universe dummy, eFlagsReg cr) %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+                 $tmp$$XMMRegister, false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI
zero, Universe dummy, eFlagsReg cr) %{
   predicate(((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
-  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
   format %{ $$template
-    $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
+    if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+       $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
+    }
     if (UseFastStosb) {
        $$emit$$"SHL    ECX,3\t# Convert doublewords to bytes\n\t"
        $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+    } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+       $$emit$$"MOV     RDI,RAX\n\t"
+       $$emit$$"VPXOR   YMM0,YMM0,YMM0\n\t"
+       $$emit$$"JMPQ    L_zero_64_bytes\n\t"
+       $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+       $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+       $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
+       $$emit$$"ADD     0x40,RAX\n\t"
+       $$emit$$"# L_zero_64_bytes:\n\t"
+       $$emit$$"SUB     0x8,RCX\n\t"
+       $$emit$$"JGE     L_loop\n\t"
+       $$emit$$"ADD     0x4,RCX\n\t"
+       $$emit$$"JL      L_tail\n\t"
+       $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+       $$emit$$"ADD     0x20,RAX\n\t"
+       $$emit$$"SUB     0x4,RCX\n\t"
+       $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+       $$emit$$"ADD     0x4,RCX\n\t"
+       $$emit$$"JLE     L_end\n\t"
+       $$emit$$"DEC     RCX\n\t"
+       $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+       $$emit$$"VMOVQ   XMM0,(RAX)\n\t"
+       $$emit$$"ADD     0x8,RAX\n\t"
+       $$emit$$"DEC     RCX\n\t"
+       $$emit$$"JGE     L_sloop\n\t"
+       $$emit$$"# L_end:\n\t"
     } else {
        $$emit$$"SHL    ECX,1\t# Convert doublewords to words\n\t"
        $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
@@ -11530,7 +11587,8 @@
     $$emit$$"# DONE"
   %}
   ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+                 $tmp$$XMMRegister, true);
   %}
   ins_pipe( pipe_slow );
 %}
diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -10625,15 +10625,17 @@

 // =======================================================================
 // fast clearing of an array
-instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
-                  rFlagsReg cr)
+instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
+                  Universe dummy, rFlagsReg cr)
 %{
   predicate(!((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
-  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);

   format %{ $$template
-    $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
+    if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+       $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
+    }
     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
     $$emit$$"jg      LARGE\n\t"
     $$emit$$"dec     rcx\n\t"
@@ -10646,35 +10648,91 @@
     if (UseFastStosb) {
        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
+    } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+       $$emit$$"mov     rdi,rax\n\t"
+       $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
+       $$emit$$"jmpq    L_zero_64_bytes\n\t"
+       $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+       $$emit$$"vmovdqu ymm0,(rax)\n\t"
+       $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
+       $$emit$$"add     0x40,rax\n\t"
+       $$emit$$"# L_zero_64_bytes:\n\t"
+       $$emit$$"sub     0x8,rcx\n\t"
+       $$emit$$"jge     L_loop\n\t"
+       $$emit$$"add     0x4,rcx\n\t"
+       $$emit$$"jl      L_tail\n\t"
+       $$emit$$"vmovdqu ymm0,(rax)\n\t"
+       $$emit$$"add     0x20,rax\n\t"
+       $$emit$$"sub     0x4,rcx\n\t"
+       $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+       $$emit$$"add     0x4,rcx\n\t"
+       $$emit$$"jle     L_end\n\t"
+       $$emit$$"dec     rcx\n\t"
+       $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+       $$emit$$"vmovq   xmm0,(rax)\n\t"
+       $$emit$$"add     0x8,rax\n\t"
+       $$emit$$"dec     rcx\n\t"
+       $$emit$$"jge     L_sloop\n\t"
+       $$emit$$"# L_end:\n\t"
     } else {
        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
     }
     $$emit$$"# DONE"
   %}
   ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+                 $tmp$$XMMRegister, false);
   %}
   ins_pipe(pipe_slow);
 %}

-instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero,
Universe dummy,
-                  rFlagsReg cr)
+instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
+                        Universe dummy, rFlagsReg cr)
 %{
   predicate(((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
-  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);

   format %{ $$template
-    $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
+    if (!is_large || !(UseXMMForObjInit && UseUnalignedLoadStores)) {
+      $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
+    }
     if (UseFastStosb) {
        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
+    } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
+       $$emit$$"mov     rdi,rax\n\t"
+       $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
+       $$emit$$"jmpq    L_zero_64_bytes\n\t"
+       $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+       $$emit$$"vmovdqu ymm0,(rax)\n\t"
+       $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
+       $$emit$$"add     0x40,rax\n\t"
+       $$emit$$"# L_zero_64_bytes:\n\t"
+       $$emit$$"sub     0x8,rcx\n\t"
+       $$emit$$"jge     L_loop\n\t"
+       $$emit$$"add     0x4,rcx\n\t"
+       $$emit$$"jl      L_tail\n\t"
+       $$emit$$"vmovdqu ymm0,(rax)\n\t"
+       $$emit$$"add     0x20,rax\n\t"
+       $$emit$$"sub     0x4,rcx\n\t"
+       $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+       $$emit$$"add     0x4,rcx\n\t"
+       $$emit$$"jle     L_end\n\t"
+       $$emit$$"dec     rcx\n\t"
+       $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+       $$emit$$"vmovq   xmm0,(rax)\n\t"
+       $$emit$$"add     0x8,rax\n\t"
+       $$emit$$"dec     rcx\n\t"
+       $$emit$$"jge     L_sloop\n\t"
+       $$emit$$"# L_end:\n\t"
     } else {
        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
     }
   %}
   ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+                 $tmp$$XMMRegister, true);
   %}
   ins_pipe(pipe_slow);
 %}


*********************** END of P A T C H *******************


Generated assembly code after change:
------------------------------------------------------
 0x00002b771c0016e4: mov    %rdx,%rdi
 0x00002b771c0016e7: add    $0x10,%rdi
 0x00002b771c0016eb: mov    $0x14,%ecx
 0x00002b771c0016f0: mov    %rdi,%rax
 0x00002b771c0016f3: vpxor  %ymm0,%ymm0,%ymm0
 0x00002b771c0016f7: jmpq   0x00002b771c001709
 0x00002b771c0016fc: vmovdqu %ymm0,(%rax)
 0x00002b771c001700: vmovdqu %ymm0,0x20(%rax)
 0x00002b771c001705: add    $0x40,%rax
 0x00002b771c001709: sub    $0x8,%rcx
 0x00002b771c00170d: jge    0x00002b771c0016fc
 0x00002b771c00170f: add    $0x4,%rcx
 0x00002b771c001713: jl     0x00002b771c001721
 0x00002b771c001715: vmovdqu %ymm0,(%rax)
 0x00002b771c001719: add    $0x20,%rax
 0x00002b771c00171d: sub    $0x4,%rcx
 0x00002b771c001721: add    $0x4,%rcx
 0x00002b771c001725: jle    0x00002b771c001737
 0x00002b771c001727: dec    %rcx
 0x00002b771c00172a: vmovq  %xmm0,(%rax)
 0x00002b771c00172e: add    $0x8,%rax
 0x00002b771c001732: dec    %rcx
 0x00002b771c001735: jge    0x00002b771c00172a
 0x00002b771c001737:


I have done regression testing (changeset:
50250:04f9bb270ab8/24May2018) on 32-bit as well as 64-bit builds and
didn't find any regressions.
$make run-test TEST="tier1 tier2" JTREG="JOBS=1"
CONF=linux-x86_64-normal-server-release

Please let me know your comments.

Regards,
Rohit



On Tue, Apr 24, 2018 at 12:33 AM, Vladimir Kozlov
<vladimir.kozlov at oracle.com> wrote:
> Sorry for delay.
>
> In general you can't use arbitrary registers without letting the JIT
> compilers know that you use them. It will definitely cause problems.
> You need to pass it as an additional XMMRegister argument and describe it as
> TEMP in the .ad files.
>
> See byte_array_inflate() as example.
>
>
> On 4/11/18 7:25 PM, Rohit Arul Raj wrote:
>>>>
>>>> When I use XMM0 as a temporary register, the micro-benchmark crashes.
>>>> Saving and Restoring the XMM0 register before and after use works
>>>> fine.
>>>>
>>>> Looking at the "hotspot/src/cpu/x86/vm/x86.ad" file, XMM0, like the
>>>> other XMM registers, is listed as a Save-On-Call register, and under
>>>> the Linux ABI no XMM register is preserved across function calls, though
>>>> XMM0-XMM7 might hold parameters. So I assumed using XMM0 without
>>>> saving/restoring should be fine.
>>>>
>>>> Is it incorrect to use XMM* registers without saving/restoring them?
>>>> Using XMM10 register as temporary register works fine without having
>>>> to save and restore it.
>>
>>
>> Any comments/suggestions on the usage of XMM* registers?
>>
>> Thanks,
>> Rohit
>>
>> On Thu, Apr 5, 2018 at 11:38 PM, Vladimir Kozlov
>> <vladimir.kozlov at oracle.com> wrote:
>>>
>>> Good suggestion, Rohit
>>>
>>> I created a new RFE. Please add your suggestion and performance data there:
>>>
>>> https://bugs.openjdk.java.net/browse/JDK-8201193
>>>
>>> Thanks,
>>> Vladimir
>>>
>>>
>>> On 4/5/18 12:19 AM, Rohit Arul Raj wrote:
>>>>
>>>>
>>>> Hi All,
>>>>
>>>> I was going through the C2 object initialization (zeroing) code based
>>>> on the below bug entry:
>>>> https://bugs.openjdk.java.net/browse/JDK-8146801
>>>>
>>>> Right now, for longer lengths we use "rep stos" instructions on x86. I
>>>> was experimenting with using XMM/YMM registers (on AMD EPYC processor)
>>>> and found that they do improve performance for certain lengths:
>>>>
>>>> For lengths > 64 bytes up to 512 bytes: improvement is in the range
>>>> of 8% to 44%.
>>>> For lengths > 512 bytes: some lengths show a slight improvement in
>>>> the range of 2% to 7%; others are almost the same as the "rep stos"
>>>> numbers.
>>>>
>>>> I have attached the complete performance data (data.txt) for reference.
>>>> Can we add this as a user option similar to UseXMMForArrayCopy?
>>>>
>>>> I have used the same test case as in
>>>> (http://cr.openjdk.java.net/~shade/8146801/benchmarks.jar) with
>>>> additional sizes.
>>>>
>>>> Initial Patch:
>>>> I haven't added the check for 32-bit mode as I need some help with the
>>>> code (description given below the patch).
>>>> The code is similar to the one used in array copy stubs
>>>> (copy_bytes_forward).
>>>>
>>>> diff --git a/src/hotspot/cpu/x86/globals_x86.hpp
>>>> b/src/hotspot/cpu/x86/globals_x86.hpp
>>>> --- a/src/hotspot/cpu/x86/globals_x86.hpp
>>>> +++ b/src/hotspot/cpu/x86/globals_x86.hpp
>>>> @@ -150,6 +150,9 @@
>>>>      product(bool, UseUnalignedLoadStores, false,
>>>> \
>>>>              "Use SSE2 MOVDQU instruction for Arraycopy")
>>>> \
>>>>
>>>> \
>>>> +  product(bool, UseXMMForObjInit, false,
>>>> \
>>>> +          "Use XMM/YMM MOVDQU instruction for Object Initialization")
>>>> \
>>>> +
>>>> \
>>>>      product(bool, UseFastStosb, false,
>>>> \
>>>>              "Use fast-string operation for zeroing: rep stosb")
>>>> \
>>>>
>>>> \
>>>> diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
>>>> @@ -7106,6 +7106,56 @@
>>>>      if (UseFastStosb) {
>>>>        shlptr(cnt, 3); // convert to number of bytes
>>>>        rep_stosb();
>>>> +  } else if (UseXMMForObjInit && UseUnalignedLoadStores) {
>>>> +    Label L_loop, L_sloop, L_check, L_tail, L_end;
>>>> +    push(base);
>>>> +    if (UseAVX >= 2)
>>>> +      vpxor(xmm10, xmm10, xmm10, AVX_256bit);
>>>> +    else
>>>> +      vpxor(xmm10, xmm10, xmm10, AVX_128bit);
>>>> +
>>>> +    jmp(L_check);
>>>> +
>>>> +    BIND(L_loop);
>>>> +    if (UseAVX >= 2) {
>>>> +      vmovdqu(Address(base,  0), xmm10);
>>>> +      vmovdqu(Address(base, 32), xmm10);
>>>> +    } else {
>>>> +      movdqu(Address(base,  0), xmm10);
>>>> +      movdqu(Address(base, 16), xmm10);
>>>> +      movdqu(Address(base, 32), xmm10);
>>>> +      movdqu(Address(base, 48), xmm10);
>>>> +    }
>>>> +    addptr(base, 64);
>>>> +
>>>> +    BIND(L_check);
>>>> +    subptr(cnt, 8);
>>>> +    jccb(Assembler::greaterEqual, L_loop);
>>>> +    addptr(cnt, 4);
>>>> +    jccb(Assembler::less, L_tail);
>>>> +    // Copy trailing 32 bytes
>>>> +    if (UseAVX >= 2) {
>>>> +      vmovdqu(Address(base, 0), xmm10);
>>>> +    } else {
>>>> +      movdqu(Address(base,  0), xmm10);
>>>> +      movdqu(Address(base, 16), xmm10);
>>>> +    }
>>>> +    addptr(base, 32);
>>>> +    subptr(cnt, 4);
>>>> +
>>>> +    BIND(L_tail);
>>>> +    addptr(cnt, 4);
>>>> +    jccb(Assembler::lessEqual, L_end);
>>>> +    decrement(cnt);
>>>> +
>>>> +    BIND(L_sloop);
>>>> +    movptr(Address(base, 0), tmp);
>>>> +    addptr(base, 8);
>>>> +    decrement(cnt);
>>>> +    jccb(Assembler::greaterEqual, L_sloop);
>>>> +
>>>> +    BIND(L_end);
>>>> +    pop(base);
>>>>      } else {
>>>>        NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words
>>>> for 32-bit VM
>>>>        rep_stos();
>>>>
>>>>
>>>> When I use XMM0 as a temporary register, the micro-benchmark crashes.
>>>> Saving and Restoring the XMM0 register before and after use works
>>>> fine.
>>>>
>>>> Looking at the "hotspot/src/cpu/x86/vm/x86.ad" file, XMM0, like the
>>>> other XMM registers, is listed as a Save-On-Call register, and under
>>>> the Linux ABI no XMM register is preserved across function calls, though
>>>> XMM0-XMM7 might hold parameters. So I assumed using XMM0 without
>>>> saving/restoring should be fine.
>>>>
>>>> Is it incorrect to use XMM* registers without saving/restoring them?
>>>> Using XMM10 register as temporary register works fine without having
>>>> to save and restore it.
>>>>
>>>> Please let me know your comments.
>>>>
>>>> Regards,
>>>> Rohit
>>>>
>>>
>
-------------- next part --------------
+------+-----------+------------------------------------------------------+--------------------------------------------------------------------------+
|      |   Array   | Total  |         JDK11 trunk code ns/op              |              JDK11 trunk - YMM 64b loop ns/op (UseAVX=2)                 |
| S.No |   Size    | Size   +-----------+----------+-----------+----------+--------------------------------------------------------------------------+
|      |           |in bytes|   Const   | variance |  Variable | variance |    Const     | variance |  Variable | variance | %diff Const | %diff Var |
+------+-----------+--------+-----------+----------+-----------+----------+--------------+----------+-----------+----------+-------------+-----------+ 
|   1  |     0     |   0    |   8.43    |   0.28   |    8.97   |   0.01   |     8.37     |   0.33   |    8.96   |   0.00   |    0.70%    |   0.09%   |
|   2  |     1     |   8    |   8.97    |   0.00   |    9.47   |   0.01   |     8.97     |   0.00   |    9.46   |   0.01   |    0.01%    |   0.10%   |
|   3  |     2     |   8    |   8.97    |   0.00   |    9.46   |   0.01   |     8.97     |   0.00   |    9.47   |   0.03   |    0.00%    |   -0.19%  |
|   4  |     4     |   16   |   9.36    |   0.00   |    9.96   |   0.09   |     9.36     |   0.00   |    9.92   |   0.05   |    -0.02%   |   0.41%   |
|   5  |     8     |   32   |   10.26   |   0.01   |   11.02   |   0.02   |    10.26     |   0.00   |   10.82   |   0.41   |    -0.01%   |   1.86%   |
|   6  |     16    |   64   |   12.17   |   0.01   |   13.29   |   0.18   |    12.16     |   0.01   |   13.15   |   0.08   |    0.05%    |   1.02%   |
|   7  |     24    |   96   |   17.47   |   0.28   |   20.54   |   0.15   |    12.20     |   0.23   |   12.62   |   0.06   |    30.15%   |   38.56%  |<==
|   8  |     32    |  128   |   19.94   |   0.28   |   24.14   |   0.11   |    15.27     |   0.05   |   15.51   |   0.06   |    23.41%   |   35.75%  |
|   9  |     40    |  160   |   22.68   |   0.11   |   27.66   |   1.23   |    17.30     |   0.03   |   17.47   |   0.03   |    23.73%   |   36.83%  |
|  10  |     56    |  224   |   31.28   |   0.20   |   36.04   |   0.32   |    26.35     |   0.28   |   26.82   |   0.20   |    15.75%   |   25.57%  |
|  11  |     64    |  256   |   38.62   |   0.42   |   62.32   |   0.15   |    34.48     |   4.25   |   34.42   |   1.24   |    10.71%   |   44.77%  |
|  12  |     96    |  384   |   70.78   |   0.16   |   70.89   |   0.29   |    57.70     |   0.06   |   59.16   |   0.06   |    18.48%   |   16.55%  |
|  13  |    128    |  512   |   77.92   |   0.44   |   78.54   |   0.45   |    77.71     |   0.10   |   76.35   |   0.14   |    0.28%    |   2.78%   |<==
|  14  |    136    |  544   |   80.49   |   0.14   |   82.03   |   0.17   |    76.95     |   4.72   |   79.51   |   5.65   |    4.40%    |   3.07%   |
|  15  |    256    |  1 KB  |  131.03   |   0.23   |   132.21  |   0.40   |    128.17    |   3.01   |   129.65  |   0.65   |    2.18%    |   1.93%   |
|  16  |    512    |  2 KB  |  249.43   |   1.91   |   252.00  |   2.45   |    247.95    |   1.26   |   249.26  |   3.46   |    0.60%    |   1.09%   |
|  17  |    808    |  3 KB  |  411.56   |   4.11   |   412.80  |   1.05   |    403.64    |   1.06   |   399.41  |   1.55   |    1.92%    |   3.24%   |
|  18  |    1024   |  4 KB  |  493.10   |   5.21   |   496.55  |   0.45   |    471.13    |   0.72   |   486.47  |   0.46   |    4.46%    |   2.03%   |
|  19  |    2048   |  8 KB  |  932.67   |   2.80   |   927.03  |   1.30   |    916.10    |   0.89   |   925.54  |   1.29   |    1.78%    |   0.16%   |
|  20  |    4096   | 16 KB  |  1788.73  |   7.96   |  1798.35  |   2.64   |   1785.60    |   5.71   |  1805.24  |   1.74   |    0.18%    |   -0.38%  |
|  21  |    8192   | 32 KB  |  3492.33  |   3.17   |  3503.26  |   4.34   |   3500.11    |   3.48   |  3491.26  |   2.26   |    -0.22%   |   0.34%   |
|  22  |   16384   | 64 KB  |  7033.47  |   13.10  |  7016.00  |   4.89   |   6997.37    |   4.47   |  6980.20  |   8.71   |    0.51%    |   0.51%   |
|  23  |   32768   | 128KB  | 14001.69  |   21.46  |  13995.99 |   23.87  |   13953.19   |   25.24  |  13985.93 |  158.40  |    0.35%    |   0.07%   |
|  24  |   65536   | 256KB  | 28077.64  |   31.21  |  27824.47 |   99.05  |  27928.072   |   17.93  |  27946.04 |   29.16  |    0.53%    |   -0.44%  |
|  25  |   131072  | 512KB  | 50649.21  |  266.61  |  51081.06 |  664.10  |  50758.384   |  182.51  |  51237.59 |  359.25  |    -0.22%   |   -0.31%  |
|  26  |   262144  |  1 MB  | 101764.54 |  3618.86 | 103088.84 |  3324.95 |  102189.665  |  3482.84 | 101844.83 |  3659.51 |    -0.42%   |   1.21%   |
|  27  |   524288  |  2 MB  | 227482.10 |  475.35  | 226808.13 |  611.68  |  227001.467  |  400.84  | 224605.92 |  492.21  |    0.21%    |   0.97%   |
|  28  |  1048576  |  4 MB  | 447029.14 |  691.55  | 448340.14 |  806.06  |  448135.881  |  615.82  | 450134.22 |  687.74  |    -0.25%   |   -0.40%  |
|  29  |  2097152  |  8 MB  | 883839.80 |  1305.72 | 887155.13 |  422.39  |  887501.62   |  1045.55 | 888845.74 |  1289.69 |    -0.41%   |   -0.19%  |
+------+-----------+--------+-----------+----------+-----------+----------+--------------+----------+-----------+----------+-------------+-----------+