RFR: Newer AMD 17h (EPYC) Processor family defaults

Mon Oct 16 14:26:40 UTC 2017

Hi,

I ran into a problem touching this area, so I'm hijacking this thread.

 > #ifdef COMPILER2
 > - if (MaxVectorSize > 16) {
 > - // Limit vectors size to 16 bytes on current AMD cpus.

> +    if (cpu_family() < 0x17 && MaxVectorSize > 16) {
> +      // Limit vectors size to 16 bytes on AMD cpus < 17h.
>        FLAG_SET_DEFAULT(MaxVectorSize, 16);
>      }
>  #endif // COMPILER2

The limitation of MaxVectorSize to 16 for some processors in this code
has the side effect that TypeVect::VECTY and mreg2type[Op_VecY] won't
be initialized even though the platform has the capability.

Type.cpp:~660

[...]
 >   if (Matcher::vector_size_supported(T_FLOAT,4)) {
 >     TypeVect::VECTX = TypeVect::make(T_FLOAT,4);
 >   }
 >   if (Matcher::vector_size_supported(T_FLOAT,8)) {
 >     TypeVect::VECTY = TypeVect::make(T_FLOAT,8);
 >   }
 >   if (Matcher::vector_size_supported(T_FLOAT,16)) {
 >     TypeVect::VECTZ = TypeVect::make(T_FLOAT,16);
 >   }
[...]
 >   mreg2type[Op_VecX] = TypeVect::VECTX;
 >   mreg2type[Op_VecY] = TypeVect::VECTY;
 >   mreg2type[Op_VecZ] = TypeVect::VECTZ;

In the ad-files, feature flags (UseAVX etc.) are used to control which
rules should be matched when they affect specific vector registers.
Here we have a mismatch.

On a platform that supports AVX2 but has MaxVectorSize limited to 16,
the VM will fail in regalloc because TypeVect::VECTY/mreg2type[Op_VecY]
is uninitialized. We will also hit asserts in a few places, like:
assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecY),
"sanity");

Shouldn't the type initialization in type.cpp be dependent on the feature
flags (UseAVX etc.) instead of MaxVectorSize? (The types for the vector
registers would be initialized if the platform supports them, but they
might not be used if MaxVectorSize is limited.)

I suggest something like this:

http://cr.openjdk.java.net/~neliasso/maxvectorsize/webrev/

I will open a bug and a separate RFR if this seems reasonable to you.

Regards,
Nils Eliasson

On 2017-09-22 09:41, Rohit Arul Raj wrote:
> Thanks Vladimir,
>
> On Wed, Sep 20, 2017 at 10:07 PM, Vladimir Kozlov
> <vladimir.kozlov at oracle.com> wrote:
>>>        __ cmpl(rax, 0x80000000);     // Is cpuid(0x80000001) supported?
>>>        __ jcc(Assembler::belowEqual, done);
>>>        __ cmpl(rax, 0x80000004);     // Is cpuid(0x80000005) supported?
>>> -    __ jccb(Assembler::belowEqual, ext_cpuid1);
>>> +   __ jcc(Assembler::belowEqual, ext_cpuid1);
>>
>> Good. You may need to increase size of the buffer too (to be safe) to 1100:
>>
>> static const int stub_size = 1000;
>>
> Please find the updated patch after the requested change.
>
> diff --git a/src/cpu/x86/vm/vm_version_x86.cpp
> b/src/cpu/x86/vm/vm_version_x86.cpp
> --- a/src/cpu/x86/vm/vm_version_x86.cpp
> +++ b/src/cpu/x86/vm/vm_version_x86.cpp
> @@ -46,7 +46,7 @@
>   address VM_Version::_cpuinfo_cont_addr = 0;
>
>   static BufferBlob* stub_blob;
> -static const int stub_size = 1000;
> +static const int stub_size = 1100;
>
>   extern "C" {
>     typedef void (*get_cpu_info_stub_t)(void*);
> @@ -70,7 +70,7 @@
>       bool use_evex = FLAG_IS_DEFAULT(UseAVX) || (UseAVX > 2);
>
>       Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
> -    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7,
> done, wrapup;
> +    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7,
> ext_cpuid8, done, wrapup;
>       Label legacy_setup, save_restore_except, legacy_save_restore,
> start_simd_check;
>
>       StubCodeMark mark(this, "VM_Version", "get_cpu_info_stub");
> @@ -267,14 +267,30 @@
>       __ cmpl(rax, 0x80000000);     // Is cpuid(0x80000001) supported?
>       __ jcc(Assembler::belowEqual, done);
>       __ cmpl(rax, 0x80000004);     // Is cpuid(0x80000005) supported?
> -    __ jccb(Assembler::belowEqual, ext_cpuid1);
> +    __ jcc(Assembler::belowEqual, ext_cpuid1);
>       __ cmpl(rax, 0x80000006);     // Is cpuid(0x80000007) supported?
>       __ jccb(Assembler::belowEqual, ext_cpuid5);
>       __ cmpl(rax, 0x80000007);     // Is cpuid(0x80000008) supported?
>       __ jccb(Assembler::belowEqual, ext_cpuid7);
> +    __ cmpl(rax, 0x80000008);     // Is cpuid(0x80000009 and above) supported?
> +    __ jccb(Assembler::belowEqual, ext_cpuid8);
> +    __ cmpl(rax, 0x8000001E);     // Is cpuid(0x8000001E) supported?
> +    __ jccb(Assembler::below, ext_cpuid8);
> +    //
> +    // Extended cpuid(0x8000001E)
> +    //
> +    __ movl(rax, 0x8000001E);
> +    __ cpuid();
> +    __ lea(rsi, Address(rbp, in_bytes(VM_Version::ext_cpuid1E_offset())));
> +    __ movl(Address(rsi, 0), rax);
> +    __ movl(Address(rsi, 4), rbx);
> +    __ movl(Address(rsi, 8), rcx);
> +    __ movl(Address(rsi,12), rdx);
> +
>       //
>       // Extended cpuid(0x80000008)
>       //
> +    __ bind(ext_cpuid8);
>       __ movl(rax, 0x80000008);
>       __ cpuid();
>       __ lea(rsi, Address(rbp, in_bytes(VM_Version::ext_cpuid8_offset())));
> @@ -1109,11 +1125,27 @@
>       }
>
>   #ifdef COMPILER2
> -    if (MaxVectorSize > 16) {
> -      // Limit vectors size to 16 bytes on current AMD cpus.
> +    if (cpu_family() < 0x17 && MaxVectorSize > 16) {
> +      // Limit vectors size to 16 bytes on AMD cpus < 17h.
>         FLAG_SET_DEFAULT(MaxVectorSize, 16);
>       }
>   #endif // COMPILER2
> +
> +    // Some defaults for AMD family 17h
> +    if ( cpu_family() == 0x17 ) {
> +      // On family 17h processors use XMM and UnalignedLoadStores for
> Array Copy
> +      if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
> +        FLAG_SET_DEFAULT(UseXMMForArrayCopy, true);
> +      }
> +      if (supports_sse2() && FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
> +        FLAG_SET_DEFAULT(UseUnalignedLoadStores, true);
> +      }
> +#ifdef COMPILER2
> +      if (supports_sse4_2() && FLAG_IS_DEFAULT(UseFPUForSpilling)) {
> +        FLAG_SET_DEFAULT(UseFPUForSpilling, true);
> +      }
> +#endif
> +    }
>     }
>
>     if( is_intel() ) { // Intel cpus specific settings
> diff --git a/src/cpu/x86/vm/vm_version_x86.hpp
> b/src/cpu/x86/vm/vm_version_x86.hpp
> --- a/src/cpu/x86/vm/vm_version_x86.hpp
> +++ b/src/cpu/x86/vm/vm_version_x86.hpp
> @@ -228,6 +228,15 @@
>       } bits;
>     };
>
> +  union ExtCpuid1EEbx {
> +    uint32_t value;
> +    struct {
> +      uint32_t                  : 8,
> +               threads_per_core : 8,
> +                                : 16;
> +    } bits;
> +  };
> +
>     union XemXcr0Eax {
>       uint32_t value;
>       struct {
> @@ -398,6 +407,12 @@
>       ExtCpuid8Ecx ext_cpuid8_ecx;
>       uint32_t     ext_cpuid8_edx; // reserved
>
> +    // cpuid function 0x8000001E // AMD 17h
> +    uint32_t      ext_cpuid1E_eax;
> +    ExtCpuid1EEbx ext_cpuid1E_ebx; // threads per core (AMD17h)
> +    uint32_t      ext_cpuid1E_ecx;
> +    uint32_t      ext_cpuid1E_edx; // unused currently
> +
>       // extended control register XCR0 (the XFEATURE_ENABLED_MASK register)
>       XemXcr0Eax   xem_xcr0_eax;
>       uint32_t     xem_xcr0_edx; // reserved
> @@ -505,6 +520,14 @@
>         result |= CPU_CLMUL;
>       if (_cpuid_info.sef_cpuid7_ebx.bits.rtm != 0)
>         result |= CPU_RTM;
> +    if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
> +       result |= CPU_ADX;
> +    if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
> +      result |= CPU_BMI2;
> +    if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
> +      result |= CPU_SHA;
> +    if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
> +      result |= CPU_FMA;
>
>       // AMD features.
>       if (is_amd()) {
> @@ -518,16 +541,8 @@
>       }
>       // Intel features.
>       if(is_intel()) {
> -      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
> -         result |= CPU_ADX;
> -      if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
> -        result |= CPU_BMI2;
> -      if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
> -        result |= CPU_SHA;
>         if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>           result |= CPU_LZCNT;
> -      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
> -        result |= CPU_FMA;
>         // for Intel, ecx.bits.misalignsse bit (bit 8) indicates
> support for prefetchw
>         if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>           result |= CPU_3DNOW_PREFETCH;
> @@ -590,6 +605,7 @@
>     static ByteSize ext_cpuid5_offset() { return
> byte_offset_of(CpuidInfo, ext_cpuid5_eax); }
>     static ByteSize ext_cpuid7_offset() { return
> byte_offset_of(CpuidInfo, ext_cpuid7_eax); }
>     static ByteSize ext_cpuid8_offset() { return
> byte_offset_of(CpuidInfo, ext_cpuid8_eax); }
> +  static ByteSize ext_cpuid1E_offset() { return
> byte_offset_of(CpuidInfo, ext_cpuid1E_eax); }
>     static ByteSize tpl_cpuidB0_offset() { return
> byte_offset_of(CpuidInfo, tpl_cpuidB0_eax); }
>     static ByteSize tpl_cpuidB1_offset() { return
> byte_offset_of(CpuidInfo, tpl_cpuidB1_eax); }
>     static ByteSize tpl_cpuidB2_offset() { return
> byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
> @@ -673,8 +689,12 @@
>       if (is_intel() && supports_processor_topology()) {
>         result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>       } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
> -      result = _cpuid_info.std_cpuid1_ebx.bits.threads_per_cpu /
> -               cores_per_cpu();
> +      if (cpu_family() >= 0x17) {
> +        result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
> +      } else {
> +        result = _cpuid_info.std_cpuid1_ebx.bits.threads_per_cpu /
> +                 cores_per_cpu();
> +      }
>       }
>       return (result == 0 ? 1 : result);
>     }
>
> Regards,
> Rohit
>
>>> diff --git a/src/cpu/x86/vm/vm_version_x86.cpp
>>> b/src/cpu/x86/vm/vm_version_x86.cpp
>>> --- a/src/cpu/x86/vm/vm_version_x86.cpp
>>> +++ b/src/cpu/x86/vm/vm_version_x86.cpp
>>> @@ -70,7 +70,7 @@
>>>        bool use_evex = FLAG_IS_DEFAULT(UseAVX) || (UseAVX > 2);
>>>
>>>        Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
>>> -    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7,
>>> done, wrapup;
>>> +    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7,
>>> ext_cpuid8, done, wrapup;
>>>        Label legacy_setup, save_restore_except, legacy_save_restore,
>>> start_simd_check;
>>>
>>>        StubCodeMark mark(this, "VM_Version", "get_cpu_info_stub");
>>> @@ -267,14 +267,30 @@
>>>        __ cmpl(rax, 0x80000000);     // Is cpuid(0x80000001) supported?
>>>        __ jcc(Assembler::belowEqual, done);
>>>        __ cmpl(rax, 0x80000004);     // Is cpuid(0x80000005) supported?
>>> -    __ jccb(Assembler::belowEqual, ext_cpuid1);
>>> +    __ jcc(Assembler::belowEqual, ext_cpuid1);
>>>        __ cmpl(rax, 0x80000006);     // Is cpuid(0x80000007) supported?
>>>        __ jccb(Assembler::belowEqual, ext_cpuid5);
>>>        __ cmpl(rax, 0x80000007);     // Is cpuid(0x80000008) supported?
>>>        __ jccb(Assembler::belowEqual, ext_cpuid7);
>>> +    __ cmpl(rax, 0x80000008);     // Is cpuid(0x80000009 and above)
>>> supported?
>>> +    __ jccb(Assembler::belowEqual, ext_cpuid8);
>>> +    __ cmpl(rax, 0x8000001E);     // Is cpuid(0x8000001E) supported?
>>> +    __ jccb(Assembler::below, ext_cpuid8);
>>> +    //
>>> +    // Extended cpuid(0x8000001E)
>>> +    //
>>> +    __ movl(rax, 0x8000001E);
>>> +    __ cpuid();
>>> +    __ lea(rsi, Address(rbp,
>>> in_bytes(VM_Version::ext_cpuid1E_offset())));
>>> +    __ movl(Address(rsi, 0), rax);
>>> +    __ movl(Address(rsi, 4), rbx);
>>> +    __ movl(Address(rsi, 8), rcx);
>>> +    __ movl(Address(rsi,12), rdx);
>>> +
>>>        //
>>>        // Extended cpuid(0x80000008)
>>>        //
>>> +    __ bind(ext_cpuid8);
>>>        __ movl(rax, 0x80000008);
>>>        __ cpuid();
>>>        __ lea(rsi, Address(rbp,
>>> in_bytes(VM_Version::ext_cpuid8_offset())));
>>> @@ -1109,11 +1125,27 @@
>>>        }
>>>
>>>    #ifdef COMPILER2
>>> -    if (MaxVectorSize > 16) {
>>> -      // Limit vectors size to 16 bytes on current AMD cpus.
>>> +    if (cpu_family() < 0x17 && MaxVectorSize > 16) {
>>> +      // Limit vectors size to 16 bytes on AMD cpus < 17h.
>>>          FLAG_SET_DEFAULT(MaxVectorSize, 16);
>>>        }
>>>    #endif // COMPILER2
>>> +
>>> +    // Some defaults for AMD family 17h
>>> +    if ( cpu_family() == 0x17 ) {
>>> +      // On family 17h processors use XMM and UnalignedLoadStores for
>>> Array Copy
>>> +      if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
>>> +        FLAG_SET_DEFAULT(UseXMMForArrayCopy, true);
>>> +      }
>>> +      if (supports_sse2() && FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
>>> +        FLAG_SET_DEFAULT(UseUnalignedLoadStores, true);
>>> +      }
>>> +#ifdef COMPILER2
>>> +      if (supports_sse4_2() && FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>>> +        FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>> +      }
>>> +#endif
>>> +    }
>>>      }
>>>
>>>      if( is_intel() ) { // Intel cpus specific settings
>>> diff --git a/src/cpu/x86/vm/vm_version_x86.hpp
>>> b/src/cpu/x86/vm/vm_version_x86.hpp
>>> --- a/src/cpu/x86/vm/vm_version_x86.hpp
>>> +++ b/src/cpu/x86/vm/vm_version_x86.hpp
>>> @@ -228,6 +228,15 @@
>>>        } bits;
>>>      };
>>>
>>> +  union ExtCpuid1EEbx {
>>> +    uint32_t value;
>>> +    struct {
>>> +      uint32_t                  : 8,
>>> +               threads_per_core : 8,
>>> +                                : 16;
>>> +    } bits;
>>> +  };
>>> +
>>>      union XemXcr0Eax {
>>>        uint32_t value;
>>>        struct {
>>> @@ -398,6 +407,12 @@
>>>        ExtCpuid8Ecx ext_cpuid8_ecx;
>>>        uint32_t     ext_cpuid8_edx; // reserved
>>>
>>> +    // cpuid function 0x8000001E // AMD 17h
>>> +    uint32_t      ext_cpuid1E_eax;
>>> +    ExtCpuid1EEbx ext_cpuid1E_ebx; // threads per core (AMD17h)
>>> +    uint32_t      ext_cpuid1E_ecx;
>>> +    uint32_t      ext_cpuid1E_edx; // unused currently
>>> +
>>>        // extended control register XCR0 (the XFEATURE_ENABLED_MASK
>>> register)
>>>        XemXcr0Eax   xem_xcr0_eax;
>>>        uint32_t     xem_xcr0_edx; // reserved
>>> @@ -505,6 +520,14 @@
>>>          result |= CPU_CLMUL;
>>>        if (_cpuid_info.sef_cpuid7_ebx.bits.rtm != 0)
>>>          result |= CPU_RTM;
>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>> +       result |= CPU_ADX;
>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>> +      result |= CPU_BMI2;
>>> +    if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>> +      result |= CPU_SHA;
>>> +    if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>> +      result |= CPU_FMA;
>>>
>>>        // AMD features.
>>>        if (is_amd()) {
>>> @@ -518,16 +541,8 @@
>>>        }
>>>        // Intel features.
>>>        if(is_intel()) {
>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>> -         result |= CPU_ADX;
>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>> -        result |= CPU_BMI2;
>>> -      if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>> -        result |= CPU_SHA;
>>>          if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>>>            result |= CPU_LZCNT;
>>> -      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>> -        result |= CPU_FMA;
>>>          // for Intel, ecx.bits.misalignsse bit (bit 8) indicates
>>> support for prefetchw
>>>          if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>>>            result |= CPU_3DNOW_PREFETCH;
>>> @@ -590,6 +605,7 @@
>>>      static ByteSize ext_cpuid5_offset() { return
>>> byte_offset_of(CpuidInfo, ext_cpuid5_eax); }
>>>      static ByteSize ext_cpuid7_offset() { return
>>> byte_offset_of(CpuidInfo, ext_cpuid7_eax); }
>>>      static ByteSize ext_cpuid8_offset() { return
>>> byte_offset_of(CpuidInfo, ext_cpuid8_eax); }
>>> +  static ByteSize ext_cpuid1E_offset() { return
>>> byte_offset_of(CpuidInfo, ext_cpuid1E_eax); }
>>>      static ByteSize tpl_cpuidB0_offset() { return
>>> byte_offset_of(CpuidInfo, tpl_cpuidB0_eax); }
>>>      static ByteSize tpl_cpuidB1_offset() { return
>>> byte_offset_of(CpuidInfo, tpl_cpuidB1_eax); }
>>>      static ByteSize tpl_cpuidB2_offset() { return
>>> byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
>>> @@ -673,8 +689,12 @@
>>>        if (is_intel() && supports_processor_topology()) {
>>>          result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>>>        } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>>> -      result = _cpuid_info.std_cpuid1_ebx.bits.threads_per_cpu /
>>> -               cores_per_cpu();
>>> +      if (cpu_family() >= 0x17) {
>>> +        result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core + 1;
>>> +      } else {
>>> +        result = _cpuid_info.std_cpuid1_ebx.bits.threads_per_cpu /
>>> +                 cores_per_cpu();
>>> +      }
>>>        }
>>>        return (result == 0 ? 1 : result);
>>>      }
>>>
>>> Please let me know your comments.
>>> Thanks for your review.
>>>
>>> Regards,
>>> Rohit
>>>
>>>>
>>>> On 9/11/17 9:52 PM, Rohit Arul Raj wrote:
>>>>>
>>>>> Hello David,
>>>>>
>>>>>>>
>>>>>>> 1. ExtCpuid1EEx
>>>>>>>
>>>>>>> Should this be ExtCpuid1EEbx? (I see the naming here is somewhat
>>>>>>> inconsistent - and potentially confusing: I would have preferred to
>>>>>>> see
>>>>>>> things like ExtCpuid_1E_Ebx, to make it clear.)
>>>>>>
>>>>>>
>>>>>> Yes, I can change it accordingly.
>>>>>>
>>>>> I have attached the updated, re-tested patch as per your comments above.
>>>>>
>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>> b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>> --- a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>> @@ -70,7 +70,7 @@
>>>>>         bool use_evex = FLAG_IS_DEFAULT(UseAVX) || (UseAVX > 2);
>>>>>
>>>>>         Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
>>>>> -    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7,
>>>>> done, wrapup;
>>>>> +    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7,
>>>>> ext_cpuid8, done, wrapup;
>>>>>         Label legacy_setup, save_restore_except, legacy_save_restore,
>>>>> start_simd_check;
>>>>>
>>>>>         StubCodeMark mark(this, "VM_Version", "get_cpu_info_stub");
>>>>> @@ -272,9 +272,23 @@
>>>>>         __ jccb(Assembler::belowEqual, ext_cpuid5);
>>>>>         __ cmpl(rax, 0x80000007);     // Is cpuid(0x80000008) supported?
>>>>>         __ jccb(Assembler::belowEqual, ext_cpuid7);
>>>>> +    __ cmpl(rax, 0x80000008);     // Is cpuid(0x8000001E) supported?
>>>>> +    __ jccb(Assembler::belowEqual, ext_cpuid8);
>>>>> +    //
>>>>> +    // Extended cpuid(0x8000001E)
>>>>> +    //
>>>>> +    __ movl(rax, 0x8000001E);
>>>>> +    __ cpuid();
>>>>> +    __ lea(rsi, Address(rbp,
>>>>> in_bytes(VM_Version::ext_cpuid_1E_offset())));
>>>>> +    __ movl(Address(rsi, 0), rax);
>>>>> +    __ movl(Address(rsi, 4), rbx);
>>>>> +    __ movl(Address(rsi, 8), rcx);
>>>>> +    __ movl(Address(rsi,12), rdx);
>>>>> +
>>>>>         //
>>>>>         // Extended cpuid(0x80000008)
>>>>>         //
>>>>> +    __ bind(ext_cpuid8);
>>>>>         __ movl(rax, 0x80000008);
>>>>>         __ cpuid();
>>>>>         __ lea(rsi, Address(rbp,
>>>>> in_bytes(VM_Version::ext_cpuid8_offset())));
>>>>> @@ -1109,11 +1123,27 @@
>>>>>         }
>>>>>
>>>>>     #ifdef COMPILER2
>>>>> -    if (MaxVectorSize > 16) {
>>>>> -      // Limit vectors size to 16 bytes on current AMD cpus.
>>>>> +    if (cpu_family() < 0x17 && MaxVectorSize > 16) {
>>>>> +      // Limit vectors size to 16 bytes on AMD cpus < 17h.
>>>>>           FLAG_SET_DEFAULT(MaxVectorSize, 16);
>>>>>         }
>>>>>     #endif // COMPILER2
>>>>> +
>>>>> +    // Some defaults for AMD family 17h
>>>>> +    if ( cpu_family() == 0x17 ) {
>>>>> +      // On family 17h processors use XMM and UnalignedLoadStores for
>>>>> Array Copy
>>>>> +      if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
>>>>> +        FLAG_SET_DEFAULT(UseXMMForArrayCopy, true);
>>>>> +      }
>>>>> +      if (supports_sse2() && FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
>>>>> +        FLAG_SET_DEFAULT(UseUnalignedLoadStores, true);
>>>>> +      }
>>>>> +#ifdef COMPILER2
>>>>> +      if (supports_sse4_2() && FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>>>>> +        FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>>>> +      }
>>>>> +#endif
>>>>> +    }
>>>>>       }
>>>>>
>>>>>       if( is_intel() ) { // Intel cpus specific settings
>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>> b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>> --- a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>> @@ -228,6 +228,15 @@
>>>>>         } bits;
>>>>>       };
>>>>>
>>>>> +  union ExtCpuid_1E_Ebx {
>>>>> +    uint32_t value;
>>>>> +    struct {
>>>>> +      uint32_t                  : 8,
>>>>> +               threads_per_core : 8,
>>>>> +                                : 16;
>>>>> +    } bits;
>>>>> +  };
>>>>> +
>>>>>       union XemXcr0Eax {
>>>>>         uint32_t value;
>>>>>         struct {
>>>>> @@ -398,6 +407,12 @@
>>>>>         ExtCpuid8Ecx ext_cpuid8_ecx;
>>>>>         uint32_t     ext_cpuid8_edx; // reserved
>>>>>
>>>>> +    // cpuid function 0x8000001E // AMD 17h
>>>>> +    uint32_t        ext_cpuid_1E_eax;
>>>>> +    ExtCpuid_1E_Ebx ext_cpuid_1E_ebx; // threads per core (AMD17h)
>>>>> +    uint32_t        ext_cpuid_1E_ecx;
>>>>> +    uint32_t        ext_cpuid_1E_edx; // unused currently
>>>>> +
>>>>>         // extended control register XCR0 (the XFEATURE_ENABLED_MASK
>>>>> register)
>>>>>         XemXcr0Eax   xem_xcr0_eax;
>>>>>         uint32_t     xem_xcr0_edx; // reserved
>>>>> @@ -505,6 +520,14 @@
>>>>>           result |= CPU_CLMUL;
>>>>>         if (_cpuid_info.sef_cpuid7_ebx.bits.rtm != 0)
>>>>>           result |= CPU_RTM;
>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>> +       result |= CPU_ADX;
>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>> +      result |= CPU_BMI2;
>>>>> +    if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>> +      result |= CPU_SHA;
>>>>> +    if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>> +      result |= CPU_FMA;
>>>>>
>>>>>         // AMD features.
>>>>>         if (is_amd()) {
>>>>> @@ -518,16 +541,8 @@
>>>>>         }
>>>>>         // Intel features.
>>>>>         if(is_intel()) {
>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>> -         result |= CPU_ADX;
>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>> -        result |= CPU_BMI2;
>>>>> -      if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>> -        result |= CPU_SHA;
>>>>>           if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>>>>>             result |= CPU_LZCNT;
>>>>> -      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>> -        result |= CPU_FMA;
>>>>>           // for Intel, ecx.bits.misalignsse bit (bit 8) indicates
>>>>> support for prefetchw
>>>>>           if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>>>>>             result |= CPU_3DNOW_PREFETCH;
>>>>> @@ -590,6 +605,7 @@
>>>>>       static ByteSize ext_cpuid5_offset() { return
>>>>> byte_offset_of(CpuidInfo, ext_cpuid5_eax); }
>>>>>       static ByteSize ext_cpuid7_offset() { return
>>>>> byte_offset_of(CpuidInfo, ext_cpuid7_eax); }
>>>>>       static ByteSize ext_cpuid8_offset() { return
>>>>> byte_offset_of(CpuidInfo, ext_cpuid8_eax); }
>>>>> +  static ByteSize ext_cpuid_1E_offset() { return
>>>>> byte_offset_of(CpuidInfo, ext_cpuid_1E_eax); }
>>>>>       static ByteSize tpl_cpuidB0_offset() { return
>>>>> byte_offset_of(CpuidInfo, tpl_cpuidB0_eax); }
>>>>>       static ByteSize tpl_cpuidB1_offset() { return
>>>>> byte_offset_of(CpuidInfo, tpl_cpuidB1_eax); }
>>>>>       static ByteSize tpl_cpuidB2_offset() { return
>>>>> byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
>>>>> @@ -673,8 +689,11 @@
>>>>>         if (is_intel() && supports_processor_topology()) {
>>>>>           result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>>>>>         } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>>>>> -      result = _cpuid_info.std_cpuid1_ebx.bits.threads_per_cpu /
>>>>> -               cores_per_cpu();
>>>>> +      if (cpu_family() >= 0x17)
>>>>> +        result = _cpuid_info.ext_cpuid_1E_ebx.bits.threads_per_core +
>>>>> 1;
>>>>> +      else
>>>>> +        result = _cpuid_info.std_cpuid1_ebx.bits.threads_per_cpu /
>>>>> +                 cores_per_cpu();
>>>>>         }
>>>>>         return (result == 0 ? 1 : result);
>>>>>       }
>>>>>
>>>>>
>>>>> Please let me know your comments
>>>>>
>>>>> Thanks for your time.
>>>>>
>>>>> Regards,
>>>>> Rohit
>>>>>
>>>>>
>>>>>>> Thanks,
>>>>>>> David
>>>>>>> -----
>>>>>>>
>>>>>>>
>>>>>>>> Reference:
>>>>>>>>
>>>>>>>>
>>>>>>>> https://support.amd.com/TechDocs/54945_PPR_Family_17h_Models_00h-0Fh.pdf
>>>>>>>> [Pg 82]
>>>>>>>>
>>>>>>>>         CPUID_Fn8000001E_EBX [Core Identifiers] (CoreId)
>>>>>>>>           15:8 ThreadsPerCore: threads per core. Read-only. Reset:
>>>>>>>> XXh.
>>>>>>>> The number of threads per core is ThreadsPerCore+1.
>>>>>>>>
>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>> @@ -70,7 +70,7 @@
>>>>>>>>          bool use_evex = FLAG_IS_DEFAULT(UseAVX) || (UseAVX > 2);
>>>>>>>>
>>>>>>>>          Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
>>>>>>>> -    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7,
>>>>>>>> done, wrapup;
>>>>>>>> +    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7,
>>>>>>>> ext_cpuid8, done, wrapup;
>>>>>>>>          Label legacy_setup, save_restore_except, legacy_save_restore,
>>>>>>>> start_simd_check;
>>>>>>>>
>>>>>>>>          StubCodeMark mark(this, "VM_Version", "get_cpu_info_stub");
>>>>>>>> @@ -272,9 +272,23 @@
>>>>>>>>          __ jccb(Assembler::belowEqual, ext_cpuid5);
>>>>>>>>          __ cmpl(rax, 0x80000007);     // Is cpuid(0x80000008)
>>>>>>>> supported?
>>>>>>>>          __ jccb(Assembler::belowEqual, ext_cpuid7);
>>>>>>>> +    __ cmpl(rax, 0x80000008);     // Is cpuid(0x8000001E) supported?
>>>>>>>> +    __ jccb(Assembler::belowEqual, ext_cpuid8);
>>>>>>>> +    //
>>>>>>>> +    // Extended cpuid(0x8000001E)
>>>>>>>> +    //
>>>>>>>> +    __ movl(rax, 0x8000001E);
>>>>>>>> +    __ cpuid();
>>>>>>>> +    __ lea(rsi, Address(rbp,
>>>>>>>> in_bytes(VM_Version::ext_cpuid1E_offset())));
>>>>>>>> +    __ movl(Address(rsi, 0), rax);
>>>>>>>> +    __ movl(Address(rsi, 4), rbx);
>>>>>>>> +    __ movl(Address(rsi, 8), rcx);
>>>>>>>> +    __ movl(Address(rsi,12), rdx);
>>>>>>>> +
>>>>>>>>          //
>>>>>>>>          // Extended cpuid(0x80000008)
>>>>>>>>          //
>>>>>>>> +    __ bind(ext_cpuid8);
>>>>>>>>          __ movl(rax, 0x80000008);
>>>>>>>>          __ cpuid();
>>>>>>>>          __ lea(rsi, Address(rbp,
>>>>>>>> in_bytes(VM_Version::ext_cpuid8_offset())));
>>>>>>>> @@ -1109,11 +1123,27 @@
>>>>>>>>          }
>>>>>>>>
>>>>>>>>      #ifdef COMPILER2
>>>>>>>> -    if (MaxVectorSize > 16) {
>>>>>>>> -      // Limit vectors size to 16 bytes on current AMD cpus.
>>>>>>>> +    if (cpu_family() < 0x17 && MaxVectorSize > 16) {
>>>>>>>> +      // Limit vectors size to 16 bytes on AMD cpus < 17h.
>>>>>>>>            FLAG_SET_DEFAULT(MaxVectorSize, 16);
>>>>>>>>          }
>>>>>>>>      #endif // COMPILER2
>>>>>>>> +
>>>>>>>> +    // Some defaults for AMD family 17h
>>>>>>>> +    if ( cpu_family() == 0x17 ) {
>>>>>>>> +      // On family 17h processors use XMM and UnalignedLoadStores
>>>>>>>> for
>>>>>>>> Array Copy
>>>>>>>> +      if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
>>>>>>>> +        FLAG_SET_DEFAULT(UseXMMForArrayCopy, true);
>>>>>>>> +      }
>>>>>>>> +      if (supports_sse2() &&
>>>>>>>> FLAG_IS_DEFAULT(UseUnalignedLoadStores))
>>>>>>>> {
>>>>>>>> +        FLAG_SET_DEFAULT(UseUnalignedLoadStores, true);
>>>>>>>> +      }
>>>>>>>> +#ifdef COMPILER2
>>>>>>>> +      if (supports_sse4_2() && FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>>>>>>>> +        FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>>>>>>> +      }
>>>>>>>> +#endif
>>>>>>>> +    }
>>>>>>>>        }
>>>>>>>>
>>>>>>>>        if( is_intel() ) { // Intel cpus specific settings
>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>> @@ -228,6 +228,15 @@
>>>>>>>>          } bits;
>>>>>>>>        };
>>>>>>>>
>>>>>>>> +  union ExtCpuid1EEx {
>>>>>>>> +    uint32_t value;
>>>>>>>> +    struct {
>>>>>>>> +      uint32_t                  : 8,
>>>>>>>> +               threads_per_core : 8,
>>>>>>>> +                                : 16;
>>>>>>>> +    } bits;
>>>>>>>> +  };
>>>>>>>> +
>>>>>>>>        union XemXcr0Eax {
>>>>>>>>          uint32_t value;
>>>>>>>>          struct {
>>>>>>>> @@ -398,6 +407,12 @@
>>>>>>>>          ExtCpuid8Ecx ext_cpuid8_ecx;
>>>>>>>>          uint32_t     ext_cpuid8_edx; // reserved
>>>>>>>>
>>>>>>>> +    // cpuid function 0x8000001E // AMD 17h
>>>>>>>> +    uint32_t     ext_cpuid1E_eax;
>>>>>>>> +    ExtCpuid1EEx ext_cpuid1E_ebx; // threads per core (AMD17h)
>>>>>>>> +    uint32_t     ext_cpuid1E_ecx;
>>>>>>>> +    uint32_t     ext_cpuid1E_edx; // unused currently
>>>>>>>> +
>>>>>>>>          // extended control register XCR0 (the XFEATURE_ENABLED_MASK
>>>>>>>> register)
>>>>>>>>          XemXcr0Eax   xem_xcr0_eax;
>>>>>>>>          uint32_t     xem_xcr0_edx; // reserved
>>>>>>>> @@ -505,6 +520,14 @@
>>>>>>>>            result |= CPU_CLMUL;
>>>>>>>>          if (_cpuid_info.sef_cpuid7_ebx.bits.rtm != 0)
>>>>>>>>            result |= CPU_RTM;
>>>>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>>>>> +       result |= CPU_ADX;
>>>>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>>>>> +      result |= CPU_BMI2;
>>>>>>>> +    if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>>>>> +      result |= CPU_SHA;
>>>>>>>> +    if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>>>>> +      result |= CPU_FMA;
>>>>>>>>
>>>>>>>>          // AMD features.
>>>>>>>>          if (is_amd()) {
>>>>>>>> @@ -518,16 +541,8 @@
>>>>>>>>          }
>>>>>>>>          // Intel features.
>>>>>>>>          if(is_intel()) {
>>>>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>>>>> -         result |= CPU_ADX;
>>>>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>>>>> -        result |= CPU_BMI2;
>>>>>>>> -      if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>>>>> -        result |= CPU_SHA;
>>>>>>>>            if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
>>>>>>>>              result |= CPU_LZCNT;
>>>>>>>> -      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>>>>> -        result |= CPU_FMA;
>>>>>>>>            // for Intel, ecx.bits.misalignsse bit (bit 8) indicates
>>>>>>>> support for prefetchw
>>>>>>>>            if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
>>>>>>>>              result |= CPU_3DNOW_PREFETCH;
>>>>>>>> @@ -590,6 +605,7 @@
>>>>>>>>        static ByteSize ext_cpuid5_offset() { return
>>>>>>>> byte_offset_of(CpuidInfo, ext_cpuid5_eax); }
>>>>>>>>        static ByteSize ext_cpuid7_offset() { return
>>>>>>>> byte_offset_of(CpuidInfo, ext_cpuid7_eax); }
>>>>>>>>        static ByteSize ext_cpuid8_offset() { return
>>>>>>>> byte_offset_of(CpuidInfo, ext_cpuid8_eax); }
>>>>>>>> +  static ByteSize ext_cpuid1E_offset() { return
>>>>>>>> byte_offset_of(CpuidInfo, ext_cpuid1E_eax); }
>>>>>>>>        static ByteSize tpl_cpuidB0_offset() { return
>>>>>>>> byte_offset_of(CpuidInfo, tpl_cpuidB0_eax); }
>>>>>>>>        static ByteSize tpl_cpuidB1_offset() { return
>>>>>>>> byte_offset_of(CpuidInfo, tpl_cpuidB1_eax); }
>>>>>>>>        static ByteSize tpl_cpuidB2_offset() { return
>>>>>>>> byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
>>>>>>>> @@ -673,8 +689,11 @@
>>>>>>>>          if (is_intel() && supports_processor_topology()) {
>>>>>>>>            result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus;
>>>>>>>>          } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) {
>>>>>>>> -      result = _cpuid_info.std_cpuid1_ebx.bits.threads_per_cpu /
>>>>>>>> -               cores_per_cpu();
>>>>>>>> +      if (cpu_family() >= 0x17)
>>>>>>>> +        result = _cpuid_info.ext_cpuid1E_ebx.bits.threads_per_core +
>>>>>>>> 1;
>>>>>>>> +      else
>>>>>>>> +        result = _cpuid_info.std_cpuid1_ebx.bits.threads_per_cpu /
>>>>>>>> +                 cores_per_cpu();
>>>>>>>>          }
>>>>>>>>          return (result == 0 ? 1 : result);
>>>>>>>>        }
>>>>>>>>
>>>>>>>> I have attached the patch for review.
>>>>>>>> Please let me know your comments.
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Rohit
>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> Vladimir
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>
>>>>>>>>>> No comments on AMD specific changes.
>>>>>>>>>>
>>>>>>>>>> Thanks,
>>>>>>>>>> David
>>>>>>>>>> -----
>>>>>>>>>>
>>>>>>>>>> On 5/09/2017 3:43 PM, David Holmes wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On 5/09/2017 3:29 PM, Rohit Arul Raj wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Hello David,
>>>>>>>>>>>>
>>>>>>>>>>>> On Tue, Sep 5, 2017 at 10:31 AM, David Holmes
>>>>>>>>>>>> <david.holmes at oracle.com>
>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Hi Rohit,
>>>>>>>>>>>>>
>>>>>>>>>>>>> I was unable to apply your patch to latest jdk10/hs/hotspot
>>>>>>>>>>>>> repo.
>>>>>>>>>>>>>
>>>>>>>>>>>> I checked out the latest jdk10/hs/hotspot [parent:
>>>>>>>>>>>> 13548:1a9c2e07a826]
>>>>>>>>>>>> and was able to apply the patch
>>>>>>>>>>>> [epyc-amd17h-defaults-3Sept.patch]
>>>>>>>>>>>> without any issues.
>>>>>>>>>>>> Can you share the error message that you are getting?
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I was getting this:
>>>>>>>>>>>
>>>>>>>>>>> applying hotspot.patch
>>>>>>>>>>> patching file src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>> Hunk #1 FAILED at 1108
>>>>>>>>>>> 1 out of 1 hunks FAILED -- saving rejects to file
>>>>>>>>>>> src/cpu/x86/vm/vm_version_x86.cpp.rej
>>>>>>>>>>> patching file src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>> Hunk #2 FAILED at 522
>>>>>>>>>>> 1 out of 2 hunks FAILED -- saving rejects to file
>>>>>>>>>>> src/cpu/x86/vm/vm_version_x86.hpp.rej
>>>>>>>>>>> abort: patch failed to apply
>>>>>>>>>>>
>>>>>>>>>>> but I started again and this time it applied fine, so I'm not sure
>>>>>>>>>>> what was going on there.
>>>>>>>>>>>
>>>>>>>>>>> Cheers,
>>>>>>>>>>> David
>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Rohit
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> On 4/09/2017 2:42 AM, Rohit Arul Raj wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Hello Vladimir,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On Sat, Sep 2, 2017 at 11:25 PM, Vladimir Kozlov
>>>>>>>>>>>>>> <vladimir.kozlov at oracle.com> wrote:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Hi Rohit,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> On 9/2/17 1:16 AM, Rohit Arul Raj wrote:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Hello Vladimir,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Changes look good. Only question I have is about MaxVectorSize.
>>>>>>>>>>>>>>>>> It is set > 16 only in presence of AVX:
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> http://hg.openjdk.java.net/jdk10/hs/hotspot/file/046eab27258f/src/cpu/x86/vm/vm_version_x86.cpp#l945
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Does that code work for AMD 17h too?
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Thanks for pointing that out. Yes, the code works fine for AMD 17h.
>>>>>>>>>>>>>>>> So I have removed the surplus check for MaxVectorSize from my patch.
>>>>>>>>>>>>>>>> I have updated, re-tested and attached the patch.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Which check did you remove?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>> My older patch had the below-mentioned check, which was required on
>>>>>>>>>>>>>> JDK9 where the default MaxVectorSize was 64. It has been handled
>>>>>>>>>>>>>> better in openJDK10, so this check is not required anymore.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> +    // Some defaults for AMD family 17h
>>>>>>>>>>>>>> +    if ( cpu_family() == 0x17 ) {
>>>>>>>>>>>>>> ...
>>>>>>>>>>>>>> ...
>>>>>>>>>>>>>> +      if (MaxVectorSize > 32) {
>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(MaxVectorSize, 32);
>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>> ..
>>>>>>>>>>>>>> ..
>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I have one query regarding the setting of UseSHA flag:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> http://hg.openjdk.java.net/jdk10/hs/hotspot/file/046eab27258f/src/cpu/x86/vm/vm_version_x86.cpp#l821
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> AMD 17h has support for SHA.
>>>>>>>>>>>>>>>> AMD 15h doesn't have support for SHA. Still, the "UseSHA" flag gets
>>>>>>>>>>>>>>>> enabled for it based on the availability of BMI2 and AVX2. Is there an
>>>>>>>>>>>>>>>> underlying reason for this? I have handled this in the patch but just
>>>>>>>>>>>>>>>> wanted to confirm.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> It was done with the following changes, which use only AVX2 and BMI2
>>>>>>>>>>>>>>> instructions to calculate SHA-256:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> http://hg.openjdk.java.net/jdk10/hs/hotspot/rev/6a17c49de974
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I don't know if AMD 15h supports these instructions and can execute
>>>>>>>>>>>>>>> that code. You need to test it.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Ok, got it. Since AMD 15h has support for AVX2 and BMI2 instructions,
>>>>>>>>>>>>>> it should work.
>>>>>>>>>>>>>> Confirmed by running the following sanity tests:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> ./hotspot/test/compiler/intrinsics/sha/sanity/TestSHA1Intrinsics.java
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> ./hotspot/test/compiler/intrinsics/sha/sanity/TestSHA512Intrinsics.java
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> ./hotspot/test/compiler/intrinsics/sha/sanity/TestSHA256Intrinsics.java
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> So I have removed those SHA checks from my patch too.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Please find attached updated, re-tested patch.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>> @@ -1109,11 +1109,27 @@
>>>>>>>>>>>>>>            }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>        #ifdef COMPILER2
>>>>>>>>>>>>>> -    if (MaxVectorSize > 16) {
>>>>>>>>>>>>>> -      // Limit vectors size to 16 bytes on current AMD cpus.
>>>>>>>>>>>>>> +    if (cpu_family() < 0x17 && MaxVectorSize > 16) {
>>>>>>>>>>>>>> +      // Limit vectors size to 16 bytes on AMD cpus < 17h.
>>>>>>>>>>>>>>              FLAG_SET_DEFAULT(MaxVectorSize, 16);
>>>>>>>>>>>>>>            }
>>>>>>>>>>>>>>        #endif // COMPILER2
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +    // Some defaults for AMD family 17h
>>>>>>>>>>>>>> +    if ( cpu_family() == 0x17 ) {
>>>>>>>>>>>>>> +      // On family 17h processors use XMM and
>>>>>>>>>>>>>> UnalignedLoadStores
>>>>>>>>>>>>>> for
>>>>>>>>>>>>>> Array Copy
>>>>>>>>>>>>>> +      if (supports_sse2() &&
>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(UseXMMForArrayCopy, true);
>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>> +      if (supports_sse2() &&
>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseUnalignedLoadStores))
>>>>>>>>>>>>>> {
>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(UseUnalignedLoadStores, true);
>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>> +#ifdef COMPILER2
>>>>>>>>>>>>>> +      if (supports_sse4_2() &&
>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseFPUForSpilling))
>>>>>>>>>>>>>> {
>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>> +#endif
>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>          }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>          if( is_intel() ) { // Intel cpus specific settings
>>>>>>>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>> @@ -505,6 +505,14 @@
>>>>>>>>>>>>>>              result |= CPU_CLMUL;
>>>>>>>>>>>>>>            if (_cpuid_info.sef_cpuid7_ebx.bits.rtm != 0)
>>>>>>>>>>>>>>              result |= CPU_RTM;
>>>>>>>>>>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>>>>>>>>>>> +       result |= CPU_ADX;
>>>>>>>>>>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>>>>>>>>>>> +      result |= CPU_BMI2;
>>>>>>>>>>>>>> +    if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>>>>>>>>>>> +      result |= CPU_SHA;
>>>>>>>>>>>>>> +    if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>>>>>>>>>>> +      result |= CPU_FMA;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>            // AMD features.
>>>>>>>>>>>>>>            if (is_amd()) {
>>>>>>>>>>>>>> @@ -515,19 +523,13 @@
>>>>>>>>>>>>>>                result |= CPU_LZCNT;
>>>>>>>>>>>>>>              if (_cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0)
>>>>>>>>>>>>>>                result |= CPU_SSE4A;
>>>>>>>>>>>>>> +      if(_cpuid_info.std_cpuid1_edx.bits.ht != 0)
>>>>>>>>>>>>>> +        result |= CPU_HT;
>>>>>>>>>>>>>>            }
>>>>>>>>>>>>>>            // Intel features.
>>>>>>>>>>>>>>            if(is_intel()) {
>>>>>>>>>>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>>>>>>>>>>> -         result |= CPU_ADX;
>>>>>>>>>>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>>>>>>>>>>> -        result |= CPU_BMI2;
>>>>>>>>>>>>>> -      if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>>>>>>>>>>> -        result |= CPU_SHA;
>>>>>>>>>>>>>>              if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel !=
>>>>>>>>>>>>>> 0)
>>>>>>>>>>>>>>                result |= CPU_LZCNT;
>>>>>>>>>>>>>> -      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>>>>>>>>>>> -        result |= CPU_FMA;
>>>>>>>>>>>>>>              // for Intel, ecx.bits.misalignsse bit (bit 8)
>>>>>>>>>>>>>> indicates
>>>>>>>>>>>>>> support for prefetchw
>>>>>>>>>>>>>>              if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse !=
>>>>>>>>>>>>>> 0)
>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>                result |= CPU_3DNOW_PREFETCH;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Please let me know your comments.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Thanks for your time.
>>>>>>>>>>>>>> Rohit
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Thanks for taking time to review the code.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>> @@ -1088,6 +1088,22 @@
>>>>>>>>>>>>>>>>               }
>>>>>>>>>>>>>>>>               FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
>>>>>>>>>>>>>>>>             }
>>>>>>>>>>>>>>>> +    if (supports_sha()) {
>>>>>>>>>>>>>>>> +      if (FLAG_IS_DEFAULT(UseSHA)) {
>>>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(UseSHA, true);
>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>> +    } else if (UseSHA || UseSHA1Intrinsics ||
>>>>>>>>>>>>>>>> UseSHA256Intrinsics
>>>>>>>>>>>>>>>> ||
>>>>>>>>>>>>>>>> UseSHA512Intrinsics) {
>>>>>>>>>>>>>>>> +      if (!FLAG_IS_DEFAULT(UseSHA) ||
>>>>>>>>>>>>>>>> +          !FLAG_IS_DEFAULT(UseSHA1Intrinsics) ||
>>>>>>>>>>>>>>>> +          !FLAG_IS_DEFAULT(UseSHA256Intrinsics) ||
>>>>>>>>>>>>>>>> +          !FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
>>>>>>>>>>>>>>>> +        warning("SHA instructions are not available on this
>>>>>>>>>>>>>>>> CPU");
>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA, false);
>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>             // some defaults for AMD family 15h
>>>>>>>>>>>>>>>>             if ( cpu_family() == 0x15 ) {
>>>>>>>>>>>>>>>> @@ -1109,11 +1125,40 @@
>>>>>>>>>>>>>>>>             }
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>         #ifdef COMPILER2
>>>>>>>>>>>>>>>> -    if (MaxVectorSize > 16) {
>>>>>>>>>>>>>>>> -      // Limit vectors size to 16 bytes on current AMD cpus.
>>>>>>>>>>>>>>>> +    if (cpu_family() < 0x17 && MaxVectorSize > 16) {
>>>>>>>>>>>>>>>> +      // Limit vectors size to 16 bytes on AMD cpus < 17h.
>>>>>>>>>>>>>>>>               FLAG_SET_DEFAULT(MaxVectorSize, 16);
>>>>>>>>>>>>>>>>             }
>>>>>>>>>>>>>>>>         #endif // COMPILER2
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +    // Some defaults for AMD family 17h
>>>>>>>>>>>>>>>> +    if ( cpu_family() == 0x17 ) {
>>>>>>>>>>>>>>>> +      // On family 17h processors use XMM and
>>>>>>>>>>>>>>>> UnalignedLoadStores
>>>>>>>>>>>>>>>> for
>>>>>>>>>>>>>>>> Array Copy
>>>>>>>>>>>>>>>> +      if (supports_sse2() &&
>>>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseXMMForArrayCopy))
>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(UseXMMForArrayCopy, true);
>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>> +      if (supports_sse2() &&
>>>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
>>>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(UseUnalignedLoadStores, true);
>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>> +      if (supports_bmi2() &&
>>>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseBMI2Instructions))
>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(UseBMI2Instructions, true);
>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>> +      if (UseSHA) {
>>>>>>>>>>>>>>>> +        if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
>>>>>>>>>>>>>>>> +          FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
>>>>>>>>>>>>>>>> +        } else if (UseSHA512Intrinsics) {
>>>>>>>>>>>>>>>> +          warning("Intrinsics for SHA-384 and SHA-512 crypto
>>>>>>>>>>>>>>>> hash
>>>>>>>>>>>>>>>> functions not available on this CPU.");
>>>>>>>>>>>>>>>> +          FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
>>>>>>>>>>>>>>>> +        }
>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>> +#ifdef COMPILER2
>>>>>>>>>>>>>>>> +      if (supports_sse4_2()) {
>>>>>>>>>>>>>>>> +        if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>>>>>>>>>>>>>>>> +          FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>>>>>>>>>>>>>>> +        }
>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>> +#endif
>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>           }
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>           if( is_intel() ) { // Intel cpus specific settings
>>>>>>>>>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>> @@ -505,6 +505,14 @@
>>>>>>>>>>>>>>>>               result |= CPU_CLMUL;
>>>>>>>>>>>>>>>>             if (_cpuid_info.sef_cpuid7_ebx.bits.rtm != 0)
>>>>>>>>>>>>>>>>               result |= CPU_RTM;
>>>>>>>>>>>>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>>>>>>>>>>>>> +       result |= CPU_ADX;
>>>>>>>>>>>>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>>>>>>>>>>>>> +      result |= CPU_BMI2;
>>>>>>>>>>>>>>>> +    if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>>>>>>>>>>>>> +      result |= CPU_SHA;
>>>>>>>>>>>>>>>> +    if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>>>>>>>>>>>>> +      result |= CPU_FMA;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>             // AMD features.
>>>>>>>>>>>>>>>>             if (is_amd()) {
>>>>>>>>>>>>>>>> @@ -515,19 +523,13 @@
>>>>>>>>>>>>>>>>                 result |= CPU_LZCNT;
>>>>>>>>>>>>>>>>               if (_cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0)
>>>>>>>>>>>>>>>>                 result |= CPU_SSE4A;
>>>>>>>>>>>>>>>> +      if(_cpuid_info.std_cpuid1_edx.bits.ht != 0)
>>>>>>>>>>>>>>>> +        result |= CPU_HT;
>>>>>>>>>>>>>>>>             }
>>>>>>>>>>>>>>>>             // Intel features.
>>>>>>>>>>>>>>>>             if(is_intel()) {
>>>>>>>>>>>>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>>>>>>>>>>>>> -         result |= CPU_ADX;
>>>>>>>>>>>>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>>>>>>>>>>>>> -        result |= CPU_BMI2;
>>>>>>>>>>>>>>>> -      if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>>>>>>>>>>>>> -        result |= CPU_SHA;
>>>>>>>>>>>>>>>>               if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel
>>>>>>>>>>>>>>>> !=
>>>>>>>>>>>>>>>> 0)
>>>>>>>>>>>>>>>>                 result |= CPU_LZCNT;
>>>>>>>>>>>>>>>> -      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>>>>>>>>>>>>> -        result |= CPU_FMA;
>>>>>>>>>>>>>>>>               // for Intel, ecx.bits.misalignsse bit (bit 8)
>>>>>>>>>>>>>>>> indicates
>>>>>>>>>>>>>>>> support for prefetchw
>>>>>>>>>>>>>>>>               if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse
>>>>>>>>>>>>>>>> !=
>>>>>>>>>>>>>>>> 0) {
>>>>>>>>>>>>>>>>                 result |= CPU_3DNOW_PREFETCH;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Rohit
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> On 9/1/17 8:04 AM, Rohit Arul Raj wrote:
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> On Fri, Sep 1, 2017 at 10:27 AM, Rohit Arul Raj
>>>>>>>>>>>>>>>>>> <rohitarulraj at gmail.com>
>>>>>>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> On Fri, Sep 1, 2017 at 3:01 AM, David Holmes
>>>>>>>>>>>>>>>>>>> <david.holmes at oracle.com>
>>>>>>>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Hi Rohit,
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> I think the patch needs updating for jdk10 as I already see a lot
>>>>>>>>>>>>>>>>>>>> of logic around UseSHA in vm_version_x86.cpp.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>>>>>>>> David
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Thanks David, I will update the patch wrt the JDK10 source base,
>>>>>>>>>>>>>>>>>>> test it, and resubmit for review.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Rohit
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Hi All,
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I have updated the patch wrt openjdk10/hotspot (parent:
>>>>>>>>>>>>>>>>>> 13519:71337910df60), did regression testing using jtreg
>>>>>>>>>>>>>>>>>> ($make default) and didn't find any regressions.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Can anyone please volunteer to review this patch, which sets
>>>>>>>>>>>>>>>>>> flag/ISA defaults for the newer AMD 17h (EPYC) processor?
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> ************************* Patch
>>>>>>>>>>>>>>>>>> ****************************
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>>>> @@ -1088,6 +1088,22 @@
>>>>>>>>>>>>>>>>>>                }
>>>>>>>>>>>>>>>>>>                FLAG_SET_DEFAULT(UseSSE42Intrinsics, false);
>>>>>>>>>>>>>>>>>>              }
>>>>>>>>>>>>>>>>>> +    if (supports_sha()) {
>>>>>>>>>>>>>>>>>> +      if (FLAG_IS_DEFAULT(UseSHA)) {
>>>>>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(UseSHA, true);
>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>> +    } else if (UseSHA || UseSHA1Intrinsics ||
>>>>>>>>>>>>>>>>>> UseSHA256Intrinsics
>>>>>>>>>>>>>>>>>> ||
>>>>>>>>>>>>>>>>>> UseSHA512Intrinsics) {
>>>>>>>>>>>>>>>>>> +      if (!FLAG_IS_DEFAULT(UseSHA) ||
>>>>>>>>>>>>>>>>>> +          !FLAG_IS_DEFAULT(UseSHA1Intrinsics) ||
>>>>>>>>>>>>>>>>>> +          !FLAG_IS_DEFAULT(UseSHA256Intrinsics) ||
>>>>>>>>>>>>>>>>>> +          !FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
>>>>>>>>>>>>>>>>>> +        warning("SHA instructions are not available on
>>>>>>>>>>>>>>>>>> this
>>>>>>>>>>>>>>>>>> CPU");
>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA, false);
>>>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
>>>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
>>>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>              // some defaults for AMD family 15h
>>>>>>>>>>>>>>>>>>              if ( cpu_family() == 0x15 ) {
>>>>>>>>>>>>>>>>>> @@ -1109,11 +1125,43 @@
>>>>>>>>>>>>>>>>>>              }
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>          #ifdef COMPILER2
>>>>>>>>>>>>>>>>>> -    if (MaxVectorSize > 16) {
>>>>>>>>>>>>>>>>>> -      // Limit vectors size to 16 bytes on current AMD
>>>>>>>>>>>>>>>>>> cpus.
>>>>>>>>>>>>>>>>>> +    if (cpu_family() < 0x17 && MaxVectorSize > 16) {
>>>>>>>>>>>>>>>>>> +      // Limit vectors size to 16 bytes on AMD cpus < 17h.
>>>>>>>>>>>>>>>>>>                FLAG_SET_DEFAULT(MaxVectorSize, 16);
>>>>>>>>>>>>>>>>>>              }
>>>>>>>>>>>>>>>>>>          #endif // COMPILER2
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +    // Some defaults for AMD family 17h
>>>>>>>>>>>>>>>>>> +    if ( cpu_family() == 0x17 ) {
>>>>>>>>>>>>>>>>>> +      // On family 17h processors use XMM and
>>>>>>>>>>>>>>>>>> UnalignedLoadStores
>>>>>>>>>>>>>>>>>> for
>>>>>>>>>>>>>>>>>> Array Copy
>>>>>>>>>>>>>>>>>> +      if (supports_sse2() &&
>>>>>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseXMMForArrayCopy))
>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>> +        UseXMMForArrayCopy = true;
>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>> +      if (supports_sse2() &&
>>>>>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseUnalignedLoadStores))
>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>> +        UseUnalignedLoadStores = true;
>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>> +      if (supports_bmi2() &&
>>>>>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseBMI2Instructions)) {
>>>>>>>>>>>>>>>>>> +        UseBMI2Instructions = true;
>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>> +      if (MaxVectorSize > 32) {
>>>>>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(MaxVectorSize, 32);
>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>> +      if (UseSHA) {
>>>>>>>>>>>>>>>>>> +        if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
>>>>>>>>>>>>>>>>>> +          FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
>>>>>>>>>>>>>>>>>> +        } else if (UseSHA512Intrinsics) {
>>>>>>>>>>>>>>>>>> +          warning("Intrinsics for SHA-384 and SHA-512
>>>>>>>>>>>>>>>>>> crypto
>>>>>>>>>>>>>>>>>> hash
>>>>>>>>>>>>>>>>>> functions not available on this CPU.");
>>>>>>>>>>>>>>>>>> +          FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
>>>>>>>>>>>>>>>>>> +        }
>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>> +#ifdef COMPILER2
>>>>>>>>>>>>>>>>>> +      if (supports_sse4_2()) {
>>>>>>>>>>>>>>>>>> +        if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>>>>>>>>>>>>>>>>>> +          FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>>>>>>>>>>>>>>>>> +        }
>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>> +#endif
>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>            }
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>            if( is_intel() ) { // Intel cpus specific
>>>>>>>>>>>>>>>>>> settings
>>>>>>>>>>>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>>>> @@ -505,6 +505,14 @@
>>>>>>>>>>>>>>>>>>                result |= CPU_CLMUL;
>>>>>>>>>>>>>>>>>>              if (_cpuid_info.sef_cpuid7_ebx.bits.rtm != 0)
>>>>>>>>>>>>>>>>>>                result |= CPU_RTM;
>>>>>>>>>>>>>>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>>>>>>>>>>>>>>> +       result |= CPU_ADX;
>>>>>>>>>>>>>>>>>> +    if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>>>>>>>>>>>>>>> +      result |= CPU_BMI2;
>>>>>>>>>>>>>>>>>> +    if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>>>>>>>>>>>>>>> +      result |= CPU_SHA;
>>>>>>>>>>>>>>>>>> +    if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>>>>>>>>>>>>>>> +      result |= CPU_FMA;
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>              // AMD features.
>>>>>>>>>>>>>>>>>>              if (is_amd()) {
>>>>>>>>>>>>>>>>>> @@ -515,19 +523,13 @@
>>>>>>>>>>>>>>>>>>                  result |= CPU_LZCNT;
>>>>>>>>>>>>>>>>>>                if (_cpuid_info.ext_cpuid1_ecx.bits.sse4a !=
>>>>>>>>>>>>>>>>>> 0)
>>>>>>>>>>>>>>>>>>                  result |= CPU_SSE4A;
>>>>>>>>>>>>>>>>>> +      if(_cpuid_info.std_cpuid1_edx.bits.ht != 0)
>>>>>>>>>>>>>>>>>> +        result |= CPU_HT;
>>>>>>>>>>>>>>>>>>              }
>>>>>>>>>>>>>>>>>>              // Intel features.
>>>>>>>>>>>>>>>>>>              if(is_intel()) {
>>>>>>>>>>>>>>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>>>>>>>>>>>>>>> -         result |= CPU_ADX;
>>>>>>>>>>>>>>>>>> -      if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>>>>>>>>>>>>>>> -        result |= CPU_BMI2;
>>>>>>>>>>>>>>>>>> -      if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>>>>>>>>>>>>>>> -        result |= CPU_SHA;
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel
>>>>>>>>>>>>>>>>>> != 0)
>>>>>>>>>>>>>>>>>>                  result |= CPU_LZCNT;
>>>>>>>>>>>>>>>>>> -      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>>>>>>>>>>>>>>> -        result |= CPU_FMA;
>>>>>>>>>>>>>>>>>>                // for Intel, ecx.bits.misalignsse bit (bit
>>>>>>>>>>>>>>>>>> 8)
>>>>>>>>>>>>>>>>>> indicates
>>>>>>>>>>>>>>>>>> support for prefetchw
>>>>>>>>>>>>>>>>>>                if
>>>>>>>>>>>>>>>>>> (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse
>>>>>>>>>>>>>>>>>> !=
>>>>>>>>>>>>>>>>>> 0) {
>>>>>>>>>>>>>>>>>>                  result |= CPU_3DNOW_PREFETCH;
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> **************************************************************
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>>>>>> Rohit
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> On 1/09/2017 1:11 AM, Rohit Arul Raj wrote:
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> On Thu, Aug 31, 2017 at 5:59 PM, David Holmes
>>>>>>>>>>>>>>>>>>>>> <david.holmes at oracle.com>
>>>>>>>>>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Hi Rohit,
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> On 31/08/2017 7:03 PM, Rohit Arul Raj wrote:
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> I would like a volunteer to review this patch (openJDK9),
>>>>>>>>>>>>>>>>>>>>>>> which sets flag/ISA defaults for the newer AMD 17h (EPYC)
>>>>>>>>>>>>>>>>>>>>>>> processor, and help us with the commit process.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Webrev:
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> https://www.dropbox.com/sh/08bsxaxupg8kbam/AADurTXLGIZ6C-tiIAi_Glyka?dl=0
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Unfortunately patches cannot be accepted from systems outside
>>>>>>>>>>>>>>>>>>>>>> the OpenJDK infrastructure and ...
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> I have also attached the patch (hg diff -g) for
>>>>>>>>>>>>>>>>>>>>>>> reference.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> ... unfortunately patches tend to get stripped by the mail
>>>>>>>>>>>>>>>>>>>>>> servers. If the patch is small please include it inline.
>>>>>>>>>>>>>>>>>>>>>> Otherwise you will need to find an OpenJDK Author who can host
>>>>>>>>>>>>>>>>>>>>>> it for you on cr.openjdk.java.net.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> 3) I have done regression testing using jtreg ($make default)
>>>>>>>>>>>>>>>>>>>>>>> and didn't find any regressions.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Sounds good, but until I see the patch it is hard to comment
>>>>>>>>>>>>>>>>>>>>>> on testing requirements.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>>>>>>>>>> David
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Thanks David,
>>>>>>>>>>>>>>>>>>>>> Yes, it's a small patch.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.cpp
>>>>>>>>>>>>>>>>>>>>> @@ -1051,6 +1051,22 @@
>>>>>>>>>>>>>>>>>>>>>                 }
>>>>>>>>>>>>>>>>>>>>>                 FLAG_SET_DEFAULT(UseSSE42Intrinsics,
>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>               }
>>>>>>>>>>>>>>>>>>>>> +    if (supports_sha()) {
>>>>>>>>>>>>>>>>>>>>> +      if (FLAG_IS_DEFAULT(UseSHA)) {
>>>>>>>>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(UseSHA, true);
>>>>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>>>>> +    } else if (UseSHA || UseSHA1Intrinsics ||
>>>>>>>>>>>>>>>>>>>>> UseSHA256Intrinsics
>>>>>>>>>>>>>>>>>>>>> ||
>>>>>>>>>>>>>>>>>>>>> UseSHA512Intrinsics) {
>>>>>>>>>>>>>>>>>>>>> +      if (!FLAG_IS_DEFAULT(UseSHA) ||
>>>>>>>>>>>>>>>>>>>>> +          !FLAG_IS_DEFAULT(UseSHA1Intrinsics) ||
>>>>>>>>>>>>>>>>>>>>> +          !FLAG_IS_DEFAULT(UseSHA256Intrinsics) ||
>>>>>>>>>>>>>>>>>>>>> +          !FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
>>>>>>>>>>>>>>>>>>>>> +        warning("SHA instructions are not available on
>>>>>>>>>>>>>>>>>>>>> this
>>>>>>>>>>>>>>>>>>>>> CPU");
>>>>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA, false);
>>>>>>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
>>>>>>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
>>>>>>>>>>>>>>>>>>>>> +      FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>               // some defaults for AMD family 15h
>>>>>>>>>>>>>>>>>>>>>               if ( cpu_family() == 0x15 ) {
>>>>>>>>>>>>>>>>>>>>> @@ -1072,11 +1088,43 @@
>>>>>>>>>>>>>>>>>>>>>               }
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>           #ifdef COMPILER2
>>>>>>>>>>>>>>>>>>>>> -    if (MaxVectorSize > 16) {
>>>>>>>>>>>>>>>>>>>>> -      // Limit vectors size to 16 bytes on current AMD
>>>>>>>>>>>>>>>>>>>>> cpus.
>>>>>>>>>>>>>>>>>>>>> +    if (cpu_family() < 0x17 && MaxVectorSize > 16) {
>>>>>>>>>>>>>>>>>>>>> +      // Limit vectors size to 16 bytes on AMD cpus <
>>>>>>>>>>>>>>>>>>>>> 17h.
>>>>>>>>>>>>>>>>>>>>>                 FLAG_SET_DEFAULT(MaxVectorSize, 16);
>>>>>>>>>>>>>>>>>>>>>               }
>>>>>>>>>>>>>>>>>>>>>           #endif // COMPILER2
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +    // Some defaults for AMD family 17h
>>>>>>>>>>>>>>>>>>>>> +    if ( cpu_family() == 0x17 ) {
>>>>>>>>>>>>>>>>>>>>> +      // On family 17h processors use XMM and
>>>>>>>>>>>>>>>>>>>>> UnalignedLoadStores
>>>>>>>>>>>>>>>>>>>>> for
>>>>>>>>>>>>>>>>>>>>> Array Copy
>>>>>>>>>>>>>>>>>>>>> +      if (supports_sse2() &&
>>>>>>>>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseXMMForArrayCopy))
>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>> +        UseXMMForArrayCopy = true;
>>>>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>>>>> +      if (supports_sse2() &&
>>>>>>>>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseUnalignedLoadStores))
>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>> +        UseUnalignedLoadStores = true;
>>>>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>>>>> +      if (supports_bmi2() &&
>>>>>>>>>>>>>>>>>>>>> FLAG_IS_DEFAULT(UseBMI2Instructions))
>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>> +        UseBMI2Instructions = true;
>>>>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>>>>> +      if (MaxVectorSize > 32) {
>>>>>>>>>>>>>>>>>>>>> +        FLAG_SET_DEFAULT(MaxVectorSize, 32);
>>>>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>>>>> +      if (UseSHA) {
>>>>>>>>>>>>>>>>>>>>> +        if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
>>>>>>>>>>>>>>>>>>>>> +          FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
>>>>>>>>>>>>>>>>>>>>> +        } else if (UseSHA512Intrinsics) {
>>>>>>>>>>>>>>>>>>>>> +          warning("Intrinsics for SHA-384 and SHA-512
>>>>>>>>>>>>>>>>>>>>> crypto
>>>>>>>>>>>>>>>>>>>>> hash
>>>>>>>>>>>>>>>>>>>>> functions not available on this CPU.");
>>>>>>>>>>>>>>>>>>>>> +          FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
>>>>>>>>>>>>>>>>>>>>> +        }
>>>>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>>>>> +#ifdef COMPILER2
>>>>>>>>>>>>>>>>>>>>> +      if (supports_sse4_2()) {
>>>>>>>>>>>>>>>>>>>>> +        if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
>>>>>>>>>>>>>>>>>>>>> +          FLAG_SET_DEFAULT(UseFPUForSpilling, true);
>>>>>>>>>>>>>>>>>>>>> +        }
>>>>>>>>>>>>>>>>>>>>> +      }
>>>>>>>>>>>>>>>>>>>>> +#endif
>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>             }
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>             if( is_intel() ) { // Intel cpus specific
>>>>>>>>>>>>>>>>>>>>> settings
>>>>>>>>>>>>>>>>>>>>> diff --git a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>>>>>>> b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>>>>>>> --- a/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>>>>>>> +++ b/src/cpu/x86/vm/vm_version_x86.hpp
>>>>>>>>>>>>>>>>>>>>> @@ -513,6 +513,16 @@
>>>>>>>>>>>>>>>>>>>>>                   result |= CPU_LZCNT;
>>>>>>>>>>>>>>>>>>>>>                 if (_cpuid_info.ext_cpuid1_ecx.bits.sse4a
>>>>>>>>>>>>>>>>>>>>> !=
>>>>>>>>>>>>>>>>>>>>> 0)
>>>>>>>>>>>>>>>>>>>>>                   result |= CPU_SSE4A;
>>>>>>>>>>>>>>>>>>>>> +      if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
>>>>>>>>>>>>>>>>>>>>> +        result |= CPU_BMI2;
>>>>>>>>>>>>>>>>>>>>> +      if(_cpuid_info.std_cpuid1_edx.bits.ht != 0)
>>>>>>>>>>>>>>>>>>>>> +        result |= CPU_HT;
>>>>>>>>>>>>>>>>>>>>> +      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
>>>>>>>>>>>>>>>>>>>>> +        result |= CPU_ADX;
>>>>>>>>>>>>>>>>>>>>> +      if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
>>>>>>>>>>>>>>>>>>>>> +        result |= CPU_SHA;
>>>>>>>>>>>>>>>>>>>>> +      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
>>>>>>>>>>>>>>>>>>>>> +        result |= CPU_FMA;
>>>>>>>>>>>>>>>>>>>>>               }
>>>>>>>>>>>>>>>>>>>>>               // Intel features.
>>>>>>>>>>>>>>>>>>>>>               if(is_intel()) {
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Rohit
>>>>>>>>>>>>>>>>>>>>>