[aarch64-port-dev ] RFR: AARCH64: GHash implementation for aarch64
Alexeev, Alexander
Alexander.Alexeev at caviumnetworks.com
Tue Jul 21 12:13:34 UTC 2015
Hello
The GHASH calculation has been updated slightly
http://cr.openjdk.java.net/~aalexeev/ghash/webrev.00/
Changes to previous version:
1. multiplication procedure is simplified.
- 3 pmull instructions are used instead of 4.
2. The ins instructions are replaced with ext for vector element swapping.
-XX:-UseGHASHIntrinsics
Benchmark Mode Cnt Score Error Units
GHash.calculateGHash avgt 5 118.688 ± 0.009 us/op
-XX:+UseGHASHIntrinsics
Benchmark Mode Cnt Score Error Units
GHash.calculateGHash avgt 5 21.164 ± 1.763 us/op
----
Java -XX:-UseGHASHIntrinsics -DcheckOutput=true -Dmode=GCM TestAESMain
TestAESEncode runtime was 67567.23319 ms
TestAESDecode runtime was 37724.89769 ms
---
Java -XX:+UseGHASHIntrinsics -DcheckOutput=true -Dmode=GCM TestAESMain
TestAESEncode runtime was 45002.91369 ms (previous version 47184.98697 ms)
TestAESDecode runtime was 17226.41117 ms (previous version 18110.62064 ms)
Tests passed:
JDK, com/sun/crypto/provider/Cipher/AES/TestGHASH.java
Hotspot, compiler/codegen/7184394/TestAESMain.java
Regards,
Alexander
> -----Original Message-----
> From: aarch64-port-dev [mailto:aarch64-port-dev-
> bounces at openjdk.java.net] On Behalf Of Alexeev, Alexander
> Sent: Tuesday, July 14, 2015 11:51 AM
> To: aarch64-port-dev at openjdk.java.net
> Subject: [aarch64-port-dev ] RFR: AARCH64: GHash implementation for
> aarch64
>
> Seems previous message was misformatted.
>
> Hello
>
> I would like to propose the patch (below in the body) with ghash intrinsic for
> aarch64.
>
> On ThunderX speedup for 4k data block ghash calculation ~6 times -XX:-
> UseGHASHIntrinsics
> Benchmark Mode Cnt Score Error Units
> GHash.calculateGHash avgt 5 119.804 ± 1.161 us/op
>
> -XX:+UseGHASHIntrinsics
> Benchmark Mode Cnt Score Error Units
> GHash.calculateGHash avgt 5 21.925 ± 1.505 us/op
>
> ----
> Java -XX:-UseGHASHIntrinsics -DcheckOutput=true -Dmode=GCM
> TestAESMain TestAESEncode runtime was 67567.23319 ms TestAESDecode
> runtime was 37724.89769 ms
> ---
> Java -XX:+UseGHASHIntrinsics -DcheckOutput=true -Dmode=GCM
> TestAESMain TestAESEncode runtime was 47184.98697 ms TestAESDecode
> runtime was 18110.62064 ms
>
>
> Tested with jtreg suites
> Before
> Hotspot
> Test results: passed: 852; failed: 16; error: 7 JDK Test results: passed: 5,655;
> failed: 1,031; error: 17
>
> After
> Hotspot
> Test results: passed: 852; failed: 16; error: 7 JDK Test results: passed: 5,655;
> failed: 1,031; error: 17
>
> Note: direct tests passed
> JDK, com/sun/crypto/provider/Cipher/AES/TestGHASH.java
> Hotspot, compiler/codegen/7184394/TestAESMain.java
>
>
> --- CUT HERE ---
> diff -r 69fad5109885 src/cpu/aarch64/vm/assembler_aarch64.hpp
> --- a/src/cpu/aarch64/vm/assembler_aarch64.hpp Thu Jun 25 13:41:29
> 2015 +0000
> +++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp Mon Jul 13 08:34:20
> 2015 +0000
> @@ -1894,7 +1894,7 @@
> public:
>
> enum SIMD_Arrangement {
> - T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D
> + T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D, T1Q
> };
>
> enum SIMD_RegVariant {
> @@ -2223,12 +2223,13 @@
> f(0b001111, 15, 10), rf(Vn, 5), rf(Xd, 0);
> }
>
> - // We do not handle the 1Q arrangement.
> void pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
> FloatRegister Vm, SIMD_Arrangement Tb) {
> starti;
> - assert(Ta == T8H && (Tb == T8B || Tb == T16B), "Invalid Size specifier");
> - f(0, 31), f(Tb & 1, 30), f(0b001110001, 29, 21), rf(Vm, 16), f(0b111000, 15,
> 10);
> - rf(Vn, 5), rf(Vd, 0);
> + assert((Ta == T1Q && (Tb == T1D || Tb == T2D)) ||
> + (Ta == T8H && (Tb == T8B || Tb == T16B)), "Invalid Size specifier");
> + int size = (Ta == T1Q) ? 0b11 : 0b00;
> + f(0, 31), f(Tb & 1, 30), f(0b001110, 29, 24), f(size, 23, 22);
> + f(1, 21), rf(Vm, 16), f(0b111000, 15, 10), rf(Vn, 5), rf(Vd, 0);
> }
> void pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
> FloatRegister Vm, SIMD_Arrangement Tb) {
> pmull(Vd, Ta, Vn, Vm, Tb);
> @@ -2243,15 +2244,6 @@
> f(0b100001010010, 21, 10), rf(Vn, 5), rf(Vd, 0);
> }
>
> - void rev32(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn)
> - {
> - starti;
> - assert(T <= T8H, "must be one of T8B, T16B, T4H, T8H");
> - f(0, 31), f((int)T & 1, 30), f(0b101110, 29, 24);
> - f(T <= T16B ? 0b00 : 0b01, 23, 22), f(0b100000000010, 21, 10);
> - rf(Vn, 5), rf(Vd, 0);
> - }
> -
> void dup(FloatRegister Vd, SIMD_Arrangement T, Register Xs)
> {
> starti;
> @@ -2284,6 +2276,47 @@
>
> #undef INSN
>
> + // Table vector lookup
> +#define INSN(NAME, op) \
> + void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn,
> unsigned registers, FloatRegister Vm) { \
> + starti; \
> + assert(T == T8B || T == T16B, "invalid arrangement");
> \
> + assert(0 < registers && registers <= 4, "invalid number of registers");
> \
> + f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21), rf(Vm, 16), f(0, 15);
> \
> + f(registers - 1, 14, 13), f(op, 12),f(0b00, 11, 10), rf(Vn, 5), rf(Vd, 0);
> \
> + }
> +
> + INSN(tbl, 0);
> + INSN(tbx, 1);
> +
> +#undef INSN
> +
> +#define INSN(NAME, U, opcode) \
> + void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {
> \
> + starti; \
> + assert((ASSERTION), MSG); \
> + f(0, 31), f((int)T & 1, 30), f(U, 29), f(0b01110, 28, 24); \
> + f((int)(T >> 1), 23, 22), f(0b10000, 21, 17), f(opcode, 16, 12); \
> + f(0b10, 11, 10), rf(Vn, 5), rf(Vd, 0); \
> + }
> +
> +#define MSG "invalid arrangement"
> +
> +#define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H || T
> +== T2S || T == T4S)
> + INSN(rev64, 0, 0b00000);
> +#undef ASSERTION
> +
> +#define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H)
> + INSN(rev32, 1, 0b00000);
> +#undef ASSERTION
> +
> +#define ASSERTION (T == T8B || T == T16B)
> + INSN(rev16, 0, 0b00001);
> +#undef ASSERTION
> +
> +#undef MSG
> +
> +#undef INSN
>
> /* Simulator extensions to the ISA
>
> diff -r 69fad5109885 src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
> --- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Thu Jun 25
> 13:41:29 2015 +0000
> +++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Mon Jul 13
> 08:34:20 2015 +0000
> @@ -2396,6 +2396,146 @@
> return start;
> }
>
> + /**
> + * Arguments:
> + *
> + * Input:
> + * c_rarg0 - current state address
> + * c_rarg1 - H key address
> + * c_rarg2 - data address
> + * c_rarg3 - number of blocks
> + *
> + * Output:
> + * Updated state at c_rarg0
> + */
> + address generate_ghash_processBlocks() {
> + __ align(CodeEntryAlignment);
> + Label L_ghash_loop, L_exit;
> +
> + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
> + address start = __ pc();
> +
> + Register state = c_rarg0;
> + Register subkeyH = c_rarg1;
> + Register data = c_rarg2;
> + Register blocks = c_rarg3;
> +
> + FloatRegister vzr = v30;
> + __ eor(vzr, __ T16B, vzr, vzr); // zero register
> +
> + __ mov(v26, __ T16B, 1);
> + __ mov(v27, __ T16B, 63);
> + __ mov(v28, __ T16B, 62);
> + __ mov(v29, __ T16B, 57);
> +
> + __ ldrq(v6, Address(state));
> + __ ldrq(v16, Address(subkeyH));
> +
> + __ ins(v0, __ D, v6, 0, 1);
> + __ ins(v0, __ D, v6, 1, 0);
> +
> + __ ins(v1, __ D, v16, 0, 1);
> +
> + __ bind(L_ghash_loop);
> +
> + __ ldrq(v2, Address(__ post(data, 0x10)));
> + __ rev64(v2, __ T16B, v2); // swap data
> +
> + __ ins(v6, __ D, v0, 0, 1);
> + __ eor(v6, __ T16B, v6, v2);
> + __ ins(v2, __ D, v2, 0, 1);
> + __ eor(v2, __ T16B, v2, v0);
> +
> + __ pmull(v7, __ T1Q, v2, v1, __ T1D); // A0*B0
> + __ pmull(v18, __ T1Q, v2, v16, __ T1D); // A0*B1
> + __ pmull(v19, __ T1Q, v6, v1, __ T1D); // A1*B0
> + __ pmull(v20, __ T1Q, v6, v16, __ T1D); // A1*B1
> +
> + __ eor(v18, __ T16B, v18, v19); // A0*B1 xor A1*B0
> + __ orr(v19, __ T16B, v18, v18);
> +
> + __ ins(v18, __ D, v18, 1, 0); // << 64
> + __ ins(v18, __ D, vzr, 0, 0);
> +
> + __ ins(v19, __ D, v19, 0, 1); // >> 64
> + __ ins(v19, __ D, vzr, 1, 0);
> +
> + // Registers pair <v6:v5> holds the result of carry-less multiplication of
> v2:v6*v1:v16
> + __ eor(v5, __ T16B, v7, v18);
> + __ eor(v6, __ T16B, v20, v19);
> +
> + // Result of the multiplication is shifted by one bit position
> + // [X3:X2:X1:X0] = [X3:X2:X1:X0] << 1
> + __ ushr(v18, __ T2D, v5, -63 & 63);
> + __ ins(v25, __ D, v18, 1, 0);
> + __ ins(v25, __ D, vzr, 0, 0);
> + __ ushl(v5, __ T2D, v5, v26);
> + __ orr(v5, __ T16B, v5, v25);
> +
> + __ ushr(v19, __ T2D, v6, -63 & 63);
> + __ ins(v19, __ D, v19, 1, 0);
> + __ ins(v19, __ D, v18, 0, 1);
> + __ ushl(v6, __ T2D, v6, v26);
> + __ orr(v6, __ T16B, v6, v19);
> +
> + __ ins(v24, __ D, v5, 0, 1);
> +
> + // A = X0 << 63
> + __ ushl(v21, __ T2D, v5, v27);
> +
> + // A = X0 << 62
> + __ ushl(v22, __ T2D, v5, v28);
> +
> + // A = X0 << 57
> + __ ushl(v23, __ T2D, v5, v29);
> +
> + // D = X1^A^B^C
> + __ eor(v21, __ T16B, v21, v22);
> + __ eor(v21, __ T16B, v21, v23);
> + __ eor(v21, __ T16B, v21, v24);
> + __ ins(v21, __ D, v21, 1, 0);
> + __ ins(v21, __ D, v5, 0, 0);
> +
> + // [E1:E0] = [D:X0] >> 1
> + __ ushr(v20, __ T2D, v21, -1 & 63);
> + __ ushl(v18, __ T2D, v21, v27);
> + __ ins(v25, __ D, v18, 0, 1);
> + __ ins(v25, __ D, vzr, 1, 0);
> + __ orr(v19, __ T16B, v20, v25);
> +
> + __ eor(v7, __ T16B, v21, v19);
> +
> + // [F1:F0] = [D:X0] >> 2
> + __ ushr(v20, __ T2D, v21, -2 & 63);
> + __ ushl(v18, __ T2D, v21, v28);
> + __ ins(v25, __ D, v18, 0, 1);
> + __ orr(v19, __ T16B, v20, v25);
> +
> + __ eor(v7, __ T16B, v7, v19);
> +
> + // [G1:G0] = [D:X0] >> 7
> + __ ushr(v20, __ T2D, v21, -7 & 63);
> + __ ushl(v18, __ T2D, v21, v29);
> + __ ins(v25, __ D, v18, 0, 1);
> + __ orr(v19, __ T16B, v20, v25);
> +
> + // [H1:H0] = [D^E1^F1^G1:X0^E0^F0^G0]
> + __ eor(v7, __ T16B, v7, v19);
> +
> + // Result = [H1:H0]^[X3:X2]
> + __ eor(v0, __ T16B, v7, v6);
> +
> + __ subs(blocks, blocks, 1);
> + __ cbnz(blocks, L_ghash_loop);
> +
> + __ ins(v1, __ D, v0, 1, 0);
> + __ ins(v1, __ D, v0, 0, 1);
> + __ st1(v1, __ T16B, state);
> + __ ret(lr);
> +
> + return start;
> + }
> +
> // Continuation point for throwing of implicit exceptions that are
> // not handled in the current activation. Fabricates an exception
> // oop and initiates normal exception dispatching in this @@ -2563,6
> +2703,11 @@
> }
>
> #ifndef BUILTIN_SIM
> + // generate GHASH intrinsics code
> + if (UseGHASHIntrinsics) {
> + StubRoutines::_ghash_processBlocks =
> generate_ghash_processBlocks();
> + }
> +
> if (UseAESIntrinsics) {
> StubRoutines::_aescrypt_encryptBlock =
> generate_aescrypt_encryptBlock();
> StubRoutines::_aescrypt_decryptBlock =
> generate_aescrypt_decryptBlock(); diff -r 69fad5109885
> src/cpu/aarch64/vm/vm_version_aarch64.cpp
> --- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp Thu Jun 25 13:41:29
> 2015 +0000
> +++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp Mon Jul 13 08:34:20
> +++ 2015 +0000
> @@ -45,6 +45,10 @@
> #define HWCAP_AES (1<<3)
> #endif
>
> +#ifndef HWCAP_PMULL
> +#define HWCAP_PMULL (1<<4)
> +#endif
> +
> #ifndef HWCAP_SHA1
> #define HWCAP_SHA1 (1<<5)
> #endif
> @@ -190,11 +194,6 @@
> }
> }
>
> - if (UseGHASHIntrinsics) {
> - warning("GHASH intrinsics are not available on this CPU");
> - FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
> - }
> -
> if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
> UseCRC32Intrinsics = true;
> }
> @@ -244,6 +243,15 @@
> }
> }
>
> + if (auxv & HWCAP_PMULL) {
> + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
> + FLAG_SET_DEFAULT(UseGHASHIntrinsics, true);
> + }
> + } else if (UseGHASHIntrinsics) {
> + warning("GHASH intrinsics are not available on this CPU");
> + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
> + }
> +
> // This machine allows unaligned memory accesses
> if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) {
> FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
> --- CUT HERE ---
More information about the aarch64-port-dev
mailing list