/hg/icedtea8-forest/hotspot: 4 new changesets

Wed Nov 7 06:14:28 UTC 2018

changeset b3d6f0af9a4d in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=b3d6f0af9a4d
author: andrew
date: Tue Nov 06 22:49:55 2018 +0000

	Added tag icedtea-3.9.0 for changeset d78088224b98


changeset 678bb67e02ae in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=678bb67e02ae
author: andrew
date: Tue Nov 06 22:53:47 2018 +0000

	Added tag icedtea-3.10.0pre00 for changeset d78088224b98


changeset 567e95df42f8 in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=567e95df42f8
author: gromero
date: Mon Sep 24 17:18:38 2018 -0400

	8131048, PR3574, RH1498936: ppc implement CRC32 intrinsic
	Reviewed-by: goetz


changeset de4f1f9fbcc5 in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=de4f1f9fbcc5
author: mdoerr
date: Thu Sep 22 12:17:24 2016 +0200

	8164920, PR3574, RH1498936: ppc: enhancement of CRC32 intrinsic
	Reviewed-by: goetz, mdoerr
	Contributed-by: Hiroshi H Horii <horii at jp.ibm.com>


diffstat:

 .hgtags                                     |    2 +
 src/cpu/ppc/vm/assembler_ppc.hpp            |    8 +
 src/cpu/ppc/vm/assembler_ppc.inline.hpp     |    4 +
 src/cpu/ppc/vm/interpreterGenerator_ppc.hpp |    2 +
 src/cpu/ppc/vm/macroAssembler_ppc.cpp       |  972 ++++++++++++++++++++++++++++
 src/cpu/ppc/vm/macroAssembler_ppc.hpp       |   28 +
 src/cpu/ppc/vm/stubGenerator_ppc.cpp        |   86 ++
 src/cpu/ppc/vm/stubRoutines_ppc_64.cpp      |  763 +++++++++++++++++++++-
 src/cpu/ppc/vm/stubRoutines_ppc_64.hpp      |   38 +-
 src/cpu/ppc/vm/templateInterpreter_ppc.cpp  |  191 +++++-
 src/cpu/ppc/vm/vm_version_ppc.cpp           |   17 +-
 src/cpu/ppc/vm/vm_version_ppc.hpp           |    3 +
 src/share/vm/opto/library_call.cpp          |   28 +-
 src/share/vm/opto/runtime.cpp               |   19 +-
 14 files changed, 2132 insertions(+), 29 deletions(-)

diffs (truncated from 2433 to 500 lines):

diff -r d78088224b98 -r de4f1f9fbcc5 .hgtags

--- a/.hgtags	Tue Jul 17 15:03:25 2018 +0100
+++ b/.hgtags	Thu Sep 22 12:17:24 2016 +0200
@@ -1228,3 +1228,5 @@
 e4f39d283b55faf6074308797615298bd1a45a66 jdk8u181-b11
 464ed8cea5d6cdbfacc9be7035297af88f57f708 jdk8u181-b12
 9062a259cecfe8e1f3386e2982eb77bd117c81e1 jdk8u181-b31
+d78088224b9836edf36034d076e7eee89a2a9b83 icedtea-3.9.0
+d78088224b9836edf36034d076e7eee89a2a9b83 icedtea-3.10.0pre00
diff -r d78088224b98 -r de4f1f9fbcc5 src/cpu/ppc/vm/assembler_ppc.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.hpp	Tue Jul 17 15:03:25 2018 +0100
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Thu Sep 22 12:17:24 2016 +0200
@@ -468,6 +468,10 @@
     LVSL_OPCODE    = (31u << OPCODE_SHIFT |    6u << 1),
     LVSR_OPCODE    = (31u << OPCODE_SHIFT |   38u << 1),
 
+    // Vector-Scalar (VSX) instruction support.
+    MTVSRD_OPCODE  = (31u << OPCODE_SHIFT |  179u << 1),
+    MFVSRD_OPCODE  = (31u << OPCODE_SHIFT |   51u << 1),
+
     // Vector Permute and Formatting
     VPKPX_OPCODE   = (4u  << OPCODE_SHIFT |  782u     ),
     VPKSHSS_OPCODE = (4u  << OPCODE_SHIFT |  398u     ),
@@ -1938,6 +1942,10 @@
   inline void mtvscr(   VectorRegister b);
   inline void mfvscr(   VectorRegister d);
 
+  // Vector-Scalar (VSX) instructions.
+  inline void mtvrd(    VectorRegister  d, Register a);
+  inline void mfvrd(    Register        a, VectorRegister d);
+
   // AES (introduced with Power 8)
   inline void vcipher(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);
diff -r d78088224b98 -r de4f1f9fbcc5 src/cpu/ppc/vm/assembler_ppc.inline.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Tue Jul 17 15:03:25 2018 +0100
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Thu Sep 22 12:17:24 2016 +0200
@@ -623,6 +623,10 @@
 inline void Assembler::lvsl(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
 inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
 
+// Vector-Scalar (VSX) instructions.
+inline void Assembler::mtvrd(  VectorRegister  d, Register a)               { emit_int32( MTVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
+inline void Assembler::mfvrd(  Register        a, VectorRegister d)         { emit_int32( MFVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
+
 inline void Assembler::vpkpx(   VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE   | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vpkswss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSWSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
diff -r d78088224b98 -r de4f1f9fbcc5 src/cpu/ppc/vm/interpreterGenerator_ppc.hpp
--- a/src/cpu/ppc/vm/interpreterGenerator_ppc.hpp	Tue Jul 17 15:03:25 2018 +0100
+++ b/src/cpu/ppc/vm/interpreterGenerator_ppc.hpp	Thu Sep 22 12:17:24 2016 +0200
@@ -33,5 +33,7 @@
   address generate_abstract_entry(void);
   address generate_accessor_entry(void);
   address generate_Reference_get_entry(void);
+  address generate_CRC32_update_entry();
+  address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
 
 #endif // CPU_PPC_VM_INTERPRETERGENERATOR_PPC_HPP
diff -r d78088224b98 -r de4f1f9fbcc5 src/cpu/ppc/vm/macroAssembler_ppc.cpp
--- a/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Tue Jul 17 15:03:25 2018 +0100
+++ b/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Thu Sep 22 12:17:24 2016 +0200
@@ -49,6 +49,7 @@
 #else
 #define BLOCK_COMMENT(str) block_comment(str)
 #endif
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
 
 #ifdef ASSERT
 // On RISC, there's no benefit to verifying instruction boundaries.
@@ -3022,6 +3023,977 @@
   bind(Ldone_false);
 }
 
+// Helpers for Intrinsic Emitters
+//
+// Reverse the byte order of a 32-bit value in a register
+//   src: 0x44556677
+//   dst: 0x77665544
+// Three steps to obtain the result:
+//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
+//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
+//     This value initializes dst.
+//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
+//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
+//     This value is mask inserted into dst with a [0..23] mask of 1s.
+//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
+//     This value is mask inserted into dst with a [8..15] mask of 1s.
+void MacroAssembler::load_reverse_32(Register dst, Register src) {
+  assert_different_registers(dst, src);
+
+  rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
+  rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
+  rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
+}
+
+// Calculate the column addresses of the crc32 lookup table into distinct registers.
+// This loop-invariant calculation is moved out of the loop body, reducing the loop
+// body size from 20 to 16 instructions.
+// Returns the offset that was used to calculate the address of column tc3.
+// Due to register shortage, setting tc3 may overwrite table. With the return offset
+// at hand, the original table address can be easily reconstructed.
+int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
+
+#ifdef VM_LITTLE_ENDIAN
+  // This is what we implement (the DOLIT4 part):
+  // ========================================================================= */
+  // #define DOLIT4 c ^= *buf4++; \
+  //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+  //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+  // ========================================================================= */
+  const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
+  const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
+  const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
+  const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
+#else
+  // This is what we implement (the DOBIG4 part):
+  // =========================================================================
+  // #define DOBIG4 c ^= *++buf4; \
+  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+  // =========================================================================
+  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
+  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
+  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
+  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
+#endif
+  assert_different_registers(table, tc0, tc1, tc2);
+  assert(table == tc3, "must be!");
+
+  if (ix0 != 0) addi(tc0, table, ix0);
+  if (ix1 != 0) addi(tc1, table, ix1);
+  if (ix2 != 0) addi(tc2, table, ix2);
+  if (ix3 != 0) addi(tc3, table, ix3);
+
+  return ix3;
+}
+
+/**
+ * uint32_t crc;
+ * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
+ */
+void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
+  assert_different_registers(crc, table, tmp);
+  assert_different_registers(val, table);
+
+  if (crc == val) {                   // Must rotate first to use the unmodified value.
+    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+                                      // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
+    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
+  } else {
+    srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
+    rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+  }
+  lwzx(tmp, table, tmp);
+  xorr(crc, crc, tmp);
+}
+
+/**
+ * uint32_t crc;
+ * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
+ */
+void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
+  fold_byte_crc32(crc, crc, table, tmp);
+}
+
+/**
+ * Emits code to update CRC-32 with a byte value according to constants in table.
+ *
+ * @param [in,out]crc   Register containing the crc.
+ * @param [in]val       Register containing the byte to fold into the CRC.
+ * @param [in]table     Register containing the table of crc constants.
+ *
+ * uint32_t crc;
+ * val = crc_table[(val ^ crc) & 0xFF];
+ * crc = val ^ (crc >> 8);
+ */
+void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
+  BLOCK_COMMENT("update_byte_crc32:");
+  xorr(val, val, crc);
+  fold_byte_crc32(crc, val, table, val);
+}
+
+/**
+ * @param crc   register containing existing CRC (32-bit)
+ * @param buf   register pointing to input byte buffer (byte*)
+ * @param len   register containing number of bytes
+ * @param table register pointing to CRC table
+ */
+void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
+                                           Register data, bool loopAlignment, bool invertCRC) {
+  assert_different_registers(crc, buf, len, table, data);
+
+  Label L_mainLoop, L_done;
+  const int mainLoop_stepping  = 1;
+  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
+
+  // Process all bytes in a single-byte loop.
+  cmpdi(CCR0, len, 0);                           // Anything to do?
+  mtctr(len);
+  beq(CCR0, L_done);
+
+  if (invertCRC) {
+    nand(crc, crc, crc);                         // ~c
+  }
+
+  align(mainLoop_alignment);
+  BIND(L_mainLoop);
+    lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
+    addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
+    update_byte_crc32(crc, data, table);
+    bdnz(L_mainLoop);                            // Iterate.
+
+  if (invertCRC) {
+    nand(crc, crc, crc);                         // ~c
+  }
+
+  bind(L_done);
+}
+
+/**
+ * Emits code to update CRC-32 with a 4-byte value according to constants in table
+ * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
+ */
+// A note on the lookup table address(es):
+// The lookup table consists of two sets of four columns each.
+// The columns {0..3} are used for little-endian machines.
+// The columns {4..7} are used for big-endian machines.
+// To save the effort of adding the column offset to the table address each time
+// a table element is looked up, it is possible to pass the pre-calculated
+// column addresses.
+// Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
+void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
+                                        Register t0,  Register t1,  Register t2,  Register t3,
+                                        Register tc0, Register tc1, Register tc2, Register tc3) {
+  assert_different_registers(crc, t3);
+
+  // XOR crc with next four bytes of buffer.
+  lwz(t3, bufDisp, buf);
+  if (bufInc != 0) {
+    addi(buf, buf, bufInc);
+  }
+  xorr(t3, t3, crc);
+
+  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
+  rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
+  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
+  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
+  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
+
+  // Use the pre-calculated column addresses.
+  // Load pre-calculated table values.
+  lwzx(t0, tc0, t0);
+  lwzx(t1, tc1, t1);
+  lwzx(t2, tc2, t2);
+  lwzx(t3, tc3, t3);
+
+  // Calculate new crc from table values.
+  xorr(t0,  t0, t1);
+  xorr(t2,  t2, t3);
+  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
+}
+
+/**
+ * @param crc   register containing existing CRC (32-bit)
+ * @param buf   register pointing to input byte buffer (byte*)
+ * @param len   register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * Uses R9..R12 as work register. Must be saved/restored by caller!
+ */
+void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
+                                        Register t0,  Register t1,  Register t2,  Register t3,
+                                        Register tc0, Register tc1, Register tc2, Register tc3) {
+  assert_different_registers(crc, buf, len, table);
+
+  Label L_mainLoop, L_tail;
+  Register  tmp  = t0;
+  Register  data = t0;
+  Register  tmp2 = t1;
+  const int mainLoop_stepping  = 8;
+  const int tailLoop_stepping  = 1;
+  const int log_stepping       = exact_log2(mainLoop_stepping);
+  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+  const int complexThreshold   = 2*mainLoop_stepping;
+
+  // Don't test for len <= 0 here. This pathological case should not occur anyway.
+  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+  // The situation itself is detected and handled correctly by the conditional branches
+  // following  aghi(len, -stepping) and aghi(len, +stepping).
+  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+
+  BLOCK_COMMENT("kernel_crc32_2word {");
+
+  nand(crc, crc, crc);                           // ~c
+
+  // Check for short (<complexThreshold) buffer.
+  cmpdi(CCR0, len, complexThreshold);
+  blt(CCR0, L_tail);
+
+  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
+  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
+  {
+    // Align buf addr to mainLoop_stepping boundary.
+    neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
+    rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 61..63.
+
+    if (complexThreshold > mainLoop_stepping) {
+      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+    } else {
+      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
+      cmpdi(CCR0, tmp, mainLoop_stepping);
+      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
+      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+    }
+    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+  }
+
+  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
+  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
+  mtctr(tmp2);
+
+#ifdef VM_LITTLE_ENDIAN
+  Register crc_rv = crc;
+#else
+  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
+                                                 // Occupies tmp, but frees up crc.
+  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
+  tmp = crc;
+#endif
+
+  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+
+  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
+  BIND(L_mainLoop);
+    update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+    update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+    bdnz(L_mainLoop);
+
+#ifndef VM_LITTLE_ENDIAN
+  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
+  tmp = crc_rv;                                  // Tmp uses its original register again.
+#endif
+
+  // Restore original table address for tailLoop.
+  if (reconstructTableOffset != 0) {
+    addi(table, table, -reconstructTableOffset);
+  }
+
+  // Process last few (<complexThreshold) bytes of buffer.
+  BIND(L_tail);
+  update_byteLoop_crc32(crc, buf, len, table, data, false, false);
+
+  nand(crc, crc, crc);                           // ~c
+  BLOCK_COMMENT("} kernel_crc32_2word");
+}
+
+/**
+ * @param crc   register containing existing CRC (32-bit)
+ * @param buf   register pointing to input byte buffer (byte*)
+ * @param len   register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * uses R9..R12 as work register. Must be saved/restored by caller!
+ */
+void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
+                                        Register t0,  Register t1,  Register t2,  Register t3,
+                                        Register tc0, Register tc1, Register tc2, Register tc3) {
+  assert_different_registers(crc, buf, len, table);
+
+  Label L_mainLoop, L_tail;
+  Register  tmp          = t0;
+  Register  data         = t0;
+  Register  tmp2         = t1;
+  const int mainLoop_stepping  = 4;
+  const int tailLoop_stepping  = 1;
+  const int log_stepping       = exact_log2(mainLoop_stepping);
+  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+  const int complexThreshold   = 2*mainLoop_stepping;
+
+  // Don't test for len <= 0 here. This pathological case should not occur anyway.
+  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+  // The situation itself is detected and handled correctly by the conditional branches
+  // following  aghi(len, -stepping) and aghi(len, +stepping).
+  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+
+  BLOCK_COMMENT("kernel_crc32_1word {");
+
+  nand(crc, crc, crc);                           // ~c
+
+  // Check for short (<complexThreshold) buffer.
+  cmpdi(CCR0, len, complexThreshold);
+  blt(CCR0, L_tail);
+
+  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
+  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
+  {
+    // Align buf addr to mainLoop_stepping boundary.
+    neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
+    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
+
+    if (complexThreshold > mainLoop_stepping) {
+      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+    } else {
+      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
+      cmpdi(CCR0, tmp, mainLoop_stepping);
+      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
+      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+    }
+    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+  }
+
+  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
+  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
+  mtctr(tmp2);
+
+#ifdef VM_LITTLE_ENDIAN
+  Register crc_rv = crc;
+#else
+  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
+                                                 // Occupies tmp, but frees up crc.
+  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
+  tmp = crc;
+#endif
+
+  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+
+  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
+  BIND(L_mainLoop);
+    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+    bdnz(L_mainLoop);
+
+#ifndef VM_LITTLE_ENDIAN
+  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
+  tmp = crc_rv;                                  // Tmp uses its original register again.
+#endif
+
+  // Restore original table address for tailLoop.
+  if (reconstructTableOffset != 0) {
+    addi(table, table, -reconstructTableOffset);
+  }
+
+  // Process last few (<complexThreshold) bytes of buffer.
+  BIND(L_tail);
+  update_byteLoop_crc32(crc, buf, len, table, data, false, false);
+
+  nand(crc, crc, crc);                           // ~c
+  BLOCK_COMMENT("} kernel_crc32_1word");
+}
+
+/**
+ * @param crc   register containing existing CRC (32-bit)
+ * @param buf   register pointing to input byte buffer (byte*)
+ * @param len   register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * Uses R7_ARG5, R8_ARG6 as work registers.
+ */
+void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
+                                        Register t0,  Register t1,  Register t2,  Register t3) {
+  assert_different_registers(crc, buf, len, table);
+
+  Register  data = t0;                   // Holds the current byte to be folded into crc.
+
+  BLOCK_COMMENT("kernel_crc32_1byte {");
+
+  // Process all bytes in a single-byte loop.
+  update_byteLoop_crc32(crc, buf, len, table, data, true, true);
+
+  BLOCK_COMMENT("} kernel_crc32_1byte");
+}
+
+/**
+ * @param crc             register containing existing CRC (32-bit)
+ * @param buf             register pointing to input byte buffer (byte*)
+ * @param len             register containing number of bytes
+ * @param table           register pointing to CRC table
+ * @param constants       register pointing to CRC table for 128-bit aligned memory
+ * @param barretConstants register pointing to table for barrett reduction
+ * @param t0              volatile register
+ * @param t1              volatile register
+ * @param t2              volatile register
+ * @param t3              volatile register
+ */
+void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
+                        Register constants,  Register barretConstants,
+                        Register t0,  Register t1, Register t2, Register t3, Register t4) {
+  assert_different_registers(crc, buf, len, table);
+
+  Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
+
+  Register  prealign     = t0;
+  Register  postalign    = t0;
+
+  BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
+
+  // 1. use kernel_crc32_1word for shorter than 384bit
+  clrldi(len, len, 32);