/hg/icedtea8-forest/hotspot: 4 new changesets
andrew at icedtea.classpath.org
andrew at icedtea.classpath.org
Wed Nov 7 06:14:28 UTC 2018
changeset b3d6f0af9a4d in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=b3d6f0af9a4d
author: andrew
date: Tue Nov 06 22:49:55 2018 +0000
Added tag icedtea-3.9.0 for changeset d78088224b98
changeset 678bb67e02ae in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=678bb67e02ae
author: andrew
date: Tue Nov 06 22:53:47 2018 +0000
Added tag icedtea-3.10.0pre00 for changeset d78088224b98
changeset 567e95df42f8 in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=567e95df42f8
author: gromero
date: Mon Sep 24 17:18:38 2018 -0400
8131048, PR3574, RH1498936: ppc implement CRC32 intrinsic
Reviewed-by: goetz
changeset de4f1f9fbcc5 in /hg/icedtea8-forest/hotspot
details: http://icedtea.classpath.org/hg/icedtea8-forest/hotspot?cmd=changeset;node=de4f1f9fbcc5
author: mdoerr
date: Thu Sep 22 12:17:24 2016 +0200
8164920, PR3574, RH1498936: ppc: enhancement of CRC32 intrinsic
Reviewed-by: goetz, mdoerr
Contributed-by: Hiroshi H Horii <horii at jp.ibm.com>
diffstat:
.hgtags | 2 +
src/cpu/ppc/vm/assembler_ppc.hpp | 8 +
src/cpu/ppc/vm/assembler_ppc.inline.hpp | 4 +
src/cpu/ppc/vm/interpreterGenerator_ppc.hpp | 2 +
src/cpu/ppc/vm/macroAssembler_ppc.cpp | 972 ++++++++++++++++++++++++++++
src/cpu/ppc/vm/macroAssembler_ppc.hpp | 28 +
src/cpu/ppc/vm/stubGenerator_ppc.cpp | 86 ++
src/cpu/ppc/vm/stubRoutines_ppc_64.cpp | 763 +++++++++++++++++++++-
src/cpu/ppc/vm/stubRoutines_ppc_64.hpp | 38 +-
src/cpu/ppc/vm/templateInterpreter_ppc.cpp | 191 +++++-
src/cpu/ppc/vm/vm_version_ppc.cpp | 17 +-
src/cpu/ppc/vm/vm_version_ppc.hpp | 3 +
src/share/vm/opto/library_call.cpp | 28 +-
src/share/vm/opto/runtime.cpp | 19 +-
14 files changed, 2132 insertions(+), 29 deletions(-)
diffs (truncated from 2433 to 500 lines):
diff -r d78088224b98 -r de4f1f9fbcc5 .hgtags
--- a/.hgtags Tue Jul 17 15:03:25 2018 +0100
+++ b/.hgtags Thu Sep 22 12:17:24 2016 +0200
@@ -1228,3 +1228,5 @@
e4f39d283b55faf6074308797615298bd1a45a66 jdk8u181-b11
464ed8cea5d6cdbfacc9be7035297af88f57f708 jdk8u181-b12
9062a259cecfe8e1f3386e2982eb77bd117c81e1 jdk8u181-b31
+d78088224b9836edf36034d076e7eee89a2a9b83 icedtea-3.9.0
+d78088224b9836edf36034d076e7eee89a2a9b83 icedtea-3.10.0pre00
diff -r d78088224b98 -r de4f1f9fbcc5 src/cpu/ppc/vm/assembler_ppc.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.hpp Tue Jul 17 15:03:25 2018 +0100
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp Thu Sep 22 12:17:24 2016 +0200
@@ -468,6 +468,10 @@
LVSL_OPCODE = (31u << OPCODE_SHIFT | 6u << 1),
LVSR_OPCODE = (31u << OPCODE_SHIFT | 38u << 1),
+ // Vector-Scalar (VSX) instruction support.
+ MTVSRD_OPCODE = (31u << OPCODE_SHIFT | 179u << 1),
+ MFVSRD_OPCODE = (31u << OPCODE_SHIFT | 51u << 1),
+
// Vector Permute and Formatting
VPKPX_OPCODE = (4u << OPCODE_SHIFT | 782u ),
VPKSHSS_OPCODE = (4u << OPCODE_SHIFT | 398u ),
@@ -1938,6 +1942,10 @@
inline void mtvscr( VectorRegister b);
inline void mfvscr( VectorRegister d);
+ // Vector-Scalar (VSX) instructions.
+ inline void mtvrd( VectorRegister d, Register a);
+ inline void mfvrd( Register a, VectorRegister d);
+
// AES (introduced with Power 8)
inline void vcipher( VectorRegister d, VectorRegister a, VectorRegister b);
inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);
diff -r d78088224b98 -r de4f1f9fbcc5 src/cpu/ppc/vm/assembler_ppc.inline.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp Tue Jul 17 15:03:25 2018 +0100
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp Thu Sep 22 12:17:24 2016 +0200
@@ -623,6 +623,10 @@
inline void Assembler::lvsl( VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
inline void Assembler::lvsr( VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
+// Vector-Scalar (VSX) instructions. Both emitters OR 1u into the MTVSRD/MFVSRD encoding (lowest instruction bit set).
+inline void Assembler::mtvrd( VectorRegister d, Register a) { emit_int32( MTVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
+inline void Assembler::mfvrd( Register a, VectorRegister d) { emit_int32( MFVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
+
inline void Assembler::vpkpx( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE | vrt(d) | vra(a) | vrb(b)); }
inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
inline void Assembler::vpkswss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSWSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
diff -r d78088224b98 -r de4f1f9fbcc5 src/cpu/ppc/vm/interpreterGenerator_ppc.hpp
--- a/src/cpu/ppc/vm/interpreterGenerator_ppc.hpp Tue Jul 17 15:03:25 2018 +0100
+++ b/src/cpu/ppc/vm/interpreterGenerator_ppc.hpp Thu Sep 22 12:17:24 2016 +0200
@@ -33,5 +33,7 @@
address generate_abstract_entry(void);
address generate_accessor_entry(void);
address generate_Reference_get_entry(void);
+ address generate_CRC32_update_entry();
+ address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
#endif // CPU_PPC_VM_INTERPRETERGENERATOR_PPC_HPP
diff -r d78088224b98 -r de4f1f9fbcc5 src/cpu/ppc/vm/macroAssembler_ppc.cpp
--- a/src/cpu/ppc/vm/macroAssembler_ppc.cpp Tue Jul 17 15:03:25 2018 +0100
+++ b/src/cpu/ppc/vm/macroAssembler_ppc.cpp Thu Sep 22 12:17:24 2016 +0200
@@ -49,6 +49,7 @@
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
@@ -3022,6 +3023,977 @@
bind(Ldone_false);
}
+// Helpers for Intrinsic Emitters
+//
+// Reverse the byte order of a 32-bit value in a register (dst and src must be different registers).
+// src: 0x44556677
+// dst: 0x77665544
+// Bytes are numbered 0..7 within the doubleword, so the 32-bit word occupies bytes 4..7. Three steps:
+// 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
+// into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
+// This value initializes dst.
+// 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
+// byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
+// This value is mask inserted into dst with a [0..23] mask of 1s (this also puts byte 4 into position 5 for now).
+// 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
+// This value is mask inserted into dst with a [8..15] mask of 1s, replacing what step 2 left in position 5.
+void MacroAssembler::load_reverse_32(Register dst, Register src) {
+ assert_different_registers(dst, src);
+
+ rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
+ rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
+ rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
+}
+
+// Calculate the column addresses of the crc32 lookup table into distinct registers.
+// This loop-invariant calculation is moved out of the loop body, reducing the loop
+// body size from 20 to 16 instructions.
+// Returns the offset that was used to calculate the address of column tc3.
+// tc3 must be the same register as table (asserted below), so setting tc3 overwrites table.
+// With the returned offset at hand, the original table address can be easily reconstructed by the caller.
+int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
+
+#ifdef VM_LITTLE_ENDIAN
+ // This is what we implement (the DOLIT4 part):
+ // ========================================================================= */
+ // #define DOLIT4 c ^= *buf4++; \
+ // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+ // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+ // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+ // ========================================================================= */
+ const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
+ const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
+ const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
+ const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
+#else
+ // This is what we implement (the DOBIG4 part):
+ // =========================================================================
+ // #define DOBIG4 c ^= *++buf4; \
+ // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+ // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+ // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+ // =========================================================================
+ const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
+ const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
+ const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
+ const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
+#endif
+ assert_different_registers(table, tc0, tc1, tc2);
+ assert(table == tc3, "must be!");
+
+ if (ix0 != 0) addi(tc0, table, ix0);
+ if (ix1 != 0) addi(tc1, table, ix1);
+ if (ix2 != 0) addi(tc2, table, ix2);
+ if (ix3 != 0) addi(tc3, table, ix3);
+
+ return ix3;
+}
+
+/**
+ * Emits code for: crc = table[val & 0xFF] ^ (crc >> 8), with a 32-bit crc and 4-byte table entries.
+ * tmp is clobbered. crc may be the same register as val, and tmp may be the same register as val.
+ */
+void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
+ assert_different_registers(crc, table, tmp);
+ assert_different_registers(val, table);
+
+ if (crc == val) { // Must rotate first to use the unmodified value.
+ rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+ // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
+ srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
+ } else {
+ srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
+ rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+ }
+ lwzx(tmp, table, tmp);
+ xorr(crc, crc, tmp);
+}
+
+/**
+ * Emits code for: crc = table[crc & 0xFF] ^ (crc >> 8), with a 32-bit crc.
+ * Delegates to fold_byte_crc32 with val == crc; tmp is clobbered.
+ */
+void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
+ fold_byte_crc32(crc, crc, table, tmp);
+}
+
+/**
+ * Emits code to update CRC-32 with a byte value according to constants in table.
+ *
+ * @param [in,out]crc Register containing the crc.
+ * @param [in]val Register containing the byte to fold into the CRC. Clobbered: it is XORed with crc and then reused as scratch.
+ * @param [in]table Register containing the table of crc constants.
+ *
+ * uint32_t crc;
+ * val = crc_table[(val ^ crc) & 0xFF];
+ * crc = val ^ (crc >> 8);
+ */
+void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
+ BLOCK_COMMENT("update_byte_crc32:");
+ xorr(val, val, crc);
+ fold_byte_crc32(crc, val, table, val);
+}
+
+/**
+ * Emits a loop that folds len bytes, one per iteration, from buf into crc.
+ * If len is zero, the loop and both optional inversions are skipped, so crc is left unchanged.
+ *
+ * @param crc register containing existing CRC (32-bit); updated in place
+ * @param buf register pointing to input byte buffer (byte*); advanced by one per processed byte
+ * @param len register containing number of bytes; copied to CTR, which serves as the loop counter
+ * @param table register pointing to CRC table
+ * @param data scratch register receiving each loaded byte; clobbered
+ * @param loopAlignment if true, align the loop head to 32, otherwise to 4
+ * @param invertCRC if true, complement crc before and after the loop
+ */
+void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
+ Register data, bool loopAlignment, bool invertCRC) {
+ assert_different_registers(crc, buf, len, table, data);
+
+ Label L_mainLoop, L_done;
+ const int mainLoop_stepping = 1;
+ const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
+
+ // Process all bytes in a single-byte loop.
+ cmpdi(CCR0, len, 0); // Anything to do?
+ mtctr(len);
+ beq(CCR0, L_done);
+
+ if (invertCRC) {
+ nand(crc, crc, crc); // ~c
+ }
+
+ align(mainLoop_alignment);
+ BIND(L_mainLoop);
+ lbz(data, 0, buf); // Byte from buffer, zero-extended.
+ addi(buf, buf, mainLoop_stepping); // Advance buffer position.
+ update_byte_crc32(crc, data, table);
+ bdnz(L_mainLoop); // Iterate.
+
+ if (invertCRC) {
+ nand(crc, crc, crc); // ~c
+ }
+
+ bind(L_done);
+}
+
+/**
+ * Emits code to update CRC-32 with a 4-byte value according to constants in table
+ * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
+ * Loads the word at buf+bufDisp, optionally advances buf by bufInc, and leaves the new value in crc.
+ * The table parameter is not referenced in this body; all lookups go through tc0..tc3.
+ */
+// A note on the lookup table address(es):
+// The lookup table consists of two sets of four columns each.
+// The columns {0..3} are used for little-endian machines.
+// The columns {4..7} are used for big-endian machines.
+// To save the effort of adding the column offset to the table address each time
+// a table element is looked up, it is possible to pass the pre-calculated
+// column addresses.
+// Work registers are t0..t3 as passed in (the original note named R9..R12). Must be saved/restored by caller, if necessary.
+void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
+ Register t0, Register t1, Register t2, Register t3,
+ Register tc0, Register tc1, Register tc2, Register tc3) {
+ assert_different_registers(crc, t3);
+
+ // XOR crc with next four bytes of buffer.
+ lwz(t3, bufDisp, buf);
+ if (bufInc != 0) {
+ addi(buf, buf, bufInc);
+ }
+ xorr(t3, t3, crc);
+
+ // Chop the XORed word (t3) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
+ rlwinm(t0, t3, 2, 24-2, 31-2); // ((t3 >> 0) & 0xff) << 2
+ rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t3 >> 8) & 0xff) << 2
+ rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
+ rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2
+
+ // Use the pre-calculated column addresses.
+ // Load pre-calculated table values.
+ lwzx(t0, tc0, t0);
+ lwzx(t1, tc1, t1);
+ lwzx(t2, tc2, t2);
+ lwzx(t3, tc3, t3);
+
+ // Calculate new crc from table values.
+ xorr(t0, t0, t1);
+ xorr(t2, t2, t3);
+ xorr(crc, t0, t2); // Now crc contains the final checksum value.
+}
+
+/**
+ * Emits a CRC32 kernel whose main loop consumes 8 bytes per iteration (two 4-byte table-lookup steps).
+ * crc is complemented on entry and again on exit. Buffers shorter than complexThreshold (16) bytes,
+ * the bytes needed to align buf, and the leftover tail are processed by the single-byte loop.
+ *
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * Work registers are t0..t3 and tc0..tc3 as passed in (the original note named R9..R12). Must be saved/restored by caller!
+ */
+void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
+ Register t0, Register t1, Register t2, Register t3,
+ Register tc0, Register tc1, Register tc2, Register tc3) {
+ assert_different_registers(crc, buf, len, table);
+
+ Label L_mainLoop, L_tail;
+ Register tmp = t0;
+ Register data = t0;
+ Register tmp2 = t1;
+ const int mainLoop_stepping = 8;
+ const int tailLoop_stepping = 1;
+ const int log_stepping = exact_log2(mainLoop_stepping);
+ const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+ const int complexThreshold = 2*mainLoop_stepping;
+
+ // Don't test for len <= 0 here. This pathological case should not occur anyway.
+ // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+ // NOTE(review): the original text here referred to branches following aghi(len, ...), but no such
+ // instruction is emitted below; len == 0 is caught by the cmpdi/beq at the top of update_byteLoop_crc32.
+ assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+
+ BLOCK_COMMENT("kernel_crc32_2word {");
+
+ nand(crc, crc, crc); // ~c
+
+ // Check for short (<complexThreshold) buffer.
+ cmpdi(CCR0, len, complexThreshold);
+ blt(CCR0, L_tail);
+
+ // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
+ // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
+ {
+ // Align buf addr to mainLoop_stepping boundary.
+ neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
+ rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 61..63 (log_stepping == 3).
+
+ if (complexThreshold > mainLoop_stepping) {
+ sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+ } else {
+ sub(tmp, len, tmp2); // Remaining bytes for main loop.
+ cmpdi(CCR0, tmp, mainLoop_stepping);
+ blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
+ mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+ }
+ update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+ }
+
+ srdi(tmp2, len, log_stepping); // #iterations for mainLoop
+ andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
+ mtctr(tmp2);
+
+#ifdef VM_LITTLE_ENDIAN
+ Register crc_rv = crc;
+#else
+ Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
+ // Occupies tmp, but frees up crc.
+ load_reverse_32(crc_rv, crc); // Reverse byte order because we are dealing with big-endian data.
+ tmp = crc;
+#endif
+
+ int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+
+ align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
+ BIND(L_mainLoop);
+ update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+ update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+ bdnz(L_mainLoop);
+
+#ifndef VM_LITTLE_ENDIAN
+ load_reverse_32(crc, crc_rv); // Reverse byte order because we are dealing with big-endian data.
+ tmp = crc_rv; // Tmp uses its original register again.
+#endif
+
+ // Restore original table address for tailLoop.
+ if (reconstructTableOffset != 0) {
+ addi(table, table, -reconstructTableOffset);
+ }
+
+ // Process last few (<complexThreshold) bytes of buffer.
+ BIND(L_tail);
+ update_byteLoop_crc32(crc, buf, len, table, data, false, false);
+
+ nand(crc, crc, crc); // ~c
+ BLOCK_COMMENT("} kernel_crc32_2word");
+}
+
+/**
+ * Emits a CRC32 kernel whose main loop consumes 4 bytes per iteration (one 4-byte table-lookup step).
+ * crc is complemented on entry and again on exit. Buffers shorter than complexThreshold (8) bytes,
+ * the bytes needed to align buf, and the leftover tail are processed by the single-byte loop.
+ *
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * Work registers are t0..t3 and tc0..tc3 as passed in (the original note named R9..R12). Must be saved/restored by caller!
+ */
+void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
+ Register t0, Register t1, Register t2, Register t3,
+ Register tc0, Register tc1, Register tc2, Register tc3) {
+ assert_different_registers(crc, buf, len, table);
+
+ Label L_mainLoop, L_tail;
+ Register tmp = t0;
+ Register data = t0;
+ Register tmp2 = t1;
+ const int mainLoop_stepping = 4;
+ const int tailLoop_stepping = 1;
+ const int log_stepping = exact_log2(mainLoop_stepping);
+ const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+ const int complexThreshold = 2*mainLoop_stepping;
+
+ // Don't test for len <= 0 here. This pathological case should not occur anyway.
+ // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+ // NOTE(review): the original text here referred to branches following aghi(len, ...), but no such
+ // instruction is emitted below; len == 0 is caught by the cmpdi/beq at the top of update_byteLoop_crc32.
+ assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+
+ BLOCK_COMMENT("kernel_crc32_1word {");
+
+ nand(crc, crc, crc); // ~c
+
+ // Check for short (<complexThreshold) buffer.
+ cmpdi(CCR0, len, complexThreshold);
+ blt(CCR0, L_tail);
+
+ // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
+ // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
+ {
+ // Align buf addr to mainLoop_stepping boundary.
+ neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
+ rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
+
+ if (complexThreshold > mainLoop_stepping) {
+ sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+ } else {
+ sub(tmp, len, tmp2); // Remaining bytes for main loop.
+ cmpdi(CCR0, tmp, mainLoop_stepping);
+ blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
+ mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+ }
+ update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+ }
+
+ srdi(tmp2, len, log_stepping); // #iterations for mainLoop
+ andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
+ mtctr(tmp2);
+
+#ifdef VM_LITTLE_ENDIAN
+ Register crc_rv = crc;
+#else
+ Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
+ // Occupies tmp, but frees up crc.
+ load_reverse_32(crc_rv, crc); // Reverse byte order because we are dealing with big-endian data.
+ tmp = crc;
+#endif
+
+ int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+
+ align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
+ BIND(L_mainLoop);
+ update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+ bdnz(L_mainLoop);
+
+#ifndef VM_LITTLE_ENDIAN
+ load_reverse_32(crc, crc_rv); // Reverse byte order because we are dealing with big-endian data.
+ tmp = crc_rv; // Tmp uses its original register again.
+#endif
+
+ // Restore original table address for tailLoop.
+ if (reconstructTableOffset != 0) {
+ addi(table, table, -reconstructTableOffset);
+ }
+
+ // Process last few (<complexThreshold) bytes of buffer.
+ BIND(L_tail);
+ update_byteLoop_crc32(crc, buf, len, table, data, false, false);
+
+ nand(crc, crc, crc); // ~c
+ BLOCK_COMMENT("} kernel_crc32_1word");
+}
+
+/**
+ * Emits a CRC32 kernel that processes the whole buffer one byte at a time.
+ * Delegates to update_byteLoop_crc32 with loop alignment and CRC inversion both enabled.
+ *
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * Only t0 is used here (as the data register); t1..t3 are not referenced in this body.
+ * NOTE(review): the original note named R7_ARG5, R8_ARG6 as work registers; that depends on the caller's register assignment.
+ */
+void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
+ Register t0, Register t1, Register t2, Register t3) {
+ assert_different_registers(crc, buf, len, table);
+
+ Register data = t0; // Holds the current byte to be folded into crc.
+
+ BLOCK_COMMENT("kernel_crc32_1byte {");
+
+ // Process all bytes in a single-byte loop.
+ update_byteLoop_crc32(crc, buf, len, table, data, true, true);
+
+ BLOCK_COMMENT("} kernel_crc32_1byte");
+}
+
+/**
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register pointing to CRC table
+ * @param constants register pointing to CRC table for 128-bit aligned memory
+ * @param barretConstants register pointing to table for barrett reduction
+ * @param t0 volatile register
+ * @param t1 volatile register
+ * @param t2 volatile register
+ * @param t3 volatile register
+ */
+void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
+ Register constants, Register barretConstants,
+ Register t0, Register t1, Register t2, Register t3, Register t4) {
+ assert_different_registers(crc, buf, len, table);
+
+ Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
+
+ Register prealign = t0;
+ Register postalign = t0;
+
+ BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
+
+ // 1. use kernel_crc32_1word for shorter than 384bit
+ clrldi(len, len, 32);
More information about the distro-pkg-dev
mailing list