[aarch64-port-dev ] /hg/icedtea7-forest-aarch64/hotspot: 6 new changesets

Wed Nov 26 10:36:03 UTC 2014

[forwarding bounced check-in message from icedtea7-forest-aarch64 repo]

------ This is a copy of the message, including all the headers. ------

Return-path: <adinn at icedtea.classpath.org>
Received: from localhost ([127.0.0.1] helo=icedtea.classpath.org)
	by icedtea.classpath.org with esmtp (Exim 4.69)
	(envelope-from <adinn at icedtea.classpath.org>)
	id 1XtZvA-0002N2-6p
	for aarch64-port-dev at openjdk.java.net; Wed, 26 Nov 2014 10:34:04 +0000
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Date: Wed, 26 Nov 2014 10:34:04 +0000
Subject: /hg/icedtea7-forest-aarch64/hotspot: 6 new changesets
From: adinn at icedtea.classpath.org
X-Hg-Notification: changeset 8e3cc52cbcef
Message-Id:
<hg.8e3cc52cbcef.1416998044.-5017525213744097322 at icedtea.classpath.org>
To: aarch64-port-dev at openjdk.java.net

changeset 8e3cc52cbcef in /hg/icedtea7-forest-aarch64/hotspot
details:
http://icedtea.classpath.org/hg/icedtea7-forest-aarch64/hotspot?cmd=changeset;node=8e3cc52cbcef
author: adinn
date: Mon Nov 24 13:37:15 2014 +0000

	Add support for AES Intrinsics

	backport from jdk8 also incorporates 3 subsequent updates


changeset f7326d2a6cda in /hg/icedtea7-forest-aarch64/hotspot
details:
http://icedtea.classpath.org/hg/icedtea7-forest-aarch64/hotspot?cmd=changeset;node=f7326d2a6cda
author: adinn
date: Mon Nov 24 15:59:18 2014 +0000

	Use TLS for ThreadLocalStorage::thread()


changeset 405f393ec93d in /hg/icedtea7-forest-aarch64/hotspot
details:
http://icedtea.classpath.org/hg/icedtea7-forest-aarch64/hotspot?cmd=changeset;node=405f393ec93d
author: adinn
date: Mon Nov 24 16:56:09 2014 +0000

	Add char_array_equals intrinsic


changeset 023d218976e3 in /hg/icedtea7-forest-aarch64/hotspot
details:
http://icedtea.classpath.org/hg/icedtea7-forest-aarch64/hotspot?cmd=changeset;node=023d218976e3
author: adinn
date: Mon Nov 24 18:11:42 2014 +0000

	Add support for String.indexOf intrinsic


changeset 56958c314918 in /hg/icedtea7-forest-aarch64/hotspot
details:
http://icedtea.classpath.org/hg/icedtea7-forest-aarch64/hotspot?cmd=changeset;node=56958c314918
author: adinn
date: Tue Nov 25 11:10:14 2014 +0000

	A more efficient sequence for C1_MacroAssembler::float_cmp.


changeset 80e04c4cd4b2 in /hg/icedtea7-forest-aarch64/hotspot
details:
http://icedtea.classpath.org/hg/icedtea7-forest-aarch64/hotspot?cmd=changeset;node=80e04c4cd4b2
author: adinn
date: Tue Nov 25 17:36:55 2014 +0000

	Add support for pipeline scheduling


diffstat:

 src/cpu/aarch64/vm/aarch64.ad                          |  1377
++++++++++-----
 src/cpu/aarch64/vm/aarch64_ad.m4                       |    26 +-
 src/cpu/aarch64/vm/assembler_aarch64.cpp               |   590 +++++-
 src/cpu/aarch64/vm/assembler_aarch64.hpp               |   311 +-
 src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp       |    16 +-
 src/cpu/aarch64/vm/icache_aarch64.cpp                  |     5 +-
 src/cpu/aarch64/vm/icache_aarch64.hpp                  |     2 +-
 src/cpu/aarch64/vm/stubGenerator_aarch64.cpp           |   415 ++++
 src/cpu/aarch64/vm/vm_version_aarch64.cpp              |    26 +
 src/os_cpu/linux_aarch64/vm/globals_linux_aarch64.hpp  |     2 +
 src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.cpp |    29 +-
 src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.hpp |     4 +-
 12 files changed, 2045 insertions(+), 758 deletions(-)

diffs (truncated from 5305 to 500 lines):

diff -r 4868ef1912f1 -r 80e04c4cd4b2 src/cpu/aarch64/vm/aarch64.ad

--- a/src/cpu/aarch64/vm/aarch64.ad	Fri Nov 21 20:35:24 2014 +0000
+++ b/src/cpu/aarch64/vm/aarch64.ad	Tue Nov 25 17:36:55 2014 +0000
@@ -3363,6 +3363,16 @@
   interface(CONST_INTER);
 %}

+operand immI_le_4()
+%{
+  predicate(n->get_int() <= 4);
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 operand immI_31()
 %{
   predicate(n->get_int() == 31);
@@ -4687,17 +4697,14 @@
 attributes %{
   // ARM instructions are of fixed length
   fixed_size_instructions;        // Fixed size instructions TODO does
-  // TODO does this relate to how many instructions can be scheduled
-  // at once? just guess 8 for now
-  max_instructions_per_bundle = 8;   // Up to 8 instructions per bundle
+  max_instructions_per_bundle = 2;   // A53 = 2, A57 = 4
   // ARM instructions come in 32-bit word units
   instruction_unit_size = 4;         // An instruction is 4 bytes long
-  // TODO identify correct cache line size  just guess 64 for now
   instruction_fetch_unit_size = 64;  // The processor fetches one line
   instruction_fetch_units = 1;       // of 64 bytes

   // List of nop instructions
-  //nops( MachNop );
+  nops( MachNop );
 %}

 // We don't use an actual pipeline model so don't care about resources
@@ -4707,21 +4714,387 @@
 //----------RESOURCES----------------------------------------------------------
 // Resources are the functional units available to the machine

-resources( D0, D1, D2, DECODE = D0 | D1 | D2,
-           MS0, MS1, MS2, MEM = MS0 | MS1 | MS2,
-           BR, FPU,
-           ALU0, ALU1, ALU2, ALU = ALU0 | ALU1 | ALU2);
+resources( INS0, INS1, INS01 = INS0 | INS1,
+           ALU0, ALU1, ALU = ALU0 | ALU1,
+           MAC,
+           DIV,
+           BRANCH,
+           LDST,
+           NEON_FP);

 //----------PIPELINE
DESCRIPTION-----------------------------------------------
 // Pipeline Description specifies the stages in the machine's pipeline

 // Generic P2/P3 pipeline
-pipe_desc(S0, S1, S2, S3, S4, S5);
+pipe_desc(ISS, EX1, EX2, WR);

 //----------PIPELINE
CLASSES---------------------------------------------------
 // Pipeline Classes describe the stages in which input and output are
 // referenced by the hardware pipeline.

+//------- Integer ALU operations --------------------------
+
+// Integer ALU reg-reg operation
+// Operands needed in EX1, result generated in EX2
+// Eg.	ADD	x0, x1, x2
+pipe_class ialu_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : EX1(read);
+  src2   : EX1(read);
+  INS01  : ISS; // Dual issue as instruction 0 or 1
+  ALU    : EX2;
+%}
+
+// Integer ALU reg-reg operation with constant shift
+// Shifted register must be available in LATE_ISS instead of EX1
+// Eg.	ADD	x0, x1, x2, LSL #2
+pipe_class ialu_reg_reg_shift(iRegI dst, iRegI src1, iRegI src2, immI
shift)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : EX1(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU reg operation with constant shift
+// Eg.	LSL	x0, x1, #shift
+pipe_class ialu_reg_shift(iRegI dst, iRegI src1)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : ISS(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU reg-reg operation with variable shift
+// Both operands must be available in LATE_ISS instead of EX1
+// Result is available in EX1 instead of EX2
+// Eg.	LSLV	x0, x1, x2
+pipe_class ialu_reg_reg_vshift(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : EX1(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  ALU    : EX1;
+%}
+
+// Integer ALU reg-reg operation with extract
+// As for _vshift above, but result generated in EX2
+// Eg.	EXTR	x0, x1, x2, #N
+pipe_class ialu_reg_reg_extr(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS1   : ISS; // Can only dual issue as Instruction 1
+  ALU    : EX1;
+%}
+
+// Integer ALU reg operation
+// Eg.	NEG	x0, x1
+pipe_class ialu_reg(iRegI dst, iRegI src)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src    : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU reg mmediate operation
+// Eg.	ADD	x0, x1, #N
+pipe_class ialu_reg_imm(iRegI dst, iRegI src1)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU immediate operation (no source operands)
+// Eg.	MOV	x0, #N
+pipe_class ialu_imm(iRegI dst)
+%{
+  single_instruction;
+  dst    : EX1(write);
+  INS01  : ISS;
+  ALU    : EX1;
+%}
+
+//------- Compare operation -------------------------------
+
+// Compare reg-reg
+// Eg.	CMP	x0, x1
+pipe_class icmp_reg_reg(rFlagsReg cr, iRegI op1, iRegI op2)
+%{
+  single_instruction;
+//  fixed_latency(16);
+  cr     : EX2(write);
+  op1    : EX1(read);
+  op2    : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Compare reg-reg
+// Eg.	CMP	x0, #N
+pipe_class icmp_reg_imm(rFlagsReg cr, iRegI op1)
+%{
+  single_instruction;
+//  fixed_latency(16);
+  cr     : EX2(write);
+  op1    : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+//------- Conditional instructions ------------------------
+
+// Conditional no operands
+// Eg.	CSINC	x0, zr, zr, <cond>
+pipe_class icond_none(iRegI dst, rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  dst    : EX2(write);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Conditional 2 operand
+// EG.	CSEL	X0, X1, X2, <cond>
+pipe_class icond_reg_reg(iRegI dst, iRegI src1, iRegI src2, rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  src1   : EX1(read);
+  src2   : EX1(read);
+  dst    : EX2(write);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Conditional 2 operand
+// EG.	CSEL	X0, X1, X2, <cond>
+pipe_class icond_reg(iRegI dst, iRegI src, rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  src    : EX1(read);
+  dst    : EX2(write);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+//------- Multiply pipeline operations --------------------
+
+// Multiply reg-reg
+// Eg.	MUL	w0, w1, w2
+pipe_class imul_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+// Multiply accumulate
+// Eg.	MADD	w0, w1, w2, w3
+pipe_class imac_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3)
+%{
+  single_instruction;
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  src3   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+// Eg.	MUL	w0, w1, w2
+pipe_class lmul_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  fixed_latency(3); // Maximum latency for 64 bit mul
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+// Multiply accumulate
+// Eg.	MADD	w0, w1, w2, w3
+pipe_class lmac_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3)
+%{
+  single_instruction;
+  fixed_latency(3); // Maximum latency for 64 bit mul
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  src3   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+//------- Divide pipeline operations --------------------
+
+// Eg.	SDIV	w0, w1, w2
+pipe_class idiv_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  fixed_latency(8); // Maximum latency for 32 bit divide
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS0   : ISS; // Can only dual issue as instruction 0
+  DIV    : WR;
+%}
+
+// Eg.	SDIV	x0, x1, x2
+pipe_class ldiv_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  fixed_latency(16); // Maximum latency for 64 bit divide
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS0   : ISS; // Can only dual issue as instruction 0
+  DIV    : WR;
+%}
+
+//------- Load pipeline operations ------------------------
+
+// Load - prefetch
+// Eg.	PFRM	<mem>
+pipe_class iload_prefetch(memory mem)
+%{
+  single_instruction;
+  mem    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Load - reg, mem
+// Eg.	LDR	x0, <mem>
+pipe_class iload_reg_mem(iRegI dst, memory mem)
+%{
+  single_instruction;
+  dst    : WR(write);
+  mem    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Load - reg, reg
+// Eg.	LDR	x0, [sp, x1]
+pipe_class iload_reg_reg(iRegI dst, iRegI src)
+%{
+  single_instruction;
+  dst    : WR(write);
+  src    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+//------- Store pipeline operations -----------------------
+
+// Store - zr, mem
+// Eg.	STR	zr, <mem>
+pipe_class istore_mem(memory mem)
+%{
+  single_instruction;
+  mem    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Store - reg, mem
+// Eg.	STR	x0, <mem>
+pipe_class istore_reg_mem(iRegI src, memory mem)
+%{
+  single_instruction;
+  mem    : ISS(read);
+  src    : EX2(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Store - reg, reg
+// Eg. STR	x0, [sp, x1]
+pipe_class istore_reg_reg(iRegI dst, iRegI src)
+%{
+  single_instruction;
+  dst    : ISS(read);
+  src    : EX2(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+//------- Store pipeline operations -----------------------
+
+// Branch
+pipe_class pipe_branch()
+%{
+  single_instruction;
+  INS01  : ISS;
+  BRANCH : EX1;
+%}
+
+// Conditional branch
+pipe_class pipe_branch_cond(rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  INS01  : ISS;
+  BRANCH : EX1;
+%}
+
+// Compare & Branch
+// EG.	CBZ/CBNZ
+pipe_class pipe_cmp_branch(iRegI op1)
+%{
+  single_instruction;
+  op1    : EX1(read);
+  INS01  : ISS;
+  BRANCH : EX1;
+%}
+
+//------- Synchronisation operations ----------------------
+
+// Any operation requiring serialization.
+// EG.	DMB/Atomic Ops/Load Acquire/Str Release
+pipe_class pipe_serial()
+%{
+  single_instruction;
+  force_serialization;
+  fixed_latency(16);
+  INS01  : ISS(2); // Cannot dual issue with any other instruction
+  LDST   : WR;
+%}
+
+// Generic big/slow expanded idiom - also serialized
+pipe_class pipe_slow()
+%{
+  instruction_count(10);
+  multiple_bundles;
+  force_serialization;
+  fixed_latency(16);
+  INS01  : ISS(2); // Cannot dual issue with any other instruction
+  LDST   : WR;
+%}
+
 // Empty pipeline class
 pipe_class pipe_class_empty()
 %{
@@ -4743,30 +5116,23 @@
   fixed_latency(16);
 %}

-// Pipeline class for traps.
-pipe_class pipe_class_trap()
+// Pipeline class for memory operations.
+pipe_class pipe_class_memory()
+%{
+  single_instruction;
+  fixed_latency(16);
+%}
+
+// Pipeline class for call.
+pipe_class pipe_class_call()
 %{
   single_instruction;
   fixed_latency(100);
 %}

-// Pipeline class for memory operations.
-pipe_class pipe_class_memory()
-%{
-  single_instruction;
-  fixed_latency(16);
-%}
-
-// Pipeline class for call.
-pipe_class pipe_class_call()
-%{
-  single_instruction;
-  fixed_latency(100);
-%}
-
 // Define the class for the Nop node.
 define %{
-   MachNop = pipe_class_default;
+  MachNop = pipe_class_empty;
 %}

 %}
@@ -4806,7 +5172,7 @@

   ins_encode(aarch64_enc_ldrsbw(dst, mem));

-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}

 // Load Byte (8 bit signed) into long
@@ -4819,7 +5185,7 @@

   ins_encode(aarch64_enc_ldrsb(dst, mem));

-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}

 // Load Byte (8 bit unsigned)
@@ -4832,7 +5198,7 @@

   ins_encode(aarch64_enc_ldrb(dst, mem));

-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}