RFR: 8333006: RISC-V: C2: Support vector-scalar and vector-immediate arithmetic instructions

Tue May 28 02:10:20 UTC 2024

Hi, we want to support vector-scalar and vector-immediate arithmetic instructions. The implementation follows the RVV v1.0 specification [1]. Please take a look and review. Thanks a lot.
We can use Byte256VectorTests.java [2] to print the Opto JIT code and verify that the new nodes are generated.

For example, we can use the following command to print the Opto JIT Code of a jtreg test case:

/home/zifeihan/jtreg/bin/jtreg \
-v:default \
-concurrency:16 -timeout:50 \
-javaoption:-XX:+UnlockExperimentalVMOptions \
-javaoption:-XX:+UseRVV \
-javaoption:-XX:+PrintOptoAssembly \
-javaoption:-XX:LogFile=/home/zifeihan/jdk/Byte256VectorTests_PrintOptoAssembly.log \
-jdk:/home/zifeihan/jdk/build/linux-riscv64-server-fastdebug/jdk \
/home/zifeihan/jdk/test/jdk/jdk/incubator/vector/Byte256VectorTests.java

We can then inspect the specified compilation log, `Byte256VectorTests_PrintOptoAssembly.log`, which contains the vector-scalar and vector-immediate arithmetic instructions implemented by this PR.

vadd_immI Node

16c     addw  R11, R10, zr	#@convI2L_reg_reg
170     add R9, R31, R11	# ptr, #@addP_reg_reg
174     addi  R9, R9, #16	# ptr, #@addP_reg_imm
176     loadV V1, [R9]	# vector (rvv)
17e     vadd_immI V1, V1, #7
186     add R11, R15, R11	# ptr, #@addP_reg_reg
188     addi  R11, R11, #16	# ptr, #@addP_reg_imm
18a     storeV [R11], V1	# vector (rvv)

vadd_immI_masked Node

1e8     B31: #	out( B37 B32 ) <- in( B30 )  Freq: 76.2281
1e8     loadV V2, [R31]	# vector (rvv)
1f0     vloadmask V0, V1
1f8     vadd_immI_masked V2, V2, #7
200     addi  R31, R10, #48	# ptr, #@addP_reg_imm
204     bgeu  R30, R7, B37	#@cmpU_branch  P=0.000001 C=-1.000000

vadd_regI Node

0c4     B4: #	out( B9 B5 ) <- in( B8 B3 )  Freq: 1
0c4     vloadcon V1	# generate iota indices
0cc     spill [sp, #4] -> R30	# spill size = 32
0ce     vmul_regI V1, V1, R30
0d6     spill [sp, #0] -> R29	# spill size = 32
0d8     vadd_regI V1, V1, R29

vadd_regI_masked Node

244     B36: #	out( B33 B37 ) <- in( B35 )  Freq: 7427.81
244     # castII of R30, #@castII
244     addw  R31, R30, zr	#@convI2L_reg_reg
248     spill [sp, #32] -> R10	# spill size = 64
24a     add R10, R10, R31	# ptr, #@addP_reg_reg
24c     addi  R10, R10, #16	# ptr, #@addP_reg_imm
24e     loadV V2, [R10]	# vector (rvv)
256     vloadmask V0, V1
25e     vadd_regI_masked V2, V2, R29

vsub_regI Node

112     B20: #	out( B63 B21 ) <- in( B19 )  Freq: 77.0107
112     # castII of R20, #@castII
112     addw  R11, R20, zr	#@convI2L_reg_reg
116     add R12, R10, R11	# ptr, #@addP_reg_reg
11a     addi  R12, R12, #16	# ptr, #@addP_reg_imm
11c     loadV V1, [R12]	# vector (rvv)
124     vsub_regI V1, V1, R31
12c     bgeu  R20, R29, B63	#@cmpU_branch  P=0.000001 C=-1.000000

vsub_regI_masked Node

1e8     B31: #	out( B37 B32 ) <- in( B30 )  Freq: 76.2281
1e8     loadV V2, [R31]	# vector (rvv)
1f0     vloadmask V0, V1
1f8     vsub_regI_masked V2, V2, R29
200     addi  R31, R10, #48	# ptr, #@addP_reg_imm
204     bgeu  R30, R7, B37	#@cmpU_branch  P=0.000001 C=-1.000000

vmul_regI Node

0ca     B4: #	out( B9 B5 ) <- in( B8 B3 )  Freq: 1
0ca     vloadcon V1	# generate iota indices
0d2     spill [sp, #0] -> R29	# spill size = 64
0d4     lwu  R7, [R29, #12]	# loadN, compressed ptr, #@loadN ! Field: jdk/internal/vm/vector/VectorSupport$VectorPayload.payload (constant)
0d8     decode_heap_oop  R7, R7	#@decodeHeapOop
0da     addi  R7, R7, #16	# ptr, #@addP_reg_imm
0dc     vmul_regI V1, V1, R30
0e4     loadV V2, [R7]	# vector (rvv)

vmul_regI_masked Node

198     addw  R30, R19, zr	#@convI2L_reg_reg
19c     spill [sp, #32] -> R31	# spill size = 64
19e     add R31, R31, R30	# ptr, #@addP_reg_reg
1a0     addi  R10, R31, #16	# ptr, #@addP_reg_imm
1a4     loadV V2, [R10]	# vector (rvv)
1ac     vloadmask V0, V1
1b4     vmul_regI_masked V2, V2, R29

We can test test/jdk/jdk/incubator/vector/Long256VectorTests.java in the same way. Looking at the Opto logs, we will see nodes such as vadd_immL, vadd_immL_masked, vadd_regL, vadd_regL_masked, vsub_regL, vsub_regL_masked, vmul_regL, and vmul_regL_masked.

vadd_immL Node

112     addw  R11, R9, zr	#@convI2L_reg_reg
116     slli  R11, R11, (#3 & 0x3f)	#@lShiftL_reg_imm
118     add R14, R29, R11	# ptr, #@addP_reg_reg
11c     addi  R14, R14, #16	# ptr, #@addP_reg_imm
11e     loadV V1, [R14]	# vector (rvv)
126     vadd_immL V1, V1, #7

vadd_immL_masked Node

194     addw  R30, R19, zr	#@convI2L_reg_reg
198     slli  R30, R30, (#3 & 0x3f)	#@lShiftL_reg_imm
19a     spill [sp, #32] -> R31	# spill size = 64
19c     add R31, R31, R30	# ptr, #@addP_reg_reg
19e     addi  R10, R31, #16	# ptr, #@addP_reg_imm
1a2     loadV V1, [R10]	# vector (rvv)
1aa     vadd_immL_masked V1, V1, #7

vadd_regL Node

104     B17: #	out( B20 ) <- in( B16 )  Freq: 0.99999
104     replicateL_imm5 V4, #1
10c     vadd_regL V4, V4, R17
114      -- 	// R23=Thread::current(), empty, #@tlsLoadP
114     mv R31, #0	# int, #@loadConI
116     j  B20	#@branch

vadd_regL_masked Node

198     addw  R30, R19, zr	#@convI2L_reg_reg
19c     slli  R30, R30, (#3 & 0x3f)	#@lShiftL_reg_imm
19e     spill [sp, #32] -> R31	# spill size = 64
1a0     add R31, R31, R30	# ptr, #@addP_reg_reg
1a2     addi  R10, R31, #16	# ptr, #@addP_reg_imm
1a6     loadV V1, [R10]	# vector (rvv)
1ae     vadd_regL_masked V1, V1, R11

vsub_regL Node

116     addw  R11, R19, zr	#@convI2L_reg_reg
11a     slli  R11, R11, (#3 & 0x3f)	#@lShiftL_reg_imm
11c     add R12, R31, R11	# ptr, #@addP_reg_reg
120     addi  R12, R12, #16	# ptr, #@addP_reg_imm
122     loadV V1, [R12]	# vector (rvv)
12a     vsub_regL V1, V1, R14

vsub_regL_masked Node

198     addw  R30, R19, zr	#@convI2L_reg_reg
19c     slli  R30, R30, (#3 & 0x3f)	#@lShiftL_reg_imm
19e     spill [sp, #32] -> R31	# spill size = 64
1a0     add R31, R31, R30	# ptr, #@addP_reg_reg
1a2     addi  R10, R31, #16	# ptr, #@addP_reg_imm
1a6     loadV V1, [R10]	# vector (rvv)
1ae     vsub_regL_masked V1, V1, R11

vmul_regL Node

0c2     vloadcon V1	# generate iota indices
0ca     spill [sp, #0] -> R29	# spill size = 64
0cc     lwu  R7, [R29, #12]	# loadN, compressed ptr, #@loadN ! Field: jdk/internal/vm/vector/VectorSupport$VectorPayload.payload (constant)
0d0     decode_heap_oop  R7, R7	#@decodeHeapOop
0d2     addi  R7, R7, #16	# ptr, #@addP_reg_imm
0d4     addw  R28, R30, zr	#@convI2L_reg_reg
0d8     loadV V2, [R7]	# vector (rvv)
0e0     vmul_regL V1, V1, R28

vmul_regL_masked Node

19c     slli  R30, R30, (#3 & 0x3f)	#@lShiftL_reg_imm
19e     spill [sp, #32] -> R31	# spill size = 64
1a0     add R31, R31, R30	# ptr, #@addP_reg_reg
1a2     addi  R10, R31, #16	# ptr, #@addP_reg_imm
1a6     loadV V1, [R10]	# vector (rvv)
1ae     vmul_regL_masked V1, V1, R11
1b6     spill [sp, #48] -> R10	# spill size = 64

[1] https://github.com/riscv/riscv-v-spec/blob/v1.0/v-spec.adoc
[2] https://github.com/openjdk/jdk/blob/master/test/jdk/jdk/incubator/vector/Byte256VectorTests.java

### Testing
- [x] test/jdk/jdk/incubator/vector (fastdebug) qemu 8.1.50 with UseRVV
- [ ] Run tier1-3 tests on SOPHON SG2042 (release)
- [ ] Run tier1-3 tests (release) on qemu 8.1.50 with UseRVV

-------------

Commit messages:
 - 8333006: RISC-V: C2: Support vector-scalar and vector-immediate arithmetic instructions

Changes: https://git.openjdk.org/jdk/pull/19415/files
  Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19415&range=00
  Issue: https://bugs.openjdk.org/browse/JDK-8333006
  Stats: 253 lines in 2 files changed: 252 ins; 0 del; 1 mod
  Patch: https://git.openjdk.org/jdk/pull/19415.diff
  Fetch: git fetch https://git.openjdk.org/jdk.git pull/19415/head:pull/19415

PR: https://git.openjdk.org/jdk/pull/19415